diff --git a/.github/workflows/backport_release.yaml b/.github/workflows/backport_release.yaml new file mode 100644 index 000000000..ede6bde33 --- /dev/null +++ b/.github/workflows/backport_release.yaml @@ -0,0 +1,519 @@ +name: Backport Release + +on: + workflow_dispatch: + inputs: + commit: + description: 'Full 40-char SHA of the tip commit of the backport source branch (the PR head commit that passed tests). The branch is resolved from this SHA and must be unique.' + required: true + type: string + +permissions: + contents: read + pull-requests: read + checks: read + +jobs: + backport-release: + name: Create backport release + runs-on: ubuntu-latest + environment: backport release + + steps: + - name: Generate GitHub App token + id: app-token + uses: actions/create-github-app-token@bcd2ba49218906704ab6c1aa796996da409d3eb1 + with: + app-id: ${{ secrets.FEN_RELEASE_APP_ID }} + private-key: ${{ secrets.FEN_RELEASE_PRIVATE_KEY }} + + - name: Checkout repository + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd + with: + token: ${{ steps.app-token.outputs.token }} + fetch-depth: 0 + fetch-tags: true + + - name: Configure git + run: | + git config user.name "fen-release[bot]" + git config user.email "fen-release[bot]@users.noreply.github.com" + + - name: Resolve source branch from commit SHA + id: resolve + env: + SOURCE_COMMIT: ${{ inputs.commit }} + DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + run: | + set -euo pipefail + + # Require a full 40-char lowercase-hex SHA. Short SHAs are ambiguous + # and we will be comparing this value against API responses (PR head + # SHA, ref tips) that always return the full form. + if [[ ! "${SOURCE_COMMIT}" =~ ^[0-9a-f]{40}$ ]]; then + echo "::error::Input commit '${SOURCE_COMMIT}' is not a full 40-char lowercase hex SHA." + exit 1 + fi + + # Fetch all remote branches so we can search for which one(s) point + # at this SHA. 
`actions/checkout` with fetch-depth: 0 fetches full + # history of the checked-out ref but does not necessarily populate + # every refs/remotes/origin/*, so do it explicitly. + git fetch --prune origin '+refs/heads/*:refs/remotes/origin/*' + + # Verify the commit actually exists in this repo's object DB. + if ! git cat-file -e "${SOURCE_COMMIT}^{commit}" 2>/dev/null; then + echo "::error::Commit ${SOURCE_COMMIT} was not found in the repository." + exit 1 + fi + + # Find every remote branch whose tip == SOURCE_COMMIT. Exactly one + # branch must point at it. If zero, the commit isn't anyone's tip + # (likely stale, force-pushed past, or never the PR head). If more + # than one, the (branch -> SHA) mapping is ambiguous and we refuse + # to guess — the operator must give us a unique branch to release. + mapfile -t matching_branches < <( + git for-each-ref \ + --format='%(refname:strip=3)' \ + --points-at="${SOURCE_COMMIT}" \ + refs/remotes/origin/ \ + | grep -vx 'HEAD' || true + ) + + if [[ "${#matching_branches[@]}" -eq 0 ]]; then + echo "::error::No branch on origin has ${SOURCE_COMMIT} as its tip." + echo "::error::Either the branch was updated after you copied this SHA, or this commit was never the head of a branch." + exit 1 + fi + + if [[ "${#matching_branches[@]}" -gt 1 ]]; then + echo "::error::More than one branch on origin has ${SOURCE_COMMIT} as its tip; cannot pick one:" + for b in "${matching_branches[@]}"; do + echo "::error:: - ${b}" + done + echo "::error::Refusing to proceed with an ambiguous source branch." + exit 1 + fi + + source_branch="${matching_branches[0]}" + + if [[ "${source_branch}" == "${DEFAULT_BRANCH}" ]]; then + echo "::error::Source branch must not be the default branch ('${DEFAULT_BRANCH}')." + exit 1 + fi + + echo "Resolved commit ${SOURCE_COMMIT} to branch '${source_branch}'." 
+ echo "source_branch=${source_branch}" >> "$GITHUB_OUTPUT" + + - name: Determine latest stable release + id: latest + env: + GH_TOKEN: ${{ steps.app-token.outputs.token }} + run: | + set -euo pipefail + + # List all tags matching vMAJOR.MINOR.PATCH and pick the highest by numeric + # comparison of each component. We do not rely on `sort -V`; components are + # zero-padded and sorted numerically so v0.20.1 ranks above v0.19.99. + latest_tag="$( + git tag --list 'v[0-9]*.[0-9]*.[0-9]*' \ + | grep -E '^v[0-9]+\.[0-9]+\.[0-9]+$' \ + | awk -F'[v.]' '{ printf "%010d %010d %010d %s\n", $2, $3, $4, $0 }' \ + | sort -k1,1n -k2,2n -k3,3n \ + | tail -n1 \ + | awk '{print $4}' + )" + + if [[ -z "${latest_tag}" ]]; then + echo "::error::No stable release tags (vMAJOR.MINOR.PATCH) were found." + exit 1 + fi + + # Parse components + ver="${latest_tag#v}" + major="${ver%%.*}" + rest="${ver#*.}" + minor="${rest%%.*}" + patch="${rest#*.}" + + new_patch=$((patch + 1)) + new_version="v${major}.${minor}.${new_patch}" + release_branch="release/v${major}.${minor}" + + latest_sha="$(git rev-list -n 1 "refs/tags/${latest_tag}")" + + echo "latest_tag=${latest_tag}" >> "$GITHUB_OUTPUT" + echo "latest_sha=${latest_sha}" >> "$GITHUB_OUTPUT" + echo "major=${major}" >> "$GITHUB_OUTPUT" + echo "minor=${minor}" >> "$GITHUB_OUTPUT" + echo "patch=${patch}" >> "$GITHUB_OUTPUT" + echo "new_version=${new_version}" >> "$GITHUB_OUTPUT" + echo "new_version_no_v=${major}.${minor}.${new_patch}" >> "$GITHUB_OUTPUT" + echo "release_branch=${release_branch}" >> "$GITHUB_OUTPUT" + + echo "Latest stable release: ${latest_tag} (${latest_sha})" + echo "New version will be: ${new_version}" + echo "Release branch: ${release_branch}" + + - name: Validate source branch is cut directly from the latest stable release + env: + SOURCE_BRANCH: ${{ steps.resolve.outputs.source_branch }} + SOURCE_COMMIT: ${{ inputs.commit }} + LATEST_TAG_SHA: ${{ steps.latest.outputs.latest_sha }} + LATEST_TAG: ${{ steps.latest.outputs.latest_tag }} + run: | + set -euo
pipefail + + # Use the user-provided SHA directly rather than re-resolving the branch + # tip — the resolve step already proved the branch tip equals SOURCE_COMMIT, + # and pinning to the SHA here makes the rest of the job TOCTOU-safe against + # someone pushing to the branch mid-run. + source_sha="${SOURCE_COMMIT}" + + # Walking first-parent from the source tip must reach LATEST_TAG_SHA. + # We capture rev-list into a variable and grep against a here-string + # rather than piping `rev-list | grep -q`: under `set -o pipefail`, + # `grep -q` would exit on first match and SIGPIPE the still-streaming + # `rev-list`, propagating exit 141 as a spurious "not found". + first_parent_chain="$(git rev-list --first-parent "${source_sha}")" + if ! grep -Fxq "${LATEST_TAG_SHA}" <<< "${first_parent_chain}"; then + echo "::error::Source branch '${SOURCE_BRANCH}' is not cut from '${LATEST_TAG}'." + echo "::error::Its first-parent history does not include ${LATEST_TAG_SHA}." + exit 1 + fi + + # Additionally, every commit added on top of the tag (the set we are + # about to publish) must itself be a descendant of the tag along + # first-parent — i.e. no sibling commits from master sneak in via a + # non-first-parent path. Enforce this by requiring that no commit is + # reachable from the source tip (and not from the tag) while lying + # off the first-parent chain between the source tip and the tag. + # We do this by comparing two sets: + # A = commits reachable from source but not from tag (full DAG) + # B = commits on the first-parent chain from source down to tag + # and requiring A == B. + all_added="$(git rev-list "${LATEST_TAG_SHA}..${source_sha}" | sort)" + first_parent_added="$( + git rev-list --first-parent "${LATEST_TAG_SHA}..${source_sha}" | sort + )" + + if [[ "${all_added}" != "${first_parent_added}" ]]; then + echo "::error::Source branch '${SOURCE_BRANCH}' contains commits not on its first-parent chain from '${LATEST_TAG}'."
+ echo "::error::This usually means the branch was cut from master (not from the tag) or contains a merge from master." + echo "Commits reachable but not on first-parent chain:" + comm -23 <(printf '%s\n' "${all_added}") <(printf '%s\n' "${first_parent_added}") \ + | while read -r sha; do + echo " $(git log -1 --format='%h %s' "${sha}")" + done + exit 1 + fi + + added_count="$(printf '%s\n' "${all_added}" | grep -c . || true)" + echo "Source branch is cut directly from ${LATEST_TAG} with ${added_count} commit(s) on top." + + - name: Validate PR exists, is open, named correctly, has latest commit, and checks pass + env: + GH_TOKEN: ${{ steps.app-token.outputs.token }} + SOURCE_BRANCH: ${{ steps.resolve.outputs.source_branch }} + SOURCE_COMMIT: ${{ inputs.commit }} + NEW_VERSION: ${{ steps.latest.outputs.new_version }} + REPO: ${{ github.repository }} + run: | + set -euo pipefail + + expected_title="ComfyUI backport release ${NEW_VERSION}" + + # Find open PRs from this branch into master. The --state open filter + # is load-bearing: a closed/merged PR with passing checks must not be + # accepted as authorization for a new release. + pr_json="$( + gh pr list \ + --repo "${REPO}" \ + --state open \ + --head "${SOURCE_BRANCH}" \ + --base master \ + --json number,title,headRefOid,state \ + --limit 10 + )" + + pr_count="$(echo "${pr_json}" | jq 'length')" + if [[ "${pr_count}" -eq 0 ]]; then + echo "::error::No open PR found from '${SOURCE_BRANCH}' into 'master'. The PR must exist and be open." + exit 1 + fi + + # Pick the PR matching the expected title + pr_number="$(echo "${pr_json}" | jq -r --arg t "${expected_title}" ' + map(select(.title == $t)) | .[0].number // empty + ')" + pr_head_sha="$(echo "${pr_json}" | jq -r --arg t "${expected_title}" ' + map(select(.title == $t)) | .[0].headRefOid // empty + ')" + + if [[ -z "${pr_number}" ]]; then + echo "::error::No open PR from '${SOURCE_BRANCH}' into 'master' is titled '${expected_title}'." 
+ echo "Found PRs:" + echo "${pr_json}" | jq -r '.[] | " #\(.number): \(.title)"' + exit 1 + fi + + # The PR's current head commit must equal the SHA the operator gave us. + # This is what closes the door on releasing stale code: if anyone has + # pushed to the branch since the operator validated tests passed, the + # PR head will have advanced past SOURCE_COMMIT and we abort. (The + # resolve step already proved the branch tip == SOURCE_COMMIT; this + # ties that same SHA to the PR that authorizes the release.) + if [[ "${pr_head_sha}" != "${SOURCE_COMMIT}" ]]; then + echo "::error::PR #${pr_number} head commit is ${pr_head_sha}, but the operator-provided commit is ${SOURCE_COMMIT}." + echo "::error::The PR has new commits since this release was authorized. Re-run with the new head SHA after verifying its checks." + exit 1 + fi + + echo "Found open PR #${pr_number} titled '${expected_title}' at head ${pr_head_sha} (matches operator-provided commit)." + + # Verify all check runs on the head commit have completed successfully. + # A check is considered passing if conclusion is success, neutral, or skipped. + checks_json="$( + gh api \ + --paginate \ + "repos/${REPO}/commits/${pr_head_sha}/check-runs" \ + --jq '.check_runs[] | {name: .name, status: .status, conclusion: .conclusion}' + )" + + if [[ -z "${checks_json}" ]]; then + echo "::error::No check runs found on PR head commit ${pr_head_sha}." + exit 1 + fi + + echo "Check runs on ${pr_head_sha}:" + echo "${checks_json}" | jq -s '.' 
+ + failing="$(echo "${checks_json}" | jq -s ' + map(select( + .status != "completed" + or (.conclusion as $c + | ["success","neutral","skipped"] + | index($c) | not) + )) + ')" + + failing_count="$(echo "${failing}" | jq 'length')" + if [[ "${failing_count}" -gt 0 ]]; then + echo "::error::One or more checks have not passed on PR head commit ${pr_head_sha}:" + echo "${failing}" | jq -r '.[] | " - \(.name): status=\(.status) conclusion=\(.conclusion)"' + exit 1 + fi + + echo "All checks have passed on ${pr_head_sha}." + + - name: Prepare release branch + id: prepare + env: + GH_TOKEN: ${{ steps.app-token.outputs.token }} + REPO: ${{ github.repository }} + RELEASE_BRANCH: ${{ steps.latest.outputs.release_branch }} + LATEST_TAG: ${{ steps.latest.outputs.latest_tag }} + LATEST_TAG_SHA: ${{ steps.latest.outputs.latest_sha }} + PATCH: ${{ steps.latest.outputs.patch }} + run: | + set -euo pipefail + + # Try to fetch the release branch. If patch == 0, it shouldn't exist yet + # and we'll create it from the latest stable tag. If patch > 0, it must + # already exist and its tip must equal the latest stable tag commit (i.e. + # the previous patch release). + if git ls-remote --exit-code --heads origin "${RELEASE_BRANCH}" >/dev/null 2>&1; then + echo "Release branch '${RELEASE_BRANCH}' already exists on origin." + git fetch origin "refs/heads/${RELEASE_BRANCH}:refs/remotes/origin/${RELEASE_BRANCH}" + git checkout -B "${RELEASE_BRANCH}" "refs/remotes/origin/${RELEASE_BRANCH}" + + current_tip="$(git rev-parse HEAD)" + if [[ "${current_tip}" != "${LATEST_TAG_SHA}" ]]; then + echo "::error::Release branch '${RELEASE_BRANCH}' tip (${current_tip}) is not at the latest stable release '${LATEST_TAG}' (${LATEST_TAG_SHA})." + echo "::error::Refusing to release on top of a divergent branch." 
+ exit 1 + fi + echo "branch_existed=true" >> "$GITHUB_OUTPUT" + else + if [[ "${PATCH}" != "0" ]]; then + echo "::error::Release branch '${RELEASE_BRANCH}' does not exist on origin, but the latest stable release '${LATEST_TAG}' has patch=${PATCH} (>0). This is inconsistent." + exit 1 + fi + echo "Release branch '${RELEASE_BRANCH}' does not exist. Creating from ${LATEST_TAG}." + git checkout -B "${RELEASE_BRANCH}" "refs/tags/${LATEST_TAG}" + echo "branch_existed=false" >> "$GITHUB_OUTPUT" + fi + + - name: Fast-forward merge source branch into release branch + env: + SOURCE_BRANCH: ${{ steps.resolve.outputs.source_branch }} + SOURCE_COMMIT: ${{ inputs.commit }} + RELEASE_BRANCH: ${{ steps.latest.outputs.release_branch }} + run: | + set -euo pipefail + + # --ff-only guarantees no merge commit is created. If a fast-forward is + # not possible (i.e. the release branch has commits the source branch + # doesn't), the merge will fail and we abort. Because we already validated + # that the source branch is rooted on the latest stable tag, and the + # release branch tip equals that same tag, this fast-forward should + # always succeed for a well-formed backport branch. + # + # We merge the operator-provided SHA, not the branch ref, so a push to + # the branch in the window between resolve and now cannot smuggle new + # commits into the release. + if ! git merge --ff-only "${SOURCE_COMMIT}"; then + echo "::error::Cannot fast-forward '${RELEASE_BRANCH}' to ${SOURCE_COMMIT} (tip of '${SOURCE_BRANCH}'). A merge commit would be required. Aborting." + exit 1 + fi + + echo "Fast-forwarded '${RELEASE_BRANCH}' to ${SOURCE_COMMIT} (tip of '${SOURCE_BRANCH}')." + + - name: Bump version files + env: + NEW_VERSION_NO_V: ${{ steps.latest.outputs.new_version_no_v }} + run: | + set -euo pipefail + + if [[ ! -f comfyui_version.py ]]; then + echo "::error::comfyui_version.py not found in repo root." + exit 1 + fi + if [[ ! 
-f pyproject.toml ]]; then + echo "::error::pyproject.toml not found in repo root." + exit 1 + fi + + # Replace the version string in comfyui_version.py. + # Expected format: __version__ = "X.Y.Z" + python3 - "$NEW_VERSION_NO_V" <<'PY' + import re, sys, pathlib + new = sys.argv[1] + + p = pathlib.Path("comfyui_version.py") + src = p.read_text() + new_src, n = re.subn( + r'(__version__\s*=\s*[\'"])[^\'"]+([\'"])', + lambda m: f'{m.group(1)}{new}{m.group(2)}', + src, + count=1, + ) + if n != 1: + sys.exit("Could not find __version__ assignment in comfyui_version.py") + p.write_text(new_src) + + p = pathlib.Path("pyproject.toml") + src = p.read_text() + # Replace the first `version = "..."` inside [project] or [tool.poetry]. + new_src, n = re.subn( + r'(?m)^(version\s*=\s*")[^"]+(")', + lambda m: f'{m.group(1)}{new}{m.group(2)}', + src, + count=1, + ) + if n != 1: + sys.exit("Could not find version assignment in pyproject.toml") + p.write_text(new_src) + PY + + echo "Updated version to ${NEW_VERSION_NO_V} in comfyui_version.py and pyproject.toml." + git --no-pager diff -- comfyui_version.py pyproject.toml + + - name: Commit version bump and tag release + env: + NEW_VERSION: ${{ steps.latest.outputs.new_version }} + run: | + set -euo pipefail + + git add comfyui_version.py pyproject.toml + git commit -m "ComfyUI ${NEW_VERSION}" + + if git rev-parse -q --verify "refs/tags/${NEW_VERSION}" >/dev/null; then + echo "::error::Tag ${NEW_VERSION} already exists locally." + exit 1 + fi + git tag "${NEW_VERSION}" + + - name: Verify tag does not already exist on origin + env: + NEW_VERSION: ${{ steps.latest.outputs.new_version }} + run: | + set -euo pipefail + if git ls-remote --exit-code --tags origin "refs/tags/${NEW_VERSION}" >/dev/null 2>&1; then + echo "::error::Tag ${NEW_VERSION} already exists on origin. Aborting." 
+ exit 1 + fi + + - name: Push release branch and tag + env: + RELEASE_BRANCH: ${{ steps.latest.outputs.release_branch }} + NEW_VERSION: ${{ steps.latest.outputs.new_version }} + run: | + set -euo pipefail + + # Push the branch first, then the tag. Atomic-ish: if the branch push + # fails we never publish the tag. + git push origin "refs/heads/${RELEASE_BRANCH}:refs/heads/${RELEASE_BRANCH}" + git push origin "refs/tags/${NEW_VERSION}" + + echo "Released ${NEW_VERSION} on ${RELEASE_BRANCH}." + + - name: Delete remote source branch + env: + GH_TOKEN: ${{ steps.app-token.outputs.token }} + REPO: ${{ github.repository }} + SOURCE_BRANCH: ${{ steps.resolve.outputs.source_branch }} + SOURCE_COMMIT: ${{ inputs.commit }} + RELEASE_BRANCH: ${{ steps.latest.outputs.release_branch }} + DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + run: | + set -euo pipefail + + # Belt-and-braces: the resolve step already refuses the default branch, + # but never delete the default or the release branch under any + # circumstances. + if [[ "${SOURCE_BRANCH}" == "${DEFAULT_BRANCH}" || "${SOURCE_BRANCH}" == "${RELEASE_BRANCH}" ]]; then + echo "::error::Refusing to delete '${SOURCE_BRANCH}' (matches default or release branch)." + exit 1 + fi + + # Delete the source branch on origin, but only if its tip is still the + # SHA we released from. If someone pushed new commits to it after we + # resolved it, leave it alone — those commits would be silently lost. + current_tip="$(git ls-remote origin "refs/heads/${SOURCE_BRANCH}" | awk '{print $1}')" + if [[ -z "${current_tip}" ]]; then + echo "Source branch '${SOURCE_BRANCH}' no longer exists on origin; nothing to delete." + exit 0 + fi + if [[ "${current_tip}" != "${SOURCE_COMMIT}" ]]; then + echo "::warning::Source branch '${SOURCE_BRANCH}' tip (${current_tip}) no longer matches released commit (${SOURCE_COMMIT}). Leaving it in place." 
+ exit 0 + fi + + git push origin --delete "refs/heads/${SOURCE_BRANCH}" + echo "Deleted remote branch '${SOURCE_BRANCH}'." + + - name: Summary + if: always() + env: + NEW_VERSION: ${{ steps.latest.outputs.new_version }} + RELEASE_BRANCH: ${{ steps.latest.outputs.release_branch }} + LATEST_TAG: ${{ steps.latest.outputs.latest_tag }} + SOURCE_BRANCH: ${{ steps.resolve.outputs.source_branch }} + SOURCE_COMMIT: ${{ inputs.commit }} + run: | + # SOURCE_BRANCH is empty if the resolve step never produced an output + # (e.g. the workflow failed in or before that step). Show a placeholder + # in that case so the summary table still renders cleanly. + source_branch_display="${SOURCE_BRANCH:-(unresolved)}" + { + echo "## Backport release" + echo "" + echo "| Field | Value |" + echo "|---|---|" + echo "| Source commit | \`${SOURCE_COMMIT}\` |" + echo "| Source branch | \`${source_branch_display}\` |" + echo "| Previous stable | \`${LATEST_TAG}\` |" + echo "| New version | \`${NEW_VERSION}\` |" + echo "| Release branch | \`${RELEASE_BRANCH}\` |" + } >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/detect-unreviewed-merge.yml b/.github/workflows/detect-unreviewed-merge.yml new file mode 100644 index 000000000..4fabecb94 --- /dev/null +++ b/.github/workflows/detect-unreviewed-merge.yml @@ -0,0 +1,24 @@ +name: Detect Unreviewed Merge + +# SOC 2 compliance — reusable workflow lives in Comfy-Org/github-workflows, +# tracking issues are filed in Comfy-Org/unreviewed-merges. 
+ +on: + push: + branches: [master] + +concurrency: + group: detect-unreviewed-merge-${{ github.sha }} + cancel-in-progress: false + +permissions: + contents: read + pull-requests: read + +jobs: + detect: + uses: Comfy-Org/github-workflows/.github/workflows/detect-unreviewed-merge.yml@4d9cb6b87f953bb7cd69954280e1465fb9bd2040 # v1 + with: + approval-mode: latest-per-reviewer + secrets: + UNREVIEWED_MERGES_TOKEN: ${{ secrets.UNREVIEWED_MERGES_TOKEN }} diff --git a/CODEOWNERS b/CODEOWNERS index 946dbf946..043c0ec75 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,2 +1,5 @@ -# Admins * @comfyanonymous @kosinkadink @guill @alexisrolland @rattus128 @kijai + +/CODEOWNERS @comfyanonymous +/.ci/ @comfyanonymous +/.github/ @comfyanonymous diff --git a/README.md b/README.md index 0eecd8a4b..dc2389266 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ [website-url]: https://www.comfy.org/ [discord-shield]: https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fdiscord.com%2Fapi%2Finvites%2Fcomfyorg%3Fwith_counts%3Dtrue&query=%24.approximate_member_count&logo=discord&logoColor=white&label=Discord&color=green&suffix=%20total -[discord-url]: https://www.comfy.org/discord +[discord-url]: https://discord.com/invite/comfyorg [twitter-shield]: https://img.shields.io/twitter/follow/ComfyUI [twitter-url]: https://x.com/ComfyUI @@ -433,7 +433,7 @@ See also: [https://www.comfy.org/](https://www.comfy.org/) ## Frontend Development -As of August 15, 2024, we have transitioned to a new frontend, which is now hosted in a separate repository: [ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend). This repository now hosts the compiled JS (from TS/Vue) under the `web/` directory. +As of August 15, 2024, we have transitioned to a new frontend, which is now hosted in a separate repository: [ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend). 
The compiled JS files (from TS/Vue) are published to [PyPI](https://pypi.org/project/comfyui-frontend-package) and installed as a dependency in ComfyUI. ### Reporting Issues and Requesting Features diff --git a/app/assets/api/routes.py b/app/assets/api/routes.py index 68126b6a5..6555974e9 100644 --- a/app/assets/api/routes.py +++ b/app/assets/api/routes.py @@ -160,10 +160,12 @@ def _build_asset_response(result: schemas.AssetDetailResult | schemas.UploadResu preview_url = None else: preview_url = _build_preview_url_from_view(result.tags, result.ref.user_metadata) + asset_content_hash = result.asset.hash if result.asset else None return schemas_out.Asset( id=result.ref.id, name=result.ref.name, - asset_hash=result.asset.hash if result.asset else None, + hash=asset_content_hash, + asset_hash=asset_content_hash, size=int(result.asset.size_bytes) if result.asset else None, mime_type=result.asset.mime_type if result.asset else None, tags=result.tags, diff --git a/app/assets/api/schemas_out.py b/app/assets/api/schemas_out.py index d99b1098d..0e748b907 100644 --- a/app/assets/api/schemas_out.py +++ b/app/assets/api/schemas_out.py @@ -10,6 +10,7 @@ class Asset(BaseModel): id: str name: str + hash: str | None = None asset_hash: str | None = None size: int | None = None mime_type: str | None = None diff --git a/app/assets/services/metadata_extract.py b/app/assets/services/metadata_extract.py index a004929bc..bdfe60218 100644 --- a/app/assets/services/metadata_extract.py +++ b/app/assets/services/metadata_extract.py @@ -4,7 +4,6 @@ Tier 1: Filesystem metadata (zero parsing) Tier 2: Safetensors header metadata (fast JSON read only) """ -from __future__ import annotations import json import logging diff --git a/app/custom_node_manager.py b/app/custom_node_manager.py index 281febca9..738af2abd 100644 --- a/app/custom_node_manager.py +++ b/app/custom_node_manager.py @@ -1,5 +1,3 @@ -from __future__ import annotations - import os import folder_paths import glob diff --git
a/app/frontend_management.py b/app/frontend_management.py index d0596b276..8e84e8dd9 100644 --- a/app/frontend_management.py +++ b/app/frontend_management.py @@ -1,4 +1,3 @@ -from __future__ import annotations import argparse import logging import os @@ -62,6 +61,8 @@ def get_comfy_package_versions(): def check_comfy_packages_versions(): """Warn for every comfy* package whose installed version is below requirements.txt.""" from packaging.version import InvalidVersion, parse as parse_pep440 + outdated_packages = [] + for pkg in get_comfy_package_versions(): installed_str = pkg["installed"] required_str = pkg["required"] @@ -73,19 +74,26 @@ def check_comfy_packages_versions(): logging.error(f"Failed to check {pkg['name']} version: {e}") continue if outdated: - app.logger.log_startup_warning( - f""" + outdated_packages.append((pkg["name"], installed_str, required_str)) + else: + logging.info("{} version: {}".format(pkg["name"], installed_str)) + + if outdated_packages: + package_warnings = "\n".join( + f"Installed {name} version {installed} is lower than the recommended version {required}." + for name, installed, required in outdated_packages + ) + app.logger.log_startup_warning( + f""" ________________________________________________________________________ WARNING WARNING WARNING WARNING WARNING -Installed {pkg["name"]} version {installed_str} is lower than the recommended version {required_str}. 
+{package_warnings} {get_missing_requirements_message()} ________________________________________________________________________ """.strip() - ) - else: - logging.info("{} version: {}".format(pkg["name"], installed_str)) + ) REQUEST_TIMEOUT = 10 # seconds diff --git a/app/logger.py b/app/logger.py index 3d26d98fe..bde815822 100644 --- a/app/logger.py +++ b/app/logger.py @@ -5,6 +5,40 @@ import logging import sys import threading +ANSI_NAMED_COLORS = { + 'black': '\033[30m', + 'red': '\033[31m', + 'green': '\033[32m', + 'yellow': '\033[33m', + 'blue': '\033[34m', + 'magenta': '\033[35m', + 'cyan': '\033[36m', + 'white': '\033[37m', +} + +ANSI_LEVEL_COLORS = { + 'DEBUG': ANSI_NAMED_COLORS['cyan'], + 'INFO': ANSI_NAMED_COLORS['green'], + 'WARNING': ANSI_NAMED_COLORS['yellow'], + 'ERROR': ANSI_NAMED_COLORS['red'], + 'CRITICAL': ANSI_NAMED_COLORS['magenta'], +} + +ANSI_RESET = '\033[0m' +ANSI_BOLD = '\033[1m' + + +class ColoredFormatter(logging.Formatter): + def format(self, record): + color = ANSI_LEVEL_COLORS.get(record.levelname, '') + bold = ANSI_BOLD if record.levelno >= logging.WARNING else '' + level_tag = f"{bold}{color}[{record.levelname}]{ANSI_RESET} " + message = super().format(record) + line_color = ANSI_NAMED_COLORS.get(getattr(record, 'color', ''), '') + if line_color: + return f"{level_tag}{line_color}{message}{ANSI_RESET}" + return level_tag + message + logs = None stdout_interceptor = None stderr_interceptor = None @@ -68,8 +102,10 @@ def setup_logger(log_level: str = 'INFO', capacity: int = 300, use_stdout: bool logger = logging.getLogger() logger.setLevel(log_level) + formatter = ColoredFormatter("%(message)s") + stream_handler = logging.StreamHandler() - stream_handler.setFormatter(logging.Formatter("%(message)s")) + stream_handler.setFormatter(formatter) if use_stdout: # Only errors and critical to stderr @@ -77,7 +113,7 @@ def setup_logger(log_level: str = 'INFO', capacity: int = 300, use_stdout: bool # Lesser to stdout stdout_handler = 
logging.StreamHandler(sys.stdout) - stdout_handler.setFormatter(logging.Formatter("%(message)s")) + stdout_handler.setFormatter(formatter) stdout_handler.addFilter(lambda record: record.levelno < logging.ERROR) logger.addHandler(stdout_handler) diff --git a/app/model_manager.py b/app/model_manager.py index f124d1117..8f6e34b33 100644 --- a/app/model_manager.py +++ b/app/model_manager.py @@ -1,5 +1,3 @@ -from __future__ import annotations - import os import base64 import json diff --git a/app/user_manager.py b/app/user_manager.py index 0517b3344..7b11e381c 100644 --- a/app/user_manager.py +++ b/app/user_manager.py @@ -1,4 +1,3 @@ -from __future__ import annotations import json import os import re diff --git a/blueprints/Audio Generation (Stable Audio 3 Medium Base).json b/blueprints/Audio Generation (Stable Audio 3 Medium Base).json new file mode 100644 index 000000000..e561fe634 --- /dev/null +++ b/blueprints/Audio Generation (Stable Audio 3 Medium Base).json @@ -0,0 +1,2091 @@ +{ + "revision": 0, + "last_node_id": 52, + "last_link_id": 0, + "nodes": [ + { + "id": 52, + "type": "8b66c757-fe2f-4184-91f3-479a19deb565", + "pos": [ + 370, + 1120 + ], + "size": [ + 420, + 450 + ], + "flags": { + "collapsed": false + }, + "order": 0, + "mode": 0, + "inputs": [ + { + "label": "user_input", + "name": "user_input", + "type": "STRING", + "widget": { + "name": "user_input" + }, + "link": null + }, + { + "label": "duration", + "name": "duration", + "type": "FLOAT", + "widget": { + "name": "duration" + }, + "link": null + }, + { + "label": "seed", + "name": "seed", + "type": "INT", + "widget": { + "name": "seed" + }, + "link": null + }, + { + "label": "use_reprompt", + "name": "use_reprompt", + "type": "BOOLEAN", + "widget": { + "name": "use_reprompt" + }, + "link": null + }, + { + "label": "reprompt_category", + "name": "category", + "type": "COMBO", + "widget": { + "name": "category" + }, + "link": null + }, + { + "label": "ckpt_name", + "name": "ckpt_name", + "type": 
"COMBO", + "widget": { + "name": "ckpt_name" + }, + "link": null + }, + { + "label": "sa_clip", + "name": "sa_clip", + "type": "COMBO", + "widget": { + "name": "sa_clip" + }, + "link": null + }, + { + "label": "qwen_clip", + "name": "qwen_clip", + "type": "COMBO", + "widget": { + "name": "qwen_clip" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "AUDIO", + "name": "AUDIO", + "type": "AUDIO", + "links": [] + } + ], + "properties": { + "proxyWidgets": [ + [ + "31", + "value" + ], + [ + "36", + "value" + ], + [ + "3", + "seed" + ], + [ + "35", + "value" + ], + [ + "43", + "choice" + ], + [ + "25", + "ckpt_name" + ], + [ + "26", + "clip_name" + ], + [ + "29", + "clip_name" + ] + ] + }, + "widgets_values": [], + "title": "Audio Generation (Stable Audio 3 Medium Base)" + } + ], + "links": [], + "version": 0.4, + "definitions": { + "subgraphs": [ + { + "id": "8b66c757-fe2f-4184-91f3-479a19deb565", + "version": 1, + "state": { + "lastGroupId": 8, + "lastNodeId": 56, + "lastLinkId": 84, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Audio Generation (Stable Audio 3 Medium Base)", + "inputNode": { + "id": -10, + "bounding": [ + -810, + 400, + 155.953125, + 208 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + 1750, + 1041, + 128, + 68 + ] + }, + "inputs": [ + { + "id": "78ae2515-114b-494a-becc-43c7b6c2dc2f", + "name": "user_input", + "type": "STRING", + "linkIds": [ + 68 + ], + "label": "user_input", + "pos": [ + -678.046875, + 424 + ] + }, + { + "id": "5ca95030-aff4-4544-b545-f0d814e0e49a", + "name": "duration", + "type": "FLOAT", + "linkIds": [ + 82 + ], + "label": "duration", + "pos": [ + -678.046875, + 444 + ] + }, + { + "id": "718eb10f-da1a-4cea-a9c7-3040f98fe960", + "name": "seed", + "type": "INT", + "linkIds": [ + 76 + ], + "label": "seed", + "pos": [ + -678.046875, + 464 + ] + }, + { + "id": "dc020099-39e6-4009-9937-408409d71736", + "name": "use_reprompt", + "type": "BOOLEAN", + "linkIds": [ + 83 + ], + "label": 
"use_reprompt", + "pos": [ + -678.046875, + 484 + ] + }, + { + "id": "edae394c-6324-44d6-8ac5-d8caa5ae2169", + "name": "category", + "type": "COMBO", + "linkIds": [ + 78 + ], + "label": "reprompt_category", + "pos": [ + -678.046875, + 504 + ] + }, + { + "id": "be19b747-6a47-4028-9c30-d52f54a712ea", + "name": "ckpt_name", + "type": "COMBO", + "linkIds": [ + 79 + ], + "label": "ckpt_name", + "pos": [ + -678.046875, + 524 + ] + }, + { + "id": "bc9241a2-bc20-4c5d-8cb1-f2958f598642", + "name": "sa_clip", + "type": "COMBO", + "linkIds": [ + 80 + ], + "label": "sa_clip", + "pos": [ + -678.046875, + 544 + ] + }, + { + "id": "a33a2468-6d6d-4cb6-937c-3510bf16ebac", + "name": "qwen_clip", + "type": "COMBO", + "linkIds": [ + 81 + ], + "label": "qwen_clip", + "pos": [ + -678.046875, + 564 + ] + } + ], + "outputs": [ + { + "id": "bbe988dd-5c03-44fd-a965-c712f9204988", + "name": "AUDIO", + "type": "AUDIO", + "linkIds": [ + 27 + ], + "localized_name": "AUDIO", + "pos": [ + 1774, + 1065 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 7, + "type": "CLIPTextEncode", + "pos": [ + 620, + 420 + ], + "size": [ + 440, + 140 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "clip", + "name": "clip", + "type": "CLIP", + "link": 35 + }, + { + "localized_name": "text", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "CONDITIONING", + "name": "CONDITIONING", + "type": "CONDITIONING", + "slot_index": 0, + "links": [ + 6 + ] + } + ], + "properties": { + "Node name for S&R": "CLIPTextEncode", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "" + ], + "color": "#223", + "bgcolor": "#335" + }, + { + "id": 12, + "type": "VAEDecodeAudio", + "pos": [ + 1450, + 110 + ], + "size": [ + 
230, + 100 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "localized_name": "samples", + "name": "samples", + "type": "LATENT", + "link": 13 + }, + { + "localized_name": "vae", + "name": "vae", + "type": "VAE", + "link": 39 + } + ], + "outputs": [ + { + "localized_name": "AUDIO", + "name": "AUDIO", + "type": "AUDIO", + "slot_index": 0, + "links": [ + 27 + ] + } + ], + "properties": { + "Node name for S&R": "VAEDecodeAudio", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 11, + "type": "EmptyLatentAudio", + "pos": [ + 630, + 610 + ], + "size": [ + 430, + 140 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "seconds", + "name": "seconds", + "type": "FLOAT", + "widget": { + "name": "seconds" + }, + "link": 50 + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "widget": { + "name": "batch_size" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "LATENT", + "name": "LATENT", + "type": "LATENT", + "links": [ + 12 + ] + } + ], + "properties": { + "Node name for S&R": "EmptyLatentAudio", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 60, + 1 + ] + }, + { + "id": 3, + "type": "KSampler", + "pos": [ + 1100, + 100 + ], + "size": [ + 320, + 350 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 30 + }, + { + "localized_name": "positive", + "name": "positive", + "type": "CONDITIONING", + "link": 4 + }, + { + "localized_name": "negative", + "name": "negative", + "type": "CONDITIONING", + "link": 6 + }, + { + 
"localized_name": "latent_image", + "name": "latent_image", + "type": "LATENT", + "link": 12 + }, + { + "localized_name": "seed", + "name": "seed", + "type": "INT", + "widget": { + "name": "seed" + }, + "link": 76 + }, + { + "localized_name": "steps", + "name": "steps", + "type": "INT", + "widget": { + "name": "steps" + }, + "link": null + }, + { + "localized_name": "cfg", + "name": "cfg", + "type": "FLOAT", + "widget": { + "name": "cfg" + }, + "link": null + }, + { + "localized_name": "sampler_name", + "name": "sampler_name", + "type": "COMBO", + "widget": { + "name": "sampler_name" + }, + "link": null + }, + { + "localized_name": "scheduler", + "name": "scheduler", + "type": "COMBO", + "widget": { + "name": "scheduler" + }, + "link": null + }, + { + "localized_name": "denoise", + "name": "denoise", + "type": "FLOAT", + "widget": { + "name": "denoise" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "LATENT", + "name": "LATENT", + "type": "LATENT", + "slot_index": 0, + "links": [ + 13 + ] + } + ], + "properties": { + "Node name for S&R": "KSampler", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 0, + "randomize", + 50, + 7, + "lcm", + "simple", + 1 + ] + }, + { + "id": 29, + "type": "CLIPLoader", + "pos": [ + 690, + 1580 + ], + "size": [ + 430, + 170 + ], + "flags": {}, + "order": 8, + "mode": 0, + "showAdvanced": false, + "inputs": [ + { + "localized_name": "clip_name", + "name": "clip_name", + "type": "COMBO", + "widget": { + "name": "clip_name" + }, + "link": 81 + }, + { + "localized_name": "type", + "name": "type", + "type": "COMBO", + "widget": { + "name": "type" + }, + "link": null + }, + { + "localized_name": "device", + "name": "device", + "shape": 7, + "type": "COMBO", + "widget": { + "name": "device" + }, + "link": null + } + ], + "outputs": [ + 
{ + "localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "links": [ + 40 + ] + } + ], + "properties": { + "Node name for S&R": "CLIPLoader", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "models": [ + { + "name": "qwen3.5_2b_bf16.safetensors", + "url": "https://huggingface.co/Comfy-Org/Qwen3.5/resolve/main/text_encoders/qwen3.5_2b_bf16.safetensors", + "directory": "text_encoders" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "qwen3.5_2b_bf16.safetensors", + "stable_diffusion", + "default" + ] + }, + { + "id": 6, + "type": "CLIPTextEncode", + "pos": [ + 610, + 130 + ], + "size": [ + 450, + 240 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "clip", + "name": "clip", + "type": "CLIP", + "link": 34 + }, + { + "localized_name": "text", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + }, + "link": 49 + } + ], + "outputs": [ + { + "localized_name": "CONDITIONING", + "name": "CONDITIONING", + "type": "CONDITIONING", + "slot_index": 0, + "links": [ + 4 + ] + } + ], + "properties": { + "Node name for S&R": "CLIPTextEncode", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "" + ], + "color": "#232", + "bgcolor": "#353" + }, + { + "id": 34, + "type": "ComfySwitchNode", + "pos": [ + 210, + 610 + ], + "size": [ + 270, + 130 + ], + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "localized_name": "on_false", + "name": "on_false", + "type": "*", + "link": 47 + }, + { + "localized_name": "on_true", + "name": "on_true", + "type": "*", + "link": 46 + }, + { + "localized_name": "switch", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": 
"switch" + }, + "link": 48 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "*", + "links": [ + 49 + ] + } + ], + "properties": { + "Node name for S&R": "ComfySwitchNode" + }, + "widgets_values": [ + false + ] + }, + { + "id": 41, + "type": "ComfyMathExpression", + "pos": [ + 1370, + 1360 + ], + "size": [ + 230, + 80 + ], + "flags": { + "collapsed": true + }, + "order": 16, + "mode": 0, + "inputs": [ + { + "label": "a", + "localized_name": "values.a", + "name": "values.a", + "type": "FLOAT,INT,BOOLEAN", + "link": 56 + }, + { + "label": "b", + "localized_name": "values.b", + "name": "values.b", + "shape": 7, + "type": "FLOAT,INT,BOOLEAN", + "link": null + }, + { + "localized_name": "expression", + "name": "expression", + "type": "STRING", + "widget": { + "name": "expression" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "links": null + }, + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 57 + ] + }, + { + "localized_name": "BOOL", + "name": "BOOL", + "type": "BOOLEAN", + "links": null + } + ], + "properties": { + "Node name for S&R": "ComfyMathExpression" + }, + "widgets_values": [ + "a" + ] + }, + { + "id": 42, + "type": "PreviewAny", + "pos": [ + 1370, + 1310 + ], + "size": [ + 230, + 40 + ], + "flags": { + "collapsed": true + }, + "order": 17, + "mode": 0, + "inputs": [ + { + "localized_name": "source", + "name": "source", + "type": "*", + "link": 57 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 58 + ] + } + ], + "properties": { + "Node name for S&R": "PreviewAny" + }, + "widgets_values": [ + null, + null, + null + ] + }, + { + "id": 39, + "type": "StringReplace", + "pos": [ + 1040, + 900 + ], + "size": [ + 270, + 280 + ], + "flags": {}, + "order": 14, + "mode": 0, + "inputs": [ + { + "localized_name": "string", + "name": "string", + "type": "STRING", + 
"widget": { + "name": "string" + }, + "link": 52 + }, + { + "localized_name": "find", + "name": "find", + "type": "STRING", + "widget": { + "name": "find" + }, + "link": null + }, + { + "localized_name": "replace", + "name": "replace", + "type": "STRING", + "widget": { + "name": "replace" + }, + "link": 53 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 59 + ] + } + ], + "title": "Text Replace (USER INPUT)", + "properties": { + "Node name for S&R": "StringReplace" + }, + "widgets_values": [ + "", + "USER_INPUT", + "" + ] + }, + { + "id": 28, + "type": "TextGenerate", + "pos": [ + 1200, + 1580 + ], + "size": [ + 430, + 420 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "localized_name": "clip", + "name": "clip", + "type": "CLIP", + "link": 40 + }, + { + "localized_name": "image", + "name": "image", + "shape": 7, + "type": "IMAGE", + "link": null + }, + { + "localized_name": "video", + "name": "video", + "shape": 7, + "type": "IMAGE", + "link": null + }, + { + "localized_name": "audio", + "name": "audio", + "shape": 7, + "type": "AUDIO", + "link": null + }, + { + "localized_name": "prompt", + "name": "prompt", + "type": "STRING", + "widget": { + "name": "prompt" + }, + "link": 60 + }, + { + "localized_name": "max_length", + "name": "max_length", + "type": "INT", + "widget": { + "name": "max_length" + }, + "link": null + }, + { + "localized_name": "sampling_mode", + "name": "sampling_mode", + "type": "COMFY_DYNAMICCOMBO_V3", + "widget": { + "name": "sampling_mode" + }, + "link": null + }, + { + "localized_name": "temperature", + "name": "sampling_mode.temperature", + "type": "FLOAT", + "widget": { + "name": "sampling_mode.temperature" + }, + "link": null + }, + { + "localized_name": "top_k", + "name": "sampling_mode.top_k", + "type": "INT", + "widget": { + "name": "sampling_mode.top_k" + }, + "link": null + }, + { + "localized_name": "top_p", + "name": "sampling_mode.top_p", + "type": 
"FLOAT", + "widget": { + "name": "sampling_mode.top_p" + }, + "link": null + }, + { + "localized_name": "min_p", + "name": "sampling_mode.min_p", + "type": "FLOAT", + "widget": { + "name": "sampling_mode.min_p" + }, + "link": null + }, + { + "localized_name": "repetition_penalty", + "name": "sampling_mode.repetition_penalty", + "type": "FLOAT", + "widget": { + "name": "sampling_mode.repetition_penalty" + }, + "link": null + }, + { + "localized_name": "seed", + "name": "sampling_mode.seed", + "type": "INT", + "widget": { + "name": "sampling_mode.seed" + }, + "link": null + }, + { + "localized_name": "presence_penalty", + "name": "sampling_mode.presence_penalty", + "shape": 7, + "type": "FLOAT", + "widget": { + "name": "sampling_mode.presence_penalty" + }, + "link": null + }, + { + "localized_name": "thinking", + "name": "thinking", + "shape": 7, + "type": "BOOLEAN", + "widget": { + "name": "thinking" + }, + "link": null + }, + { + "localized_name": "use_default_template", + "name": "use_default_template", + "shape": 7, + "type": "BOOLEAN", + "widget": { + "name": "use_default_template" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "generated_text", + "name": "generated_text", + "type": "STRING", + "links": [ + 46, + 84 + ] + } + ], + "properties": { + "Node name for S&R": "TextGenerate", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "", + 256, + "on", + 0.7, + 64, + 0.95, + 0.05, + 1.05, + 0, + 0, + false, + true + ] + }, + { + "id": 31, + "type": "PrimitiveStringMultiline", + "pos": [ + -390, + 160 + ], + "size": [ + 400, + 200 + ], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "STRING", + "widget": { + "name": "value" + }, + "link": 68 + } + ], + "outputs": [ + { + 
"localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 47, + 53 + ] + } + ], + "title": "User: short description (USER_INPUT in template)", + "properties": { + "Node name for S&R": "PrimitiveStringMultiline" + }, + "widgets_values": [ + "" + ] + }, + { + "id": 43, + "type": "CustomCombo", + "pos": [ + 140, + 910 + ], + "size": [ + 550, + 320 + ], + "flags": {}, + "order": 18, + "mode": 0, + "inputs": [ + { + "localized_name": "choice", + "name": "choice", + "type": "COMBO", + "widget": { + "name": "choice" + }, + "link": 78 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 65 + ] + }, + { + "localized_name": "INDEX", + "name": "INDEX", + "type": "INT", + "links": null + } + ], + "title": "Custom Combo (Category index)", + "properties": { + "Node name for S&R": "CustomCombo" + }, + "widgets_values": [ + "Music", + 0, + "Music", + "Instrument", + "SFX", + "One-shot", + "" + ] + }, + { + "id": 49, + "type": "JsonExtractString", + "pos": [ + 720, + 1200 + ], + "size": [ + 300, + 180 + ], + "flags": {}, + "order": 19, + "mode": 0, + "inputs": [ + { + "localized_name": "json_string", + "name": "json_string", + "type": "STRING", + "widget": { + "name": "json_string" + }, + "link": null + }, + { + "localized_name": "key", + "name": "key", + "type": "STRING", + "widget": { + "name": "key" + }, + "link": 65 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 66 + ] + } + ], + "properties": { + "Node name for S&R": "JsonExtractString" + }, + "widgets_values": [ + "{\n \"Music\": \"You are an expert musician and musicologist and prompt engineer. Transform the user's input into a detailed, vivid music prompt for a full instrumental track.\\n\\n1. Start with the genre or style and optional adjectives (e.g., upbeat, dreamy, aggressive).\\n2. List the main instruments that define the track.\\n3. 
Add supporting elements or layers such as pads, harmonics, effects, or field recordings.\\n4. Include rhythm or percussion elements like drums, hi-hats, congas, brushes, or polyrhythms.\\n5. Integrate mood and energy naturally in the sentence (e.g., \\\"creating suspenseful tension\\\" or \\\"bright and uplifting\\\").\\n6. Specify the BPM.\\n7. Specify the track length as an integer in seconds. Use ranges: energetic/dance 120-180s, pop/rock 180-210s, cinematic/ambient 240-300s.\\n8. Combine all elements into one natural, fluid sentence. Avoid semicolons.\\n\\nTemplate:\\nGenre/Style with main instruments, supporting instruments/layers, and rhythm/percussion creating mood/energy. BPM: X. Length: Y seconds\\n\\nExamples:\\n- Jazz ballad with smooth saxophone lead, piano chords, upright bass, brushed drums, and soft strings that swing gently for a warm and cozy evening. BPM: 85. Length: 180 seconds\\n- EDM festival track with pulsing synth leads, plucked arpeggios, layered pads, side-chained bass, punchy kick and snare, and hi-hat rolls creating bright, energetic, and uplifting dance energy. BPM: 128. Length: 150 seconds\\n- Lo-fi hip-hop chill track with mellow electric piano, soft vinyl crackle, subtle synth pads, low-pass filtered drums, percussion loops, and soft plucked bass for a relaxed, dreamy vibe. BPM: 75. Length: 150 seconds\\n- Heavy metal anthem with distorted electric guitars, bass guitar, double bass drums, and cymbal crashes with fast palm-muted riffs creating intense, aggressive energy. BPM: 160. Length: 180 seconds\\n- Melancholic piano piece with soft piano lead, string pads, subtle atmospheric synths, and minimal brush percussion evoking a reflective rainy-day feeling. BPM: 60. Length: 240 seconds\\n- Suspenseful electronic thriller with pulsing bass synth, arpeggiated lead synth, cinematic pads, glitchy percussion, and high string stabs creating dark and tense energy. BPM: 100. 
Length: 200 seconds\\n- Dreamy ambient soundscape with layered pads, soft bell textures, gentle drones, and wind and water field recordings for ethereal and spacious meditation. BPM: 40. Length: 300 seconds\\n- Fingerpicking acoustic guitar solo with harmonics, subtle reverb, occasional shaker and soft stomp percussion, and soft pad layers for warm intimate storytelling. BPM: 70. Length: 120 seconds\\n- Synthwave 80s retro track with arpeggiated synth leads, analog pads, electric bass, punchy electronic drums, gated reverb snares, and atmospheric FX for nostalgic and vibrant energy. BPM: 110. Length: 180 seconds\\n- Tribal percussion ensemble with congas, djembes, bongos, shakers, and frame drums layered with deep synthetic sub-bass in complex polyrhythms. BPM: 100. Length: 140 seconds\\n- 1920s swing jazz with brass section, upright bass, piano, brushed drums, banjo, clarinet, and soft strings that swing lively for energetic dance vibes. BPM: 110. Length: 180 seconds\\n- Futuristic electronic sci-fi track with pulsing bass synth, evolving lead synths, layered pads, glitch percussion, robotic FX, and sub-bass for tense cinematic energy. BPM: 125. Length: 200 seconds\\n- Ambient underwater soundscape with flowing water textures, soft piano motifs, synth drones, distant bells, and underwater reverb for spacious meditative immersion. BPM: 45. Length: 300 seconds\\n- Horror cinematic track with dissonant strings, eerie piano stabs, cinematic percussion including taiko and low toms, and synth FX producing suspenseful creepy tension. BPM: 90. Length: 240 seconds\\n- Reggae track with offbeat guitar, warm basslines, snare, kick, congas, and horn stabs giving laid-back groovy energy. BPM: 85. Length: 150 seconds\\n- Blues track with soulful electric guitar solos, walking bass, piano, and shuffle drums creating expressive and emotive storytelling. BPM: 90. 
Length: 180 seconds\\n- Latin salsa with congas, timbales, horns, piano montunos, bass, and layered percussion for vibrant danceable energy. BPM: 120. Length: 210 seconds\\n- Afrobeat track with electric guitar stabs, horns, layered percussion, congas, shakers, bass groove, and synth pads for vibrant rhythmic energy. BPM: 105. Length: 200 seconds\\n- Indie rock track with electric guitar riffs, bass, live drum kit, layered synths, and subtle strings for energetic yet emotional feel. BPM: 110. Length: 180 seconds\\n- Funk groove with slap bass, electric guitar chords, brass stabs, drums, congas, and rhythmic keyboards creating high-energy danceable rhythm. BPM: 105. Length: 180 seconds\\n- Drum and bass track with fast breakbeat drums, deep sub-bass, sharp synth leads, pads, and atmospheric FX for high-energy club motion. BPM: 175. Length: 150 seconds\\n- Dark ambient track with drones, distant bells, low rumbles, soft wind textures, and synth pads producing eerie immersive tension. BPM: 50. Length: 300 seconds\\n- Tropical house track with marimba, steel drums, soft synths, smooth bass, layered percussion, and light piano riffs for sunny chill dance vibes. BPM: 110. Length: 180 seconds\\n- Progressive rock track with electric guitar leads, organ, bass, drum kit, synth layers, and occasional strings for epic layered energy. BPM: 100. Length: 220 seconds\\n- Music box melody with delicate metallic tones and soft resonance, lullaby style, with gentle ambient reverb. BPM: 60. Length: 20 seconds\\n- Soft piano arpeggio with warm felted tone and slow attack, lullaby style, with intimate room ambience. BPM: 60. Length: 30 seconds\\n- Harp gentle plucked pattern with airy resonance, lullaby style, with dreamy reverb tail. BPM: 65. Length: 25 seconds\\n- Acoustic guitar fingerstyle pattern with warm nylon strings and soft dynamics, lullaby style, with subtle room resonance. BPM: 60. 
Length: 30 seconds\\n- Ambient synth pad with smooth evolving texture and soft harmonics, lullaby style, with wide stereo ambience. BPM: 50. Length: 40 seconds\\n- Early rock piano with walking left-hand bass line, shuffle rhythms, and blues scale improvisations in energetic 1950s boogie-woogie style. BPM: 160. Length: 180 seconds\\n- Trip Hop track with jazzy sampled vibraphone, mid-tempo breakbeat drums, harp, Latin ethnic percussion, and sweeping cinematic strings creating airy, relaxing, soulful lounge vibes. BPM: 90. Length: 180 seconds\\n- Country outlaw cinematic instrumental with blues pedal steel guitar, rustic mandolin, fiddle call-and-response, tape-driven rattly drum kit, autoharp, and soaring accordion solo for raw, emotional southern blues expression. BPM: 85. Length: 200 seconds\\n- Neo Classical track with sweeping string section, elegant horns, and delicate piano creating soothing, hypnotic, modern, soft, and classic mood. BPM: 70. Length: 180 seconds\\n- Art Rock desert track with desolate piano chords, western-themed rhythm guitars, unique lead guitars, rattly vintage drum kit, and supporting bass creating lonely, expansive, beautiful, and strange atmospheres. BPM: 95. Length: 180 seconds\\n- Cinematic Sci-Fi score with dramatic horn section, building marcato strings, gliding bassoon, thunderous cymbals, subdued timpani, and subtle synth drones producing awe-inspiring, uplifting, epic intergalactic energy. BPM: 100. Length: 220 seconds\\n- West Coast Hip Hop instrumental with cascading harp melodies, smooth Rhodes piano chops, vintage boom bap drums, and walking double bass producing raw, street, and soulful block-party vibes. BPM: 92. Length: 180 seconds\\n- Synthwave futuristic track with pulsating synth bass, exciting chords, soaring leads, and reverberating drum machine patterns creating gritty, pounding, and cool energy. BPM: 110. 
Length: 180 seconds\\n- Breakbeat track with complex percussion, intricate breakbeats, gritty synths, lush pads, and 808 bassline producing fresh, modern, futuristic, and rave-ready energy. BPM: 140. Length: 160 seconds\\n- Lounge Jazz 1960s smooth track with laid-back drums, piano chords, double bass, soft electric piano, subtle flute, and unique percussion creating beautiful, atmospheric, eclectic, retro, and chill vibes. BPM: 85. Length: 180 seconds\\n- Latin Jazz 1950s blissful track with laid-back Latin drums, euphoric piano chords, double bass, orchestral accompaniment, acoustic guitar, and vibraphone producing nostalgic, beautiful, atmospheric, cinematic, and chill mood. BPM: 95. Length: 180 seconds\\n- Acid Jazz 1970s summertime track with smooth electric piano, trippy synth leads, laid-back vintage drum kit, fuzzy electric bass, and uplifting violin producing retro, psychedelic, jazzy, relaxing energy. BPM: 100. Length: 180 seconds\\n- Progressive Soul 1970s track with feel-good piano, psychedelic organ, groovy vintage drum kit with percussion, fuzzy electric bass, and synth strings producing retro, raw, soulful, joyous atmosphere. BPM: 90. Length: 180 seconds\\n- Discotheque 1970s French-inspired track with sultry piano, psychedelic guitars, groovy drum kit, fuzzy electric bass, and melancholic organ producing retro, raw, laid-back, and relaxing mood. BPM: 105. Length: 180 seconds\\n- Soul Jazz 1970s track with expressive saxophone, smooth piano, groovy drum kit, rhythmic upright bass, sweeping strings, and minimal vibraphone producing retro, raw, laid-back, and epic energy. BPM: 95. Length: 180 seconds\\n- Vintage R&B 1970s live studio track with subtle brass, smooth piano, sweeping strings, and minimal drums producing retro, beautiful, uplifting, nostalgic mood. BPM: 85. 
Length: 180 seconds\\n- 50s Pop track with Latin influence, string section, bold brass, vibraphone, acoustic guitar, flute, ethnic percussion, and brushed drums creating sexy, epic, vintage, retro, melancholic, jazzy, dramatic energy. BPM: 100. Length: 180 seconds\\n- A piece of calm, quiet, mellow, serene music perfect for a peaceful film score, featuring soft modulating piano, ambient sfx and foley, beautiful vibraphone, and subtle synthesizer drones. The mood is cinematic, thoughtful, serene and nostalgic. BPM: 55. Length: 300 seconds\",\n \"Instrument\": \"You are a music metadata expert. Given an instrument, generate a descriptive prompt for a generative audio model.\\n\\n1. Identify the instrument.\\n2. Add playing style or technique.\\n3. Include details about material, timbre, or texture.\\n4. Add musical style or mood. Specify the genre, context, or emotional character.\\n5. Add spatial or production qualities.\\n6. Specify BPM: Always include a BPM appropriate to the style and context.\\n7. Specify length: Provide an integer in seconds (6–20 s for loops, 20–180 s for stems).\\n\\nExamples:\\n- Synth arpeggio loop with bright detuned oscillators. BPM: 120. Length: 8 seconds\\n- Chord stab loop with sharp percussive attack. BPM: 90. Length: 6 seconds\\n- Guitar muted strum loop with tight rhythmic feel. BPM: 100. Length: 8 seconds\\n- Pluck sequence loop with bright resonant tone. BPM: 128. Length: 10 seconds\\n- Marimba and vibraphone percussive loop with resonant wooden and metallic tones. BPM: 110. Length: 12 seconds\\n- Drum loop with deep muffled kick on beat one, snappy rimshot snare on beats two and four with rolling ghost note fills, and tight closed hi-hats with subtle open accents. BPM: 85. Length: 10 seconds\\n- Drum groove loop with brushed snare swinging on the ride, soft feathered kick on downbeats, and light closed hi-hat taps on the upbeats. BPM: 130. 
Length: 12 seconds\\n- Kick and hi-hat loop with four-on-the-floor punchy kick, tight closed hi-hats on every eighth note, and a sharp dry snare on beats two and four. BPM: 130. Length: 15 seconds\\n- Vinyl crackle drum loop with warm low-pass filtered kick, dusty snare with tape saturation, and shuffled closed hi-hats with subtle vinyl crackle ambiance. BPM: 80. Length: 10 seconds\\n- Ambient pad loop with evolving texture. BPM: 80. Length: 12 seconds\\n- Melodic synth bass groove loop with pumping sidechain feel. BPM: 122. Length: 10 seconds\\n- Melodic Bass slap and pop rhythm loop. BPM: 100. Length: 8 seconds\\n- Acoustic bass walking line loop with natural wooden resonance. BPM: 120. Length: 12 seconds\\n- String pizzicato motif loop, suspenseful, with tight string texture. BPM: 90. Length: 8 seconds\\n- Brass staccato riff loop with sharp bright attack. BPM: 130. Length: 10 seconds\\n- Flute airy melodic loop with wooden headjoint resonance. BPM: 100. Length: 6 seconds\\n- Pan flute ambient loop with breathy timbre. BPM: 75. Length: 8 seconds\\n- Clarinet riff loop with warm smooth reed tone. BPM: 120. Length: 10 seconds\\n- Oboe motif loop, orchestral, with rich double reed resonance. BPM: 80. Length: 8 seconds\\n- Recorder Renaissance motif loop with soft wooden timbre. BPM: 100. Length: 6 seconds\\n- Electric sitar riff loop with buzzing resonant tone. BPM: 90. Length: 10 seconds\\n- Koto plucked motif loop with resonant wooden strings. BPM: 90. Length: 8 seconds\\n- Shamisen folk melody loop with percussive twang. BPM: 100. Length: 8 seconds\\n- Banjo fingerpicking loop with metallic string resonance. BPM: 110. Length: 10 seconds\\n- Mandolin tremolo loop with crisp wooden body tone. BPM: 120. Length: 10 seconds\\n- Acoustic guitar chord vamp loop with natural room resonance. BPM: 110. Length: 12 seconds\\n- Nylon string guitar arpeggio loop with warm, soft timbre. BPM: 90. Length: 15 seconds\\n- Electric guitar riff loop with driven distorted tone. 
BPM: 130. Length: 10 seconds\\n- Slide guitar melody loop with warm resonant glide. BPM: 100. Length: 12 seconds\\n- Steel guitar slide loop with bright pedal steel tone. BPM: 95. Length: 12 seconds\\n- Harpsichord arpeggio loop with crisp plucked attack. BPM: 120. Length: 10 seconds\\n- Rhodes chord vamp loop with warm electric piano tone. BPM: 100. Length: 12 seconds\\n- Clavinet funky rhythm loop. BPM: 105. Length: 10 seconds\\n- Organ chord vamp loop with full drawbar warmth. BPM: 90. Length: 12 seconds\\n- Drum loop with booming 808 kick on beat one, crisp snare on beat three, and rapid triplet hi-hat rolls with open hat accents for aggressive high-energy feel. BPM: 140. Length: 8 seconds\\n- Breakbeat drum loop with chopped Amen-style snare flurries, driving kick on the one, fast sixteenth-note closed hi-hats, and syncopated open hat accents. BPM: 170. Length: 10 seconds\\n- Glitch percussion loop with stuttered kick transients, randomised snare hits processed with bit-crushing, and erratic hi-hat patterns with pitch-shifted metallic ticks. BPM: 120. Length: 12 seconds\\n- Metallic hits loop with distorted kick impacts, processed metal-plate snare slams, and grinding hi-hat noise bursts for aggressive mechanical texture. BPM: 120. Length: 10 seconds\\n- Timpani hits loop, cinematic, with deep resonant kick-like timpani strikes on beat one, rolling snare-style timpani fills, and no hi-hats for a grand orchestral feel. BPM: 70. Length: 8 seconds\\n- Snare roll loop, dramatic, with accelerating snare drum rolls building from soft to crashing, deep supporting kick pulses, and no hi-hats for maximum impact. BPM: 100. Length: 8 seconds\\n- Accordion motif loop with bright reedy bellows tone. BPM: 100. Length: 10 seconds\\n- Harmonica blues riff loop with expressive reed timbre. BPM: 90. Length: 10 seconds\\n- Trombone riff loop with warm sliding brass tone. BPM: 120. Length: 10 seconds\\n- French horn melodic loop, cinematic. BPM: 80. 
Length: 12 seconds\\n- Soprano sax ballad loop. BPM: 70. Length: 12 seconds\\n- Alto sax bebop riff loop. BPM: 200. Length: 10 seconds\\n- Electric violin melodic loop with reverb. BPM: 90. Length: 10 seconds\\n- String pad loop with cinematic texture. BPM: 70. Length: 15 seconds\\n- Granular synth evolving texture loop. BPM: 90. Length: 15 seconds\\n- Piano motif loop with soft felt hammer tone. BPM: 80. Length: 10 seconds\\n- Pad and synth loop with lush detuned shimmer. BPM: 85. Length: 12 seconds\\n- Synth lead loop with sidechain pumping compression. BPM: 128. Length: 10 seconds\\n- Analog synth bassline loop with deep warm low-end. BPM: 122. Length: 12 seconds\\n- FM synth lead motif loop with bright metallic shimmer. BPM: 110. Length: 10 seconds\\n- Bass groove loop with tight rhythmic two-bar pattern. BPM: 100. Length: 16 seconds\\n- Acoustic guitar fingerstyle motif loop with warm wood resonance. BPM: 90. Length: 45 seconds\\n- Sombre acoustic guitar motif loop with cavernous reverb, delicate fingerpicking, and expressive melancholic tone. BPM: 70. Length: 45 seconds\\n- Electric guitar rock riff motif loop. BPM: 130. Length: 40 seconds\\n- Vintage electric guitar motif loop, live-recorded in a vintage studio, with expressive and dynamic solo performance. BPM: 90. Length: 40 seconds\\n- Piano chord progression motif loop with rich harmonic movement. BPM: 120. Length: 60 seconds\\n- String ensemble cinematic motif loop with rich wooden resonance. BPM: 80. Length: 120 seconds\\n- Brass ensemble cinematic motif loop with bright metallic timbre. BPM: 90. Length: 90 seconds\\n- Ethnic percussion ensemble motif loop with deep resonant djembe kick tones, slapped snare-like rim hits on congas, and layered shakers and bells providing hi-hat-like rhythmic texture with polyrhythmic patterns. BPM: 100. Length: 90 seconds\\n- Synth ambient motif loop with evolving textures. BPM: 80. Length: 180 seconds\\n- Motif loop with warm dusty vinyl crackle and tape saturation. 
BPM: 80. Length: 60 seconds\\n- Synth lead and bass motif loop with bright punchy energy. BPM: 128. Length: 90 seconds\\n- Funk band motif loop: bass, drums, guitar. BPM: 100. Length: 90 seconds\\n- Ethnic flute motif for cinematic use. BPM: 80. Length: 30 seconds\\n- Steel drum melodic motif loop with bright metallic resonance. BPM: 110. Length: 20 seconds\\n- Marimba percussive motif loop with resonant wooden tone. BPM: 100. Length: 20 seconds\\n- Vibraphone melodic motif loop with metallic shimmer. BPM: 90. Length: 25 seconds\\n- Piano cinematic motif loop with resonant wooden tone. BPM: 80. Length: 30 seconds\\n- Violin expressive cinematic motif loop with rich wooden resonance. BPM: 75. Length: 25 seconds\\n- Cello expressive motif loop with deep wooden resonance. BPM: 70. Length: 30 seconds\\n- Trumpet expressive motif loop with brassy overtones. BPM: 100. Length: 25 seconds\\n- Sax expressive motif loop with warm reed timbre. BPM: 95. Length: 25 seconds\\n- Ethnic drum ensemble motif loop with booming natural-skin bass drum kicks, sharp hand-slap snare accents on djembes and talking drums, and layered wooden and metal percussion providing rhythmic hi-hat-like patterns. BPM: 95. Length: 30 seconds\\n- Ambient drone motif loop. BPM: 60. Length: 180 seconds\\n- Orchestral tension motif loop. BPM: 90. Length: 150 seconds\\n- Electronic track motif loop with drums, bass, synth. BPM: 128. Length: 180 seconds\",\n \"SFX\": \"You are a professional sound design expert. 
Convert the user's input into a precise, vivid sound effects description suitable for generative audio models.\\n\\nDescribe clearly:\\n- Sound source\\n- Physical character (texture, timbre, material: metal, wood, glass, concrete, etc.)\\n- Spatial qualities (indoor/outdoor, cave/open field/underwater, dry/reverberant, close-up/distant, echoing/muffled)\\n- Temporal evolution (attack, decay, movement, transitions over time)\\n- Include motion or spatial movement if applicable (passing, approaching, stereo movement)\\n\\nAudio length rules:\\n- Very short sounds (impacts, clicks, gunshots): 1–3 seconds\\n- Medium actions (footsteps, object movement, transitions): 3–6 seconds\\n- Ambience / environments: 6–15 seconds\\n- Always append: Length: X seconds (integer only, no decimals).\\n\\nOutput constraints:\\n- Length: 1–2 dense sentences maximum\\n- Output ONLY the final rewritten prompt\\n- No explanations, no formatting, no quotes\\n- Use concise but dense technical language\\n- Focus strictly on sound effects or ambience\\n- Always append: Length: X seconds (integer only, no decimals).\\n\\nQuality guidelines:\\n- Be specific and avoid vague terms\\n- Prioritize clarity and realism\\n- Combine elements into one coherent scene\\n- Avoid redundancy\\n\\nExamples:\\n- Heavy rain hitting a metal roof during a thunderstorm, distant thunder rumbles, stereo, realistic ambience. Length: 45 seconds\\n- Quiet forest at dawn with birds chirping, soft wind through leaves, distant stream flowing. Length: 60 seconds\\n- Busy city street at night, cars passing, muffled conversations, occasional horn, urban ambience. Length: 50 seconds\\n- Ocean waves crashing against rocky cliffs, strong wind, dramatic and cinematic. Length: 70 seconds\\n- Wooden door creaking open slowly in an old house, echoing interior, eerie tone. Length: 3 seconds\\n- Glass bottle shattering on concrete, sharp impact, scattered fragments. 
Length: 2 seconds\\n- Footsteps on gravel, steady walking pace, close perspective. Length: 8 seconds\\n- Typing rapidly on a mechanical keyboard, crisp tactile clicks. Length: 5 seconds\\n- Punch impact with deep bass hit, cinematic trailer style. Length: 2 seconds\\n- Car speeding past at high velocity, doppler effect, realistic whoosh. Length: 3 seconds\\n- Object falling from height and hitting ground with a heavy thud. Length: 2 seconds\\n- Sword swing whooshing through air, fast motion, clean metallic tone. Length: 2 seconds\\n- Futuristic laser blast, clean energy pulse, high-tech sound design. Length: 1 second\\n- Spaceship engine humming, low frequency rumble, interior perspective. Length: 90 seconds\\n- Magical spell casting, shimmering particles, rising tonal energy. Length: 8 seconds\\n- Teleportation effect, glitchy digital distortion with a soft whoosh. Length: 5 seconds\\n- Dark eerie drone with distant whispers, creepy, slow build tension. Length: 120 seconds\\n- Sudden horror jump scare sting, sharp violin hit, cinematic. Length: 1 second\\n- Metal scraping slowly in a dark tunnel, echoing and ominous. Length: 20 seconds\\n- Explosion with debris scattering, deep bass, cinematic realism. Length: 4 seconds\\n- Building collapsing, rumbling concrete, dust and debris falling. Length: 25 seconds\\n- Fire crackling intensely, wood burning, close-up detail. Length: 80 seconds\\n- Gunshot in a large empty warehouse, loud echo decay. Length: 2 seconds\\n- Retro arcade coin insert sound, 8-bit style. Length: 1 second\\n- Level up chime, bright, rewarding, fantasy RPG style. Length: 2 seconds\\n- Error buzzer, short, digital, UI feedback. Length: 1 second\\n- Menu navigation clicks, soft futuristic interface sounds. Length: 3 seconds\\n- Layered soundscape: rain, thunder, footsteps, and distant sirens all blending naturally. Length: 90 seconds\\n- Rapid sequence of three impacts: metal hit, glass break, wood crack, spaced evenly. 
Length: 4 seconds\\n- Sound moving from left to right stereo field: passing motorcycle. Length: 5 seconds\\n- Close vs far perspective transition: footsteps approaching then fading away. Length: 6 seconds\\n- Tape stop sub drop, a massive sub-bass note that mimics a vinyl record or tape machine being turned off, the pitch and speed drop simultaneously, causing the high-end harmonics to smear and thicken as the sound grinds to a halt at a sub-sonic frequency. Length: 11 seconds\\n- Gravel and leaves footsteps, the sound of a hard boot stepping onto dry leaves or gravel, crisp and natural with detailed texture. Length: 11 seconds\\n- Ghostship moan, a massive, deep wooden groan with a low-frequency moan, like heavy timber under immense structural tension, swaying slowly, processed with long, dark wooden room reverb for a sense of scale. Length: 11 seconds\\n- Bicycle chain, a continuous metallic whirring sound of a chain moving over sprockets, with individual teeth catching the links, processed with resonant band-pass filter to emphasize metallic singing. Length: 11 seconds\\n- Warp drive, a sound that starts with a massive suck-back of ambient noise, followed by a supersonic crack and high-pitched zing that disappears into the distance, giving the sense of stretching space-time. Length: 11 seconds\\n- Ice cubes, high-pitched musical clinking of hard ice hitting a thin glass, bright resonant ring with subtle liquid sloshing around the edges. Length: 11 seconds\\n- Paper shuffle, the sound of a thick stack of heavy bond paper being squared up on a desk, dry papery thud with a quick fanning sound as air moves between the pages. Length: 11 seconds\\n- Drawer slam, a blunt, powerful thud made by slamming a wooden desk drawer shut, pronounced low-mid body, slightly distorted for aggressive character. Length: 3 seconds\",\n \"One-shot\": \"You are a music metadata expert. 
Given an instrument or sound, generate a descriptive prompt for a short, isolated one-shot audio sample for music production.\\n\\n1. Identify the instrument or sound source.\\n2. Describe the playing technique or hit type (e.g., pluck, slam, tap, stab).\\n3. Include details about material, timbre, or texture.\\n4. Add spatial or production qualities (dry/wet, room, close-mic).\\n5. Specify length: short integer in seconds (1–11 s).\\n\\nExamples:\\n- Piano key hit with bright percussive attack and resonant wooden body. Length: 2 seconds\\n- Kick drum punchy low-end hit with warm skin resonance. Length: 2 seconds\\n- Snare drum rimshot accent with crisp snare wires. Length: 2 seconds\\n- Acoustic guitar fingerstyle note with warm spruce tone. Length: 3 seconds\\n- Bass pluck with jazzy tone and resonant wooden body. Length: 3 seconds\\n- Electric guitar power chord with distortion. Length: 3 seconds\\n- Metallic glitch percussion hit with sharp metallic texture. Length: 2 seconds\\n- Tabla resonant tone hit with natural skin timbre. Length: 2 seconds\\n- Djembe slap accent with dry wooden resonance. Length: 2 seconds\\n- Synth stab with reverb tail. Length: 3 seconds\\n- Violin expressive note with vibrato and rich wooden resonance. Length: 3 seconds\\n- Cello legato note, cinematic, with warm resonant body. Length: 3 seconds\\n- Trumpet bright accent with slightly brassy overtones. Length: 2 seconds\\n- Melodic saxophone jazz riff with smooth reed timbre and a slight vibrato bend. Length: 3 seconds\\n- Harp pluck with airy tone and resonant strings. Length: 2 seconds\\n- Glockenspiel bell-like note with bright metallic clarity. Length: 2 seconds\\n- Metallic clang sound design hit. Length: 2 seconds\\n- Granular texture hit. Length: 3 seconds\\n- Reversed piano hit. Length: 2 seconds\\n- Synth riser effect. Length: 6 seconds\\n- Percussion impact hit. Length: 2 seconds\\n- Cinematic hit. 
Length: 2 seconds\\n- Dry clap, a crisp, natural single hand clap recorded in a dead room with an extremely sharp transient and no room reflections. Length: 1 second\\n- Studio hat, a classic, natural recording of 14-inch hi-hats played tightly closed, zero ring, very fast decay. Length: 1 second\\n- Disco open hat, bright 14-inch open hi-hat with long, shimmering decay, perfect for disco or dance grooves. Length: 1 second\\n- Pillow kick, acoustic kick drum muffled with a heavy blanket, producing a short, dry \\\"thump\\\" with almost zero resonance. Length: 1 second\\n- Short 808, punchy 808 kick with sharp, distorted transient and fast-decaying sub-tail. Length: 1 second\\n- Egg shaker, classic plastic egg shaker recorded with a small-diaphragm condenser mic, producing a light, consistent \\\"tick\\\" with very short sustain. Length: 1 second\\n- African drums, dynamic African drums and percussion ensemble with natural acoustic textures. Length: 3 seconds\\n- Latin drums, dynamic Latin drums and percussion ensemble featuring authentic rhythmic patterns. Length: 3 seconds\\n- String quartet, euphoric string quartet with dynamic and emotional playing, full of expressive harmonies and movement. Length: 3 seconds\\n- Piano, nostalgic, atmospheric piano piece with dynamic and emotional performance, intimate and resonant. Length: 3 seconds\\n- Analogue drift pad, warm polyphonic pad with three detuned oscillators (saw + triangle), subtle pitch drift, and lush bucket-brigade chorus for wide, nostalgic stereo image. Length: 11 seconds\\n- Phase distortion bass, Casio CZ-style phase-distorted sine wave warped into a jagged sawtooth for retro synth bass tone. Length: 11 seconds\\n- Vibrato saxophone, bright lyrical alto sax with fast fluttery vibrato, reedy vintage tone, captured with ribbon mic for warm nostalgic sound. 
Length: 11 seconds\\n- Lofi upright bass, upright bass recorded with ribbon mic in a wooden room, natural air with slightly boxy resonance, tape-saturated for dusty 1950s jazz feel. Length: 2 seconds\"\n}", + "Music" + ] + }, + { + "id": 40, + "type": "StringReplace", + "pos": [ + 1350, + 900 + ], + "size": [ + 260, + 280 + ], + "flags": {}, + "order": 15, + "mode": 0, + "inputs": [ + { + "localized_name": "string", + "name": "string", + "type": "STRING", + "widget": { + "name": "string" + }, + "link": 59 + }, + { + "localized_name": "find", + "name": "find", + "type": "STRING", + "widget": { + "name": "find" + }, + "link": null + }, + { + "localized_name": "replace", + "name": "replace", + "type": "STRING", + "widget": { + "name": "replace" + }, + "link": 58 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 60 + ] + } + ], + "title": "Text Replace (AUDIO LENGTH)", + "properties": { + "Node name for S&R": "StringReplace" + }, + "widgets_values": [ + "", + "AUDIO_LENGTH", + "" + ] + }, + { + "id": 38, + "type": "StringReplace", + "pos": [ + 720, + 900 + ], + "size": [ + 290, + 280 + ], + "flags": {}, + "order": 13, + "mode": 0, + "inputs": [ + { + "localized_name": "string", + "name": "string", + "type": "STRING", + "widget": { + "name": "string" + }, + "link": null + }, + { + "localized_name": "find", + "name": "find", + "type": "STRING", + "widget": { + "name": "find" + }, + "link": null + }, + { + "localized_name": "replace", + "name": "replace", + "type": "STRING", + "widget": { + "name": "replace" + }, + "link": 66 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 52 + ] + } + ], + "title": "Text Replace (PROMPT TEMPLATE)", + "properties": { + "Node name for S&R": "StringReplace" + }, + "widgets_values": [ + "SYSTEM_PROMPTS\n\nInput: USER_INPUT\nTarget audio length: AUDIO_LENGTH seconds.\nOutput:", + "SYSTEM_PROMPTS", + "" + ] + }, + { + 
"id": 35, + "type": "PrimitiveBoolean", + "pos": [ + -390, + 570 + ], + "size": [ + 400, + 100 + ], + "flags": {}, + "order": 11, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "BOOLEAN", + "widget": { + "name": "value" + }, + "link": 83 + } + ], + "outputs": [ + { + "localized_name": "BOOLEAN", + "name": "BOOLEAN", + "type": "BOOLEAN", + "links": [ + 48 + ] + } + ], + "title": "Boolean (Enable_Reprompt)", + "properties": { + "Node name for S&R": "PrimitiveBoolean" + }, + "widgets_values": [ + true + ] + }, + { + "id": 36, + "type": "PrimitiveFloat", + "pos": [ + -390, + 410 + ], + "size": [ + 400, + 110 + ], + "flags": {}, + "order": 12, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "FLOAT", + "widget": { + "name": "value" + }, + "link": 82 + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "links": [ + 50, + 56 + ] + } + ], + "title": "Float (Duration)", + "properties": { + "Node name for S&R": "PrimitiveFloat" + }, + "widgets_values": [ + 150 + ] + }, + { + "id": 25, + "type": "CheckpointLoaderSimple", + "pos": [ + 100, + 130 + ], + "size": [ + 440, + 190 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "localized_name": "ckpt_name", + "name": "ckpt_name", + "type": "COMBO", + "widget": { + "name": "ckpt_name" + }, + "link": 79 + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": "MODEL", + "links": [ + 30 + ] + }, + { + "localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "links": [] + }, + { + "localized_name": "VAE", + "name": "VAE", + "type": "VAE", + "links": [ + 39 + ] + } + ], + "properties": { + "Node name for S&R": "CheckpointLoaderSimple", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "models": [ + { + "name": "stable_audio_3_medium_base.safetensors", + "url": 
"https://huggingface.co/Comfy-Org/stable-audio-3/resolve/main/checkpoints/stable_audio_3_medium_base.safetensors", + "directory": "checkpoints" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "stable_audio_3_medium_base.safetensors" + ] + }, + { + "id": 26, + "type": "CLIPLoader", + "pos": [ + 100, + 390 + ], + "size": [ + 440, + 170 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "localized_name": "clip_name", + "name": "clip_name", + "type": "COMBO", + "widget": { + "name": "clip_name" + }, + "link": 80 + }, + { + "localized_name": "type", + "name": "type", + "type": "COMBO", + "widget": { + "name": "type" + }, + "link": null + }, + { + "localized_name": "device", + "name": "device", + "shape": 7, + "type": "COMBO", + "widget": { + "name": "device" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "links": [ + 34, + 35 + ] + } + ], + "properties": { + "Node name for S&R": "CLIPLoader", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "models": [ + { + "name": "t5gemma_b_b_ul2.safetensors", + "url": "https://huggingface.co/Comfy-Org/stable-audio-3/resolve/main/text_encoders/t5gemma_b_b_ul2.safetensors", + "directory": "text_encoders" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "t5gemma_b_b_ul2.safetensors", + "stable_audio", + "default" + ] + }, + { + "id": 54, + "type": "PreviewAny", + "pos": [ + 1720, + 1580 + ], + "size": [ + 420, + 550 + ], + "flags": {}, + "order": 20, + "mode": 4, + "inputs": [ + { + "localized_name": "source", + "name": "source", + "type": "*", + "link": 84 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": 
"STRING", + "links": null + } + ], + "properties": { + "Node name for S&R": "PreviewAny" + }, + "widgets_values": [ + null, + null, + null + ] + } + ], + "groups": [ + { + "id": 1, + "title": "Loaders: checkpoint & CLIP", + "bounding": [ + 80, + 50, + 485.721654232725, + 527.2848777754299 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 2, + "title": "CLIP encode: conditioning", + "bounding": [ + 600, + 60, + 470, + 510 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 3, + "title": "User inputs: prompt & duration", + "bounding": [ + -400, + 10, + 430, + 740 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 7, + "title": "Reprompt: full branch (template + LLM)", + "bounding": [ + 60, + 780, + 1630, + 1360 + ], + "color": "#444", + "flags": {} + }, + { + "id": 4, + "title": "Reprompt: JSON extract & template fills", + "bounding": [ + 120, + 820, + 1520, + 650 + ], + "color": "#444", + "flags": {} + }, + { + "id": 5, + "title": "Helpers: duration to string", + "bounding": [ + 1340, + 1180, + 280, + 250 + ], + "color": "#444", + "flags": {} + }, + { + "id": 6, + "title": "Reprompt: Qwen TextGenerate", + "bounding": [ + 680, + 1510, + 960, + 614.65625 + ], + "color": "#444", + "flags": {} + }, + { + "id": 8, + "title": "Audio generation: Stable Audio", + "bounding": [ + 60, + 10, + 1627.3616782294932, + 737.0545987464304 + ], + "color": "#3f789e", + "flags": {} + } + ], + "links": [ + { + "id": 35, + "origin_id": 26, + "origin_slot": 0, + "target_id": 7, + "target_slot": 0, + "type": "CLIP" + }, + { + "id": 13, + "origin_id": 3, + "origin_slot": 0, + "target_id": 12, + "target_slot": 0, + "type": "LATENT" + }, + { + "id": 39, + "origin_id": 25, + "origin_slot": 2, + "target_id": 12, + "target_slot": 1, + "type": "VAE" + }, + { + "id": 50, + "origin_id": 36, + "origin_slot": 0, + "target_id": 11, + "target_slot": 0, + "type": "FLOAT" + }, + { + "id": 30, + "origin_id": 25, + "origin_slot": 0, + "target_id": 3, + "target_slot": 0, + "type": 
"MODEL" + }, + { + "id": 4, + "origin_id": 6, + "origin_slot": 0, + "target_id": 3, + "target_slot": 1, + "type": "CONDITIONING" + }, + { + "id": 6, + "origin_id": 7, + "origin_slot": 0, + "target_id": 3, + "target_slot": 2, + "type": "CONDITIONING" + }, + { + "id": 12, + "origin_id": 11, + "origin_slot": 0, + "target_id": 3, + "target_slot": 3, + "type": "LATENT" + }, + { + "id": 34, + "origin_id": 26, + "origin_slot": 0, + "target_id": 6, + "target_slot": 0, + "type": "CLIP" + }, + { + "id": 49, + "origin_id": 34, + "origin_slot": 0, + "target_id": 6, + "target_slot": 1, + "type": "STRING" + }, + { + "id": 47, + "origin_id": 31, + "origin_slot": 0, + "target_id": 34, + "target_slot": 0, + "type": "STRING" + }, + { + "id": 46, + "origin_id": 28, + "origin_slot": 0, + "target_id": 34, + "target_slot": 1, + "type": "STRING" + }, + { + "id": 48, + "origin_id": 35, + "origin_slot": 0, + "target_id": 34, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 56, + "origin_id": 36, + "origin_slot": 0, + "target_id": 41, + "target_slot": 0, + "type": "FLOAT" + }, + { + "id": 57, + "origin_id": 41, + "origin_slot": 1, + "target_id": 42, + "target_slot": 0, + "type": "INT" + }, + { + "id": 52, + "origin_id": 38, + "origin_slot": 0, + "target_id": 39, + "target_slot": 0, + "type": "STRING" + }, + { + "id": 53, + "origin_id": 31, + "origin_slot": 0, + "target_id": 39, + "target_slot": 2, + "type": "STRING" + }, + { + "id": 40, + "origin_id": 29, + "origin_slot": 0, + "target_id": 28, + "target_slot": 0, + "type": "CLIP" + }, + { + "id": 60, + "origin_id": 40, + "origin_slot": 0, + "target_id": 28, + "target_slot": 4, + "type": "STRING" + }, + { + "id": 65, + "origin_id": 43, + "origin_slot": 0, + "target_id": 49, + "target_slot": 1, + "type": "STRING" + }, + { + "id": 59, + "origin_id": 39, + "origin_slot": 0, + "target_id": 40, + "target_slot": 0, + "type": "STRING" + }, + { + "id": 58, + "origin_id": 42, + "origin_slot": 0, + "target_id": 40, + "target_slot": 2, + 
"type": "STRING" + }, + { + "id": 66, + "origin_id": 49, + "origin_slot": 0, + "target_id": 38, + "target_slot": 2, + "type": "STRING" + }, + { + "id": 27, + "origin_id": 12, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "AUDIO" + }, + { + "id": 68, + "origin_id": -10, + "origin_slot": 0, + "target_id": 31, + "target_slot": 0, + "type": "STRING" + }, + { + "id": 76, + "origin_id": -10, + "origin_slot": 2, + "target_id": 3, + "target_slot": 4, + "type": "INT" + }, + { + "id": 78, + "origin_id": -10, + "origin_slot": 4, + "target_id": 43, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 79, + "origin_id": -10, + "origin_slot": 5, + "target_id": 25, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 80, + "origin_id": -10, + "origin_slot": 6, + "target_id": 26, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 81, + "origin_id": -10, + "origin_slot": 7, + "target_id": 29, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 82, + "origin_id": -10, + "origin_slot": 1, + "target_id": 36, + "target_slot": 0, + "type": "FLOAT" + }, + { + "id": 83, + "origin_id": -10, + "origin_slot": 3, + "target_id": 35, + "target_slot": 0, + "type": "BOOLEAN" + }, + { + "id": 84, + "origin_id": 28, + "origin_slot": 0, + "target_id": 54, + "target_slot": 0, + "type": "STRING" + } + ], + "extra": {}, + "category": "Audio/Music generation", + "description": "Generates music, instrument loops, sound effects, and one-shots from text using the Stable Audio 3 Medium base checkpoint, with optional Qwen 3.5 category-based prompt expansion (Music, Instrument, SFX, One-shot)." 
+ } + ] + }, + "extra": {} +} \ No newline at end of file diff --git a/blueprints/Audio Generation (Stable Audio 3 Medium).json b/blueprints/Audio Generation (Stable Audio 3 Medium).json new file mode 100644 index 000000000..30add5b05 --- /dev/null +++ b/blueprints/Audio Generation (Stable Audio 3 Medium).json @@ -0,0 +1,2091 @@ +{ + "revision": 0, + "last_node_id": 52, + "last_link_id": 0, + "nodes": [ + { + "id": 52, + "type": "8b66c757-fe2f-4184-91f3-479a19deb565", + "pos": [ + 370, + 1120 + ], + "size": [ + 420, + 450 + ], + "flags": { + "collapsed": false + }, + "order": 0, + "mode": 0, + "inputs": [ + { + "label": "user_input", + "name": "user_input", + "type": "STRING", + "widget": { + "name": "user_input" + }, + "link": null + }, + { + "label": "duration", + "name": "duration", + "type": "FLOAT", + "widget": { + "name": "duration" + }, + "link": null + }, + { + "label": "seed", + "name": "seed", + "type": "INT", + "widget": { + "name": "seed" + }, + "link": null + }, + { + "label": "use_reprompt", + "name": "use_reprompt", + "type": "BOOLEAN", + "widget": { + "name": "use_reprompt" + }, + "link": null + }, + { + "label": "reprompt_category", + "name": "category", + "type": "COMBO", + "widget": { + "name": "category" + }, + "link": null + }, + { + "label": "ckpt_name", + "name": "ckpt_name", + "type": "COMBO", + "widget": { + "name": "ckpt_name" + }, + "link": null + }, + { + "label": "sa_clip", + "name": "sa_clip", + "type": "COMBO", + "widget": { + "name": "sa_clip" + }, + "link": null + }, + { + "label": "qwen_clip", + "name": "qwen_clip", + "type": "COMBO", + "widget": { + "name": "qwen_clip" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "AUDIO", + "name": "AUDIO", + "type": "AUDIO", + "links": [] + } + ], + "title": "Audio Generation (Stable Audio 3 Medium)", + "properties": { + "proxyWidgets": [ + [ + "31", + "value" + ], + [ + "36", + "value" + ], + [ + "3", + "seed" + ], + [ + "35", + "value" + ], + [ + "43", + "choice" + ], + [ 
+ "25", + "ckpt_name" + ], + [ + "26", + "clip_name" + ], + [ + "29", + "clip_name" + ] + ] + }, + "widgets_values": [] + } + ], + "links": [], + "version": 0.4, + "definitions": { + "subgraphs": [ + { + "id": "8b66c757-fe2f-4184-91f3-479a19deb565", + "version": 1, + "state": { + "lastGroupId": 8, + "lastNodeId": 56, + "lastLinkId": 84, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Audio Generation (Stable Audio 3 Medium)", + "inputNode": { + "id": -10, + "bounding": [ + -810, + 400, + 155.953125, + 208 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + 1750, + 1041, + 128, + 68 + ] + }, + "inputs": [ + { + "id": "78ae2515-114b-494a-becc-43c7b6c2dc2f", + "name": "user_input", + "type": "STRING", + "linkIds": [ + 68 + ], + "label": "user_input", + "pos": [ + -678.046875, + 424 + ] + }, + { + "id": "5ca95030-aff4-4544-b545-f0d814e0e49a", + "name": "duration", + "type": "FLOAT", + "linkIds": [ + 82 + ], + "label": "duration", + "pos": [ + -678.046875, + 444 + ] + }, + { + "id": "718eb10f-da1a-4cea-a9c7-3040f98fe960", + "name": "seed", + "type": "INT", + "linkIds": [ + 76 + ], + "label": "seed", + "pos": [ + -678.046875, + 464 + ] + }, + { + "id": "dc020099-39e6-4009-9937-408409d71736", + "name": "use_reprompt", + "type": "BOOLEAN", + "linkIds": [ + 83 + ], + "label": "use_reprompt", + "pos": [ + -678.046875, + 484 + ] + }, + { + "id": "edae394c-6324-44d6-8ac5-d8caa5ae2169", + "name": "category", + "type": "COMBO", + "linkIds": [ + 78 + ], + "label": "reprompt_category", + "pos": [ + -678.046875, + 504 + ] + }, + { + "id": "be19b747-6a47-4028-9c30-d52f54a712ea", + "name": "ckpt_name", + "type": "COMBO", + "linkIds": [ + 79 + ], + "label": "ckpt_name", + "pos": [ + -678.046875, + 524 + ] + }, + { + "id": "bc9241a2-bc20-4c5d-8cb1-f2958f598642", + "name": "sa_clip", + "type": "COMBO", + "linkIds": [ + 80 + ], + "label": "sa_clip", + "pos": [ + -678.046875, + 544 + ] + }, + { + "id": "a33a2468-6d6d-4cb6-937c-3510bf16ebac", + "name": 
"qwen_clip", + "type": "COMBO", + "linkIds": [ + 81 + ], + "label": "qwen_clip", + "pos": [ + -678.046875, + 564 + ] + } + ], + "outputs": [ + { + "id": "bbe988dd-5c03-44fd-a965-c712f9204988", + "name": "AUDIO", + "type": "AUDIO", + "linkIds": [ + 27 + ], + "localized_name": "AUDIO", + "pos": [ + 1774, + 1065 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 7, + "type": "CLIPTextEncode", + "pos": [ + 620, + 420 + ], + "size": [ + 440, + 140 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "clip", + "name": "clip", + "type": "CLIP", + "link": 35 + }, + { + "localized_name": "text", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "CONDITIONING", + "name": "CONDITIONING", + "type": "CONDITIONING", + "slot_index": 0, + "links": [ + 6 + ] + } + ], + "properties": { + "Node name for S&R": "CLIPTextEncode", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "" + ], + "color": "#223", + "bgcolor": "#335" + }, + { + "id": 12, + "type": "VAEDecodeAudio", + "pos": [ + 1450, + 110 + ], + "size": [ + 230, + 100 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "localized_name": "samples", + "name": "samples", + "type": "LATENT", + "link": 13 + }, + { + "localized_name": "vae", + "name": "vae", + "type": "VAE", + "link": 39 + } + ], + "outputs": [ + { + "localized_name": "AUDIO", + "name": "AUDIO", + "type": "AUDIO", + "slot_index": 0, + "links": [ + 27 + ] + } + ], + "properties": { + "Node name for S&R": "VAEDecodeAudio", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + 
"id": 11, + "type": "EmptyLatentAudio", + "pos": [ + 630, + 610 + ], + "size": [ + 430, + 140 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "seconds", + "name": "seconds", + "type": "FLOAT", + "widget": { + "name": "seconds" + }, + "link": 50 + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "widget": { + "name": "batch_size" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "LATENT", + "name": "LATENT", + "type": "LATENT", + "links": [ + 12 + ] + } + ], + "properties": { + "Node name for S&R": "EmptyLatentAudio", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 60, + 1 + ] + }, + { + "id": 3, + "type": "KSampler", + "pos": [ + 1100, + 100 + ], + "size": [ + 320, + 350 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 30 + }, + { + "localized_name": "positive", + "name": "positive", + "type": "CONDITIONING", + "link": 4 + }, + { + "localized_name": "negative", + "name": "negative", + "type": "CONDITIONING", + "link": 6 + }, + { + "localized_name": "latent_image", + "name": "latent_image", + "type": "LATENT", + "link": 12 + }, + { + "localized_name": "seed", + "name": "seed", + "type": "INT", + "widget": { + "name": "seed" + }, + "link": 76 + }, + { + "localized_name": "steps", + "name": "steps", + "type": "INT", + "widget": { + "name": "steps" + }, + "link": null + }, + { + "localized_name": "cfg", + "name": "cfg", + "type": "FLOAT", + "widget": { + "name": "cfg" + }, + "link": null + }, + { + "localized_name": "sampler_name", + "name": "sampler_name", + "type": "COMBO", + "widget": { + "name": "sampler_name" + }, + "link": null + }, + { + "localized_name": "scheduler", + "name": "scheduler", + 
"type": "COMBO", + "widget": { + "name": "scheduler" + }, + "link": null + }, + { + "localized_name": "denoise", + "name": "denoise", + "type": "FLOAT", + "widget": { + "name": "denoise" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "LATENT", + "name": "LATENT", + "type": "LATENT", + "slot_index": 0, + "links": [ + 13 + ] + } + ], + "properties": { + "Node name for S&R": "KSampler", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 0, + "randomize", + 8, + 1, + "lcm", + "simple", + 1 + ] + }, + { + "id": 29, + "type": "CLIPLoader", + "pos": [ + 690, + 1580 + ], + "size": [ + 430, + 170 + ], + "flags": {}, + "order": 8, + "mode": 0, + "showAdvanced": false, + "inputs": [ + { + "localized_name": "clip_name", + "name": "clip_name", + "type": "COMBO", + "widget": { + "name": "clip_name" + }, + "link": 81 + }, + { + "localized_name": "type", + "name": "type", + "type": "COMBO", + "widget": { + "name": "type" + }, + "link": null + }, + { + "localized_name": "device", + "name": "device", + "shape": 7, + "type": "COMBO", + "widget": { + "name": "device" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "links": [ + 40 + ] + } + ], + "properties": { + "Node name for S&R": "CLIPLoader", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "models": [ + { + "name": "qwen3.5_2b_bf16.safetensors", + "url": "https://huggingface.co/Comfy-Org/Qwen3.5/resolve/main/text_encoders/qwen3.5_2b_bf16.safetensors", + "directory": "text_encoders" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "qwen3.5_2b_bf16.safetensors", + "stable_diffusion", + "default" + ] + }, + { + 
"id": 6, + "type": "CLIPTextEncode", + "pos": [ + 610, + 130 + ], + "size": [ + 450, + 240 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "clip", + "name": "clip", + "type": "CLIP", + "link": 34 + }, + { + "localized_name": "text", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + }, + "link": 49 + } + ], + "outputs": [ + { + "localized_name": "CONDITIONING", + "name": "CONDITIONING", + "type": "CONDITIONING", + "slot_index": 0, + "links": [ + 4 + ] + } + ], + "properties": { + "Node name for S&R": "CLIPTextEncode", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "" + ], + "color": "#232", + "bgcolor": "#353" + }, + { + "id": 34, + "type": "ComfySwitchNode", + "pos": [ + 210, + 610 + ], + "size": [ + 270, + 130 + ], + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "localized_name": "on_false", + "name": "on_false", + "type": "*", + "link": 47 + }, + { + "localized_name": "on_true", + "name": "on_true", + "type": "*", + "link": 46 + }, + { + "localized_name": "switch", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": 48 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "*", + "links": [ + 49 + ] + } + ], + "properties": { + "Node name for S&R": "ComfySwitchNode" + }, + "widgets_values": [ + false + ] + }, + { + "id": 41, + "type": "ComfyMathExpression", + "pos": [ + 1370, + 1360 + ], + "size": [ + 230, + 80 + ], + "flags": { + "collapsed": true + }, + "order": 16, + "mode": 0, + "inputs": [ + { + "label": "a", + "localized_name": "values.a", + "name": "values.a", + "type": "FLOAT,INT,BOOLEAN", + "link": 56 + }, + { + "label": "b", + "localized_name": "values.b", + "name": "values.b", + "shape": 7, + "type": "FLOAT,INT,BOOLEAN", + 
"link": null + }, + { + "localized_name": "expression", + "name": "expression", + "type": "STRING", + "widget": { + "name": "expression" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "links": null + }, + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 57 + ] + }, + { + "localized_name": "BOOL", + "name": "BOOL", + "type": "BOOLEAN", + "links": null + } + ], + "properties": { + "Node name for S&R": "ComfyMathExpression" + }, + "widgets_values": [ + "a" + ] + }, + { + "id": 42, + "type": "PreviewAny", + "pos": [ + 1370, + 1310 + ], + "size": [ + 230, + 40 + ], + "flags": { + "collapsed": true + }, + "order": 17, + "mode": 0, + "inputs": [ + { + "localized_name": "source", + "name": "source", + "type": "*", + "link": 57 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 58 + ] + } + ], + "properties": { + "Node name for S&R": "PreviewAny" + }, + "widgets_values": [ + null, + null, + null + ] + }, + { + "id": 39, + "type": "StringReplace", + "pos": [ + 1040, + 900 + ], + "size": [ + 270, + 280 + ], + "flags": {}, + "order": 14, + "mode": 0, + "inputs": [ + { + "localized_name": "string", + "name": "string", + "type": "STRING", + "widget": { + "name": "string" + }, + "link": 52 + }, + { + "localized_name": "find", + "name": "find", + "type": "STRING", + "widget": { + "name": "find" + }, + "link": null + }, + { + "localized_name": "replace", + "name": "replace", + "type": "STRING", + "widget": { + "name": "replace" + }, + "link": 53 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 59 + ] + } + ], + "title": "Text Replace (USER INPUT)", + "properties": { + "Node name for S&R": "StringReplace" + }, + "widgets_values": [ + "", + "USER_INPUT", + "" + ] + }, + { + "id": 28, + "type": "TextGenerate", + "pos": [ + 1200, + 1580 + ], + "size": [ + 430, + 
420 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "localized_name": "clip", + "name": "clip", + "type": "CLIP", + "link": 40 + }, + { + "localized_name": "image", + "name": "image", + "shape": 7, + "type": "IMAGE", + "link": null + }, + { + "localized_name": "video", + "name": "video", + "shape": 7, + "type": "IMAGE", + "link": null + }, + { + "localized_name": "audio", + "name": "audio", + "shape": 7, + "type": "AUDIO", + "link": null + }, + { + "localized_name": "prompt", + "name": "prompt", + "type": "STRING", + "widget": { + "name": "prompt" + }, + "link": 60 + }, + { + "localized_name": "max_length", + "name": "max_length", + "type": "INT", + "widget": { + "name": "max_length" + }, + "link": null + }, + { + "localized_name": "sampling_mode", + "name": "sampling_mode", + "type": "COMFY_DYNAMICCOMBO_V3", + "widget": { + "name": "sampling_mode" + }, + "link": null + }, + { + "localized_name": "temperature", + "name": "sampling_mode.temperature", + "type": "FLOAT", + "widget": { + "name": "sampling_mode.temperature" + }, + "link": null + }, + { + "localized_name": "top_k", + "name": "sampling_mode.top_k", + "type": "INT", + "widget": { + "name": "sampling_mode.top_k" + }, + "link": null + }, + { + "localized_name": "top_p", + "name": "sampling_mode.top_p", + "type": "FLOAT", + "widget": { + "name": "sampling_mode.top_p" + }, + "link": null + }, + { + "localized_name": "min_p", + "name": "sampling_mode.min_p", + "type": "FLOAT", + "widget": { + "name": "sampling_mode.min_p" + }, + "link": null + }, + { + "localized_name": "repetition_penalty", + "name": "sampling_mode.repetition_penalty", + "type": "FLOAT", + "widget": { + "name": "sampling_mode.repetition_penalty" + }, + "link": null + }, + { + "localized_name": "seed", + "name": "sampling_mode.seed", + "type": "INT", + "widget": { + "name": "sampling_mode.seed" + }, + "link": null + }, + { + "localized_name": "presence_penalty", + "name": "sampling_mode.presence_penalty", + "shape": 7, + 
"type": "FLOAT", + "widget": { + "name": "sampling_mode.presence_penalty" + }, + "link": null + }, + { + "localized_name": "thinking", + "name": "thinking", + "shape": 7, + "type": "BOOLEAN", + "widget": { + "name": "thinking" + }, + "link": null + }, + { + "localized_name": "use_default_template", + "name": "use_default_template", + "shape": 7, + "type": "BOOLEAN", + "widget": { + "name": "use_default_template" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "generated_text", + "name": "generated_text", + "type": "STRING", + "links": [ + 46, + 84 + ] + } + ], + "properties": { + "Node name for S&R": "TextGenerate", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "", + 256, + "on", + 0.7, + 64, + 0.95, + 0.05, + 1.05, + 0, + 0, + false, + true + ] + }, + { + "id": 31, + "type": "PrimitiveStringMultiline", + "pos": [ + -390, + 160 + ], + "size": [ + 400, + 200 + ], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "STRING", + "widget": { + "name": "value" + }, + "link": 68 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 47, + 53 + ] + } + ], + "title": "User: short description (USER_INPUT in template)", + "properties": { + "Node name for S&R": "PrimitiveStringMultiline" + }, + "widgets_values": [ + "" + ] + }, + { + "id": 43, + "type": "CustomCombo", + "pos": [ + 140, + 910 + ], + "size": [ + 550, + 320 + ], + "flags": {}, + "order": 18, + "mode": 0, + "inputs": [ + { + "localized_name": "choice", + "name": "choice", + "type": "COMBO", + "widget": { + "name": "choice" + }, + "link": 78 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 65 + ] + }, + { + 
"localized_name": "INDEX", + "name": "INDEX", + "type": "INT", + "links": null + } + ], + "title": "Custom Combo (Category index)", + "properties": { + "Node name for S&R": "CustomCombo" + }, + "widgets_values": [ + "Music", + 0, + "Music", + "Instrument", + "SFX", + "One-shot", + "" + ] + }, + { + "id": 49, + "type": "JsonExtractString", + "pos": [ + 720, + 1200 + ], + "size": [ + 300, + 180 + ], + "flags": {}, + "order": 19, + "mode": 0, + "inputs": [ + { + "localized_name": "json_string", + "name": "json_string", + "type": "STRING", + "widget": { + "name": "json_string" + }, + "link": null + }, + { + "localized_name": "key", + "name": "key", + "type": "STRING", + "widget": { + "name": "key" + }, + "link": 65 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 66 + ] + } + ], + "properties": { + "Node name for S&R": "JsonExtractString" + }, + "widgets_values": [ + "{\n \"Music\": \"You are an expert musician and musicologist and prompt engineer. Transform the user's input into a detailed, vivid music prompt for a full instrumental track.\\n\\n1. Start with the genre or style and optional adjectives (e.g., upbeat, dreamy, aggressive).\\n2. List the main instruments that define the track.\\n3. Add supporting elements or layers such as pads, harmonics, effects, or field recordings.\\n4. Include rhythm or percussion elements like drums, hi-hats, congas, brushes, or polyrhythms.\\n5. Integrate mood and energy naturally in the sentence (e.g., \\\"creating suspenseful tension\\\" or \\\"bright and uplifting\\\").\\n6. Specify the BPM.\\n7. Specify the track length as an integer in seconds. Use ranges: energetic/dance 120-180s, pop/rock 180-210s, cinematic/ambient 240-300s.\\n8. Combine all elements into one natural, fluid sentence. Avoid semicolons.\\n\\nTemplate:\\nGenre/Style with main instruments, supporting instruments/layers, and rhythm/percussion creating mood/energy. BPM: X. 
Length: Y seconds\\n\\nExamples:\\n- Jazz ballad with smooth saxophone lead, piano chords, upright bass, brushed drums, and soft strings that swing gently for a warm and cozy evening. BPM: 85. Length: 180 seconds\\n- EDM festival track with pulsing synth leads, plucked arpeggios, layered pads, side-chained bass, punchy kick and snare, and hi-hat rolls creating bright, energetic, and uplifting dance energy. BPM: 128. Length: 150 seconds\\n- Lo-fi hip-hop chill track with mellow electric piano, soft vinyl crackle, subtle synth pads, low-pass filtered drums, percussion loops, and soft plucked bass for a relaxed, dreamy vibe. BPM: 75. Length: 150 seconds\\n- Heavy metal anthem with distorted electric guitars, bass guitar, double bass drums, and cymbal crashes with fast palm-muted riffs creating intense, aggressive energy. BPM: 160. Length: 180 seconds\\n- Melancholic piano piece with soft piano lead, string pads, subtle atmospheric synths, and minimal brush percussion evoking a reflective rainy-day feeling. BPM: 60. Length: 240 seconds\\n- Suspenseful electronic thriller with pulsing bass synth, arpeggiated lead synth, cinematic pads, glitchy percussion, and high string stabs creating dark and tense energy. BPM: 100. Length: 200 seconds\\n- Dreamy ambient soundscape with layered pads, soft bell textures, gentle drones, and wind and water field recordings for ethereal and spacious meditation. BPM: 40. Length: 300 seconds\\n- Fingerpicking acoustic guitar solo with harmonics, subtle reverb, occasional shaker and soft stomp percussion, and soft pad layers for warm intimate storytelling. BPM: 70. Length: 120 seconds\\n- Synthwave 80s retro track with arpeggiated synth leads, analog pads, electric bass, punchy electronic drums, gated reverb snares, and atmospheric FX for nostalgic and vibrant energy. BPM: 110. Length: 180 seconds\\n- Tribal percussion ensemble with congas, djembes, bongos, shakers, and frame drums layered with deep synthetic sub-bass in complex polyrhythms. 
BPM: 100. Length: 140 seconds\\n- 1920s swing jazz with brass section, upright bass, piano, brushed drums, banjo, clarinet, and soft strings that swing lively for energetic dance vibes. BPM: 110. Length: 180 seconds\\n- Futuristic electronic sci-fi track with pulsing bass synth, evolving lead synths, layered pads, glitch percussion, robotic FX, and sub-bass for tense cinematic energy. BPM: 125. Length: 200 seconds\\n- Ambient underwater soundscape with flowing water textures, soft piano motifs, synth drones, distant bells, and underwater reverb for spacious meditative immersion. BPM: 45. Length: 300 seconds\\n- Horror cinematic track with dissonant strings, eerie piano stabs, cinematic percussion including taiko and low toms, and synth FX producing suspenseful creepy tension. BPM: 90. Length: 240 seconds\\n- Reggae track with offbeat guitar, warm basslines, snare, kick, congas, and horn stabs giving laid-back groovy energy. BPM: 85. Length: 150 seconds\\n- Blues track with soulful electric guitar solos, walking bass, piano, and shuffle drums creating expressive and emotive storytelling. BPM: 90. Length: 180 seconds\\n- Latin salsa with congas, timbales, horns, piano montunos, bass, and layered percussion for vibrant danceable energy. BPM: 120. Length: 210 seconds\\n- Afrobeat track with electric guitar stabs, horns, layered percussion, congas, shakers, bass groove, and synth pads for vibrant rhythmic energy. BPM: 105. Length: 200 seconds\\n- Indie rock track with electric guitar riffs, bass, live drum kit, layered synths, and subtle strings for energetic yet emotional feel. BPM: 110. Length: 180 seconds\\n- Funk groove with slap bass, electric guitar chords, brass stabs, drums, congas, and rhythmic keyboards creating high-energy danceable rhythm. BPM: 105. Length: 180 seconds\\n- Drum and bass track with fast breakbeat drums, deep sub-bass, sharp synth leads, pads, and atmospheric FX for high-energy club motion. BPM: 175. 
Length: 150 seconds\\n- Dark ambient track with drones, distant bells, low rumbles, soft wind textures, and synth pads producing eerie immersive tension. BPM: 50. Length: 300 seconds\\n- Tropical house track with marimba, steel drums, soft synths, smooth bass, layered percussion, and light piano riffs for sunny chill dance vibes. BPM: 110. Length: 180 seconds\\n- Progressive rock track with electric guitar leads, organ, bass, drum kit, synth layers, and occasional strings for epic layered energy. BPM: 100. Length: 220 seconds\\n- Music box melody with delicate metallic tones and soft resonance, lullaby style, with gentle ambient reverb. BPM: 60. Length: 20 seconds\\n- Soft piano arpeggio with warm felted tone and slow attack, lullaby style, with intimate room ambience. BPM: 60. Length: 30 seconds\\n- Harp gentle plucked pattern with airy resonance, lullaby style, with dreamy reverb tail. BPM: 65. Length: 25 seconds\\n- Acoustic guitar fingerstyle pattern with warm nylon strings and soft dynamics, lullaby style, with subtle room resonance. BPM: 60. Length: 30 seconds\\n- Ambient synth pad with smooth evolving texture and soft harmonics, lullaby style, with wide stereo ambience. BPM: 50. Length: 40 seconds\\n- Early rock piano with walking left-hand bass line, shuffle rhythms, and blues scale improvisations in energetic 1950s boogie-woogie style. BPM: 160. Length: 180 seconds\\n- Trip Hop track with jazzy sampled vibraphone, mid-tempo breakbeat drums, harp, Latin ethnic percussion, and sweeping cinematic strings creating airy, relaxing, soulful lounge vibes. BPM: 90. Length: 180 seconds\\n- Country outlaw cinematic instrumental with blues pedal steel guitar, rustic mandolin, fiddle call-and-response, tape-driven rattly drum kit, autoharp, and soaring accordion solo for raw, emotional southern blues expression. BPM: 85. 
Length: 200 seconds\\n- Neo Classical track with sweeping string section, elegant horns, and delicate piano creating soothing, hypnotic, modern, soft, and classic mood. BPM: 70. Length: 180 seconds\\n- Art Rock desert track with desolate piano chords, western-themed rhythm guitars, unique lead guitars, rattly vintage drum kit, and supporting bass creating lonely, expansive, beautiful, and strange atmospheres. BPM: 95. Length: 180 seconds\\n- Cinematic Sci-Fi score with dramatic horn section, building marcato strings, gliding bassoon, thunderous cymbals, subdued timpani, and subtle synth drones producing awe-inspiring, uplifting, epic intergalactic energy. BPM: 100. Length: 220 seconds\\n- West Coast Hip Hop instrumental with cascading harp melodies, smooth Rhodes piano chops, vintage boom bap drums, and walking double bass producing raw, street, and soulful block-party vibes. BPM: 92. Length: 180 seconds\\n- Synthwave futuristic track with pulsating synth bass, exciting chords, soaring leads, and reverberating drum machine patterns creating gritty, pounding, and cool energy. BPM: 110. Length: 180 seconds\\n- Breakbeat track with complex percussion, intricate breakbeats, gritty synths, lush pads, and 808 bassline producing fresh, modern, futuristic, and rave-ready energy. BPM: 140. Length: 160 seconds\\n- Lounge Jazz 1960s smooth track with laid-back drums, piano chords, double bass, soft electric piano, subtle flute, and unique percussion creating beautiful, atmospheric, eclectic, retro, and chill vibes. BPM: 85. Length: 180 seconds\\n- Latin Jazz 1950s blissful track with laid-back Latin drums, euphoric piano chords, double bass, orchestral accompaniment, acoustic guitar, and vibraphone producing nostalgic, beautiful, atmospheric, cinematic, and chill mood. BPM: 95. 
Length: 180 seconds\\n- Acid Jazz 1970s summertime track with smooth electric piano, trippy synth leads, laid-back vintage drum kit, fuzzy electric bass, and uplifting violin producing retro, psychedelic, jazzy, relaxing energy. BPM: 100. Length: 180 seconds\\n- Progressive Soul 1970s track with feel-good piano, psychedelic organ, groovy vintage drum kit with percussion, fuzzy electric bass, and synth strings producing retro, raw, soulful, joyous atmosphere. BPM: 90. Length: 180 seconds\\n- Discotheque 1970s French-inspired track with sultry piano, psychedelic guitars, groovy drum kit, fuzzy electric bass, and melancholic organ producing retro, raw, laid-back, and relaxing mood. BPM: 105. Length: 180 seconds\\n- Soul Jazz 1970s track with expressive saxophone, smooth piano, groovy drum kit, rhythmic upright bass, sweeping strings, and minimal vibraphone producing retro, raw, laid-back, and epic energy. BPM: 95. Length: 180 seconds\\n- Vintage R&B 1970s live studio track with subtle brass, smooth piano, sweeping strings, and minimal drums producing retro, beautiful, uplifting, nostalgic mood. BPM: 85. Length: 180 seconds\\n- 50s Pop track with Latin influence, string section, bold brass, vibraphone, acoustic guitar, flute, ethnic percussion, and brushed drums creating sexy, epic, vintage, retro, melancholic, jazzy, dramatic energy. BPM: 100. Length: 180 seconds\\n- A piece of calm, quiet, mellow, serene music perfect for a peaceful film score, featuring soft modulating piano, ambient sfx and foley, beautiful vibraphone, and subtle synthesizer drones. The mood is cinematic, thoughtful, serene and nostalgic. BPM: 55. Length: 300 seconds\",\n \"Instrument\": \"You are a music metadata expert. Given an instrument, generate a descriptive prompt for a generative audio model.\\n\\n1. Identify the instrument.\\n2. Add playing style or technique.\\n3. Include details about material, timbre, or texture.\\n4. Add musical style or mood. 
Specify the genre, context, or emotional character.\\n5. Add spatial or production qualities.\\n6. Specify BPM: Always include a BPM appropriate to the style and context.\\n7. Specify length: Provide an integer in seconds (6–20 s for loops, 20–180 s for stems).\\n\\nExamples:\\n- Synth arpeggio loop with bright detuned oscillators. BPM: 120. Length: 8 seconds\\n- Chord stab loop with sharp percussive attack. BPM: 90. Length: 6 seconds\\n- Guitar muted strum loop with tight rhythmic feel. BPM: 100. Length: 8 seconds\\n- Pluck sequence loop with bright resonant tone. BPM: 128. Length: 10 seconds\\n- Marimba and vibraphone percussive loop with resonant wooden and metallic tones. BPM: 110. Length: 12 seconds\\n- Drum loop with deep muffled kick on beat one, snappy rimshot snare on beats two and four with rolling ghost note fills, and tight closed hi-hats with subtle open accents. BPM: 85. Length: 10 seconds\\n- Drum groove loop with brushed snare swinging on the ride, soft feathered kick on downbeats, and light closed hi-hat taps on the upbeats. BPM: 130. Length: 12 seconds\\n- Kick and hi-hat loop with four-on-the-floor punchy kick, tight closed hi-hats on every eighth note, and a sharp dry snare on beats two and four. BPM: 130. Length: 15 seconds\\n- Vinyl crackle drum loop with warm low-pass filtered kick, dusty snare with tape saturation, and shuffled closed hi-hats with subtle vinyl crackle ambiance. BPM: 80. Length: 10 seconds\\n- Ambient pad loop with evolving texture. BPM: 80. Length: 12 seconds\\n- Melodic synth bass groove loop with pumping sidechain feel. BPM: 122. Length: 10 seconds\\n- Melodic Bass slap and pop rhythm loop. BPM: 100. Length: 8 seconds\\n- Acoustic bass walking line loop with natural wooden resonance. BPM: 120. Length: 12 seconds\\n- String pizzicato motif loop, suspenseful, with tight string texture. BPM: 90. Length: 8 seconds\\n- Brass staccato riff loop with sharp bright attack. BPM: 130. 
Length: 10 seconds\\n- Flute airy melodic loop with wooden headjoint resonance. BPM: 100. Length: 6 seconds\\n- Pan flute ambient loop with breathy timbre. BPM: 75. Length: 8 seconds\\n- Clarinet riff loop with warm smooth reed tone. BPM: 120. Length: 10 seconds\\n- Oboe motif loop, orchestral, with rich double reed resonance. BPM: 80. Length: 8 seconds\\n- Recorder Renaissance motif loop with soft wooden timbre. BPM: 100. Length: 6 seconds\\n- Electric sitar riff loop with buzzing resonant tone. BPM: 90. Length: 10 seconds\\n- Koto plucked motif loop with resonant wooden strings. BPM: 90. Length: 8 seconds\\n- Shamisen folk melody loop with percussive twang. BPM: 100. Length: 8 seconds\\n- Banjo fingerpicking loop with metallic string resonance. BPM: 110. Length: 10 seconds\\n- Mandolin tremolo loop with crisp wooden body tone. BPM: 120. Length: 10 seconds\\n- Acoustic guitar chord vamp loop with natural room resonance. BPM: 110. Length: 12 seconds\\n- Nylon string guitar arpeggio loop with warm, soft timbre. BPM: 90. Length: 15 seconds\\n- Electric guitar riff loop with driven distorted tone. BPM: 130. Length: 10 seconds\\n- Slide guitar melody loop with warm resonant glide. BPM: 100. Length: 12 seconds\\n- Steel guitar slide loop with bright pedal steel tone. BPM: 95. Length: 12 seconds\\n- Harpsichord arpeggio loop with crisp plucked attack. BPM: 120. Length: 10 seconds\\n- Rhodes chord vamp loop with warm electric piano tone. BPM: 100. Length: 12 seconds\\n- Clavinet funky rhythm loop. BPM: 105. Length: 10 seconds\\n- Organ chord vamp loop with full drawbar warmth. BPM: 90. Length: 12 seconds\\n- Drum loop with booming 808 kick on beat one, crisp snare on beat three, and rapid triplet hi-hat rolls with open hat accents for aggressive high-energy feel. BPM: 140. Length: 8 seconds\\n- Breakbeat drum loop with chopped Amen-style snare flurries, driving kick on the one, fast sixteenth-note closed hi-hats, and syncopated open hat accents. BPM: 170. 
Length: 10 seconds\\n- Glitch percussion loop with stuttered kick transients, randomised snare hits processed with bit-crushing, and erratic hi-hat patterns with pitch-shifted metallic ticks. BPM: 120. Length: 12 seconds\\n- Metallic hits loop with distorted kick impacts, processed metal-plate snare slams, and grinding hi-hat noise bursts for aggressive mechanical texture. BPM: 120. Length: 10 seconds\\n- Timpani hits loop, cinematic, with deep resonant kick-like timpani strikes on beat one, rolling snare-style timpani fills, and no hi-hats for a grand orchestral feel. BPM: 70. Length: 8 seconds\\n- Snare roll loop, dramatic, with accelerating snare drum rolls building from soft to crashing, deep supporting kick pulses, and no hi-hats for maximum impact. BPM: 100. Length: 8 seconds\\n- Accordion motif loop with bright reedy bellows tone. BPM: 100. Length: 10 seconds\\n- Harmonica blues riff loop with expressive reed timbre. BPM: 90. Length: 10 seconds\\n- Trombone riff loop with warm sliding brass tone. BPM: 120. Length: 10 seconds\\n- French horn melodic loop, cinematic. BPM: 80. Length: 12 seconds\\n- Soprano sax ballad loop. BPM: 70. Length: 12 seconds\\n- Alto sax bebop riff loop. BPM: 200. Length: 10 seconds\\n- Electric violin melodic loop with reverb. BPM: 90. Length: 10 seconds\\n- String pad loop with cinematic texture. BPM: 70. Length: 15 seconds\\n- Granular synth evolving texture loop. BPM: 90. Length: 15 seconds\\n- Piano motif loop with soft felt hammer tone. BPM: 80. Length: 10 seconds\\n- Pad and synth loop with lush detuned shimmer. BPM: 85. Length: 12 seconds\\n- Synth lead loop with sidechain pumping compression. BPM: 128. Length: 10 seconds\\n- Analog synth bassline loop with deep warm low-end. BPM: 122. Length: 12 seconds\\n- FM synth lead motif loop with bright metallic shimmer. BPM: 110. Length: 10 seconds\\n- Bass groove loop with tight rhythmic two-bar pattern. BPM: 100. 
Length: 16 seconds\\n- Acoustic guitar fingerstyle motif loop with warm wood resonance. BPM: 90. Length: 45 seconds\\n- Sombre acoustic guitar motif loop with cavernous reverb, delicate fingerpicking, and expressive melancholic tone. BPM: 70. Length: 45 seconds\\n- Electric guitar rock riff motif loop. BPM: 130. Length: 40 seconds\\n- Vintage electric guitar motif loop, live-recorded in a vintage studio, with expressive and dynamic solo performance. BPM: 90. Length: 40 seconds\\n- Piano chord progression motif loop with rich harmonic movement. BPM: 120. Length: 60 seconds\\n- String ensemble cinematic motif loop with rich wooden resonance. BPM: 80. Length: 120 seconds\\n- Brass ensemble cinematic motif loop with bright metallic timbre. BPM: 90. Length: 90 seconds\\n- Ethnic percussion ensemble motif loop with deep resonant djembe kick tones, slapped snare-like rim hits on congas, and layered shakers and bells providing hi-hat-like rhythmic texture with polyrhythmic patterns. BPM: 100. Length: 90 seconds\\n- Synth ambient motif loop with evolving textures. BPM: 80. Length: 180 seconds\\n- Motif loop with warm dusty vinyl crackle and tape saturation. BPM: 80. Length: 60 seconds\\n- Synth lead and bass motif loop with bright punchy energy. BPM: 128. Length: 90 seconds\\n- Funk band motif loop: bass, drums, guitar. BPM: 100. Length: 90 seconds\\n- Ethnic flute motif for cinematic use. BPM: 80. Length: 30 seconds\\n- Steel drum melodic motif loop with bright metallic resonance. BPM: 110. Length: 20 seconds\\n- Marimba percussive motif loop with resonant wooden tone. BPM: 100. Length: 20 seconds\\n- Vibraphone melodic motif loop with metallic shimmer. BPM: 90. Length: 25 seconds\\n- Piano cinematic motif loop with resonant wooden tone. BPM: 80. Length: 30 seconds\\n- Violin expressive cinematic motif loop with rich wooden resonance. BPM: 75. Length: 25 seconds\\n- Cello expressive motif loop with deep wooden resonance. BPM: 70. 
Length: 30 seconds\\n- Trumpet expressive motif loop with brassy overtones. BPM: 100. Length: 25 seconds\\n- Sax expressive motif loop with warm reed timbre. BPM: 95. Length: 25 seconds\\n- Ethnic drum ensemble motif loop with booming natural-skin bass drum kicks, sharp hand-slap snare accents on djembes and talking drums, and layered wooden and metal percussion providing rhythmic hi-hat-like patterns. BPM: 95. Length: 30 seconds\\n- Ambient drone motif loop. BPM: 60. Length: 180 seconds\\n- Orchestral tension motif loop. BPM: 90. Length: 150 seconds\\n- Electronic track motif loop with drums, bass, synth. BPM: 128. Length: 180 seconds\",\n \"SFX\": \"You are a professional sound design expert. Convert the user's input into a precise, vivid sound effects description suitable for generative audio models.\\n\\nDescribe clearly:\\n- Sound source\\n- Physical character (texture, timbre, material: metal, wood, glass, concrete, etc.)\\n- Spatial qualities (indoor/outdoor, cave/open field/underwater, dry/reverberant, close-up/distant, echoing/muffled)\\n- Temporal evolution (attack, decay, movement, transitions over time)\\n- Include motion or spatial movement if applicable (passing, approaching, stereo movement)\\n\\nAudio length rules:\\n- Very short sounds (impacts, clicks, gunshots): 1–3 seconds\\n- Medium actions (footsteps, object movement, transitions): 3–6 seconds\\n- Ambience / environments: 6–15 seconds\\n- Always append: Length: X seconds (integer only, no decimals).\\n\\nOutput constraints:\\n- Length: 1–2 dense sentences maximum\\n- Output ONLY the final rewritten prompt\\n- No explanations, no formatting, no quotes\\n- Use concise but dense technical language\\n- Focus strictly on sound effects or ambience\\n- Always append: Length: X seconds (integer only, no decimals).\\n\\nQuality guidelines:\\n- Be specific and avoid vague terms\\n- Prioritize clarity and realism\\n- Combine elements into one coherent scene\\n- Avoid redundancy\\n\\nExamples:\\n- Heavy 
rain hitting a metal roof during a thunderstorm, distant thunder rumbles, stereo, realistic ambience. Length: 45 seconds\\n- Quiet forest at dawn with birds chirping, soft wind through leaves, distant stream flowing. Length: 60 seconds\\n- Busy city street at night, cars passing, muffled conversations, occasional horn, urban ambience. Length: 50 seconds\\n- Ocean waves crashing against rocky cliffs, strong wind, dramatic and cinematic. Length: 70 seconds\\n- Wooden door creaking open slowly in an old house, echoing interior, eerie tone. Length: 3 seconds\\n- Glass bottle shattering on concrete, sharp impact, scattered fragments. Length: 2 seconds\\n- Footsteps on gravel, steady walking pace, close perspective. Length: 8 seconds\\n- Typing rapidly on a mechanical keyboard, crisp tactile clicks. Length: 5 seconds\\n- Punch impact with deep bass hit, cinematic trailer style. Length: 2 seconds\\n- Car speeding past at high velocity, doppler effect, realistic whoosh. Length: 3 seconds\\n- Object falling from height and hitting ground with a heavy thud. Length: 2 seconds\\n- Sword swing whooshing through air, fast motion, clean metallic tone. Length: 2 seconds\\n- Futuristic laser blast, clean energy pulse, high-tech sound design. Length: 1 second\\n- Spaceship engine humming, low frequency rumble, interior perspective. Length: 90 seconds\\n- Magical spell casting, shimmering particles, rising tonal energy. Length: 8 seconds\\n- Teleportation effect, glitchy digital distortion with a soft whoosh. Length: 5 seconds\\n- Dark eerie drone with distant whispers, creepy, slow build tension. Length: 120 seconds\\n- Sudden horror jump scare sting, sharp violin hit, cinematic. Length: 1 second\\n- Metal scraping slowly in a dark tunnel, echoing and ominous. Length: 20 seconds\\n- Explosion with debris scattering, deep bass, cinematic realism. Length: 4 seconds\\n- Building collapsing, rumbling concrete, dust and debris falling.
Length: 25 seconds\\n- Fire crackling intensely, wood burning, close-up detail. Length: 80 seconds\\n- Gunshot in a large empty warehouse, loud echo decay. Length: 2 seconds\\n- Retro arcade coin insert sound, 8-bit style. Length: 1 second\\n- Level up chime, bright, rewarding, fantasy RPG style. Length: 2 seconds\\n- Error buzzer, short, digital, UI feedback. Length: 1 second\\n- Menu navigation clicks, soft futuristic interface sounds. Length: 3 seconds\\n- Layered soundscape: rain, thunder, footsteps, and distant sirens all blending naturally. Length: 90 seconds\\n- Rapid sequence of three impacts: metal hit, glass break, wood crack, spaced evenly. Length: 4 seconds\\n- Sound moving from left to right stereo field: passing motorcycle. Length: 5 seconds\\n- Close vs far perspective transition: footsteps approaching then fading away. Length: 6 seconds\\n- Tape stop sub drop, a massive sub-bass note that mimics a vinyl record or tape machine being turned off, the pitch and speed drop simultaneously, causing the high-end harmonics to smear and thicken as the sound grinds to a halt at a sub-sonic frequency. Length: 11 seconds\\n- Gravel and leaves footsteps, the sound of a hard boot stepping onto dry leaves or gravel, crisp and natural with detailed texture. Length: 11 seconds\\n- Ghostship moan, a massive, deep wooden groan with a low-frequency moan, like heavy timber under immense structural tension, swaying slowly, processed with long, dark wooden room reverb for a sense of scale. Length: 11 seconds\\n- Bicycle chain, a continuous metallic whirring sound of a chain moving over sprockets, with individual teeth catching the links, processed with resonant band-pass filter to emphasize metallic singing. Length: 11 seconds\\n- Warp drive, a sound that starts with a massive suck-back of ambient noise, followed by a supersonic crack and high-pitched zing that disappears into the distance, giving the sense of stretching space-time. 
Length: 11 seconds\\n- Ice cubes, high-pitched musical clinking of hard ice hitting a thin glass, bright resonant ring with subtle liquid sloshing around the edges. Length: 11 seconds\\n- Paper shuffle, the sound of a thick stack of heavy bond paper being squared up on a desk, dry papery thud with a quick fanning sound as air moves between the pages. Length: 11 seconds\\n- Drawer slam, a blunt, powerful thud made by slamming a wooden desk drawer shut, pronounced low-mid body, slightly distorted for aggressive character. Length: 3 seconds\",\n \"One-shot\": \"You are a music metadata expert. Given an instrument or sound, generate a descriptive prompt for a short, isolated one-shot audio sample for music production.\\n\\n1. Identify the instrument or sound source.\\n2. Describe the playing technique or hit type (e.g., pluck, slam, tap, stab).\\n3. Include details about material, timbre, or texture.\\n4. Add spatial or production qualities (dry/wet, room, close-mic).\\n5. Specify length: short integer in seconds (1–11 s).\\n\\nExamples:\\n- Piano key hit with bright percussive attack and resonant wooden body. Length: 2 seconds\\n- Kick drum punchy low-end hit with warm skin resonance. Length: 2 seconds\\n- Snare drum rimshot accent with crisp snare wires. Length: 2 seconds\\n- Acoustic guitar fingerstyle note with warm spruce tone. Length: 3 seconds\\n- Bass pluck with jazzy tone and resonant wooden body. Length: 3 seconds\\n- Electric guitar power chord with distortion. Length: 3 seconds\\n- Metallic glitch percussion hit with sharp metallic texture. Length: 2 seconds\\n- Tabla resonant tone hit with natural skin timbre. Length: 2 seconds\\n- Djembe slap accent with dry wooden resonance. Length: 2 seconds\\n- Synth stab with reverb tail. Length: 3 seconds\\n- Violin expressive note with vibrato and rich wooden resonance. Length: 3 seconds\\n- Cello legato note, cinematic, with warm resonant body. 
Length: 3 seconds\\n- Trumpet bright accent with slightly brassy overtones. Length: 2 seconds\\n- Melodic saxophone jazz riff with smooth reed timbre and a slight vibrato bend. Length: 3 seconds\\n- Harp pluck with airy tone and resonant strings. Length: 2 seconds\\n- Glockenspiel bell-like note with bright metallic clarity. Length: 2 seconds\\n- Metallic clang sound design hit. Length: 2 seconds\\n- Granular texture hit. Length: 3 seconds\\n- Reversed piano hit. Length: 2 seconds\\n- Synth riser effect. Length: 6 seconds\\n- Percussion impact hit. Length: 2 seconds\\n- Cinematic hit. Length: 2 seconds\\n- Dry clap, a crisp, natural single hand clap recorded in a dead room with an extremely sharp transient and no room reflections. Length: 1 second\\n- Studio hat, a classic, natural recording of 14-inch hi-hats played tightly closed, zero ring, very fast decay. Length: 1 second\\n- Disco open hat, bright 14-inch open hi-hat with long, shimmering decay, perfect for disco or dance grooves. Length: 1 second\\n- Pillow kick, acoustic kick drum muffled with a heavy blanket, producing a short, dry \\\"thump\\\" with almost zero resonance. Length: 1 second\\n- Short 808, punchy 808 kick with sharp, distorted transient and fast-decaying sub-tail. Length: 1 second\\n- Egg shaker, classic plastic egg shaker recorded with a small-diaphragm condenser mic, producing a light, consistent \\\"tick\\\" with very short sustain. Length: 1 second\\n- African drums, dynamic African drums and percussion ensemble with natural acoustic textures. Length: 3 seconds\\n- Latin drums, dynamic Latin drums and percussion ensemble featuring authentic rhythmic patterns. Length: 3 seconds\\n- String quartet, euphoric string quartet with dynamic and emotional playing, full of expressive harmonies and movement. Length: 3 seconds\\n- Piano, nostalgic, atmospheric piano piece with dynamic and emotional performance, intimate and resonant. 
Length: 3 seconds\\n- Analogue drift pad, warm polyphonic pad with three detuned oscillators (saw + triangle), subtle pitch drift, and lush bucket-brigade chorus for wide, nostalgic stereo image. Length: 11 seconds\\n- Phase distortion bass, Casio CZ-style phase-distorted sine wave warped into a jagged sawtooth for retro synth bass tone. Length: 11 seconds\\n- Vibrato saxophone, bright lyrical alto sax with fast fluttery vibrato, reedy vintage tone, captured with ribbon mic for warm nostalgic sound. Length: 11 seconds\\n- Lofi upright bass, upright bass recorded with ribbon mic in a wooden room, natural air with slightly boxy resonance, tape-saturated for dusty 1950s jazz feel. Length: 2 seconds\"\n}", + "Music" + ] + }, + { + "id": 40, + "type": "StringReplace", + "pos": [ + 1350, + 900 + ], + "size": [ + 260, + 280 + ], + "flags": {}, + "order": 15, + "mode": 0, + "inputs": [ + { + "localized_name": "string", + "name": "string", + "type": "STRING", + "widget": { + "name": "string" + }, + "link": 59 + }, + { + "localized_name": "find", + "name": "find", + "type": "STRING", + "widget": { + "name": "find" + }, + "link": null + }, + { + "localized_name": "replace", + "name": "replace", + "type": "STRING", + "widget": { + "name": "replace" + }, + "link": 58 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 60 + ] + } + ], + "title": "Text Replace (AUDIO LENGTH)", + "properties": { + "Node name for S&R": "StringReplace" + }, + "widgets_values": [ + "", + "AUDIO_LENGTH", + "" + ] + }, + { + "id": 38, + "type": "StringReplace", + "pos": [ + 720, + 900 + ], + "size": [ + 290, + 280 + ], + "flags": {}, + "order": 13, + "mode": 0, + "inputs": [ + { + "localized_name": "string", + "name": "string", + "type": "STRING", + "widget": { + "name": "string" + }, + "link": null + }, + { + "localized_name": "find", + "name": "find", + "type": "STRING", + "widget": { + "name": "find" + }, + "link": null + }, + { + 
"localized_name": "replace", + "name": "replace", + "type": "STRING", + "widget": { + "name": "replace" + }, + "link": 66 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 52 + ] + } + ], + "title": "Text Replace (PROMPT TEMPLATE)", + "properties": { + "Node name for S&R": "StringReplace" + }, + "widgets_values": [ + "SYSTEM_PROMPTS\n\nInput: USER_INPUT\nTarget audio length: AUDIO_LENGTH seconds.\nOutput:", + "SYSTEM_PROMPTS", + "" + ] + }, + { + "id": 35, + "type": "PrimitiveBoolean", + "pos": [ + -390, + 570 + ], + "size": [ + 400, + 100 + ], + "flags": {}, + "order": 11, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "BOOLEAN", + "widget": { + "name": "value" + }, + "link": 83 + } + ], + "outputs": [ + { + "localized_name": "BOOLEAN", + "name": "BOOLEAN", + "type": "BOOLEAN", + "links": [ + 48 + ] + } + ], + "title": "Boolean (Enable_Reprompt)", + "properties": { + "Node name for S&R": "PrimitiveBoolean" + }, + "widgets_values": [ + true + ] + }, + { + "id": 36, + "type": "PrimitiveFloat", + "pos": [ + -390, + 410 + ], + "size": [ + 400, + 110 + ], + "flags": {}, + "order": 12, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "FLOAT", + "widget": { + "name": "value" + }, + "link": 82 + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "links": [ + 50, + 56 + ] + } + ], + "title": "Float (Duration)", + "properties": { + "Node name for S&R": "PrimitiveFloat" + }, + "widgets_values": [ + 150 + ] + }, + { + "id": 25, + "type": "CheckpointLoaderSimple", + "pos": [ + 100, + 130 + ], + "size": [ + 440, + 190 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "localized_name": "ckpt_name", + "name": "ckpt_name", + "type": "COMBO", + "widget": { + "name": "ckpt_name" + }, + "link": 79 + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": 
"MODEL", + "links": [ + 30 + ] + }, + { + "localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "links": [] + }, + { + "localized_name": "VAE", + "name": "VAE", + "type": "VAE", + "links": [ + 39 + ] + } + ], + "properties": { + "Node name for S&R": "CheckpointLoaderSimple", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "models": [ + { + "name": "stable_audio_3_medium.safetensors", + "url": "https://huggingface.co/Comfy-Org/stable-audio-3/resolve/main/checkpoints/stable_audio_3_medium.safetensors", + "directory": "checkpoints" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "stable_audio_3_medium.safetensors" + ] + }, + { + "id": 26, + "type": "CLIPLoader", + "pos": [ + 100, + 390 + ], + "size": [ + 440, + 170 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "localized_name": "clip_name", + "name": "clip_name", + "type": "COMBO", + "widget": { + "name": "clip_name" + }, + "link": 80 + }, + { + "localized_name": "type", + "name": "type", + "type": "COMBO", + "widget": { + "name": "type" + }, + "link": null + }, + { + "localized_name": "device", + "name": "device", + "shape": 7, + "type": "COMBO", + "widget": { + "name": "device" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "links": [ + 34, + 35 + ] + } + ], + "properties": { + "Node name for S&R": "CLIPLoader", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "models": [ + { + "name": "t5gemma_b_b_ul2.safetensors", + "url": "https://huggingface.co/Comfy-Org/stable-audio-3/resolve/main/text_encoders/t5gemma_b_b_ul2.safetensors", + "directory": "text_encoders" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 
"t5gemma_b_b_ul2.safetensors", + "stable_audio", + "default" + ] + }, + { + "id": 54, + "type": "PreviewAny", + "pos": [ + 1720, + 1580 + ], + "size": [ + 420, + 550 + ], + "flags": {}, + "order": 20, + "mode": 4, + "inputs": [ + { + "localized_name": "source", + "name": "source", + "type": "*", + "link": 84 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": null + } + ], + "properties": { + "Node name for S&R": "PreviewAny" + }, + "widgets_values": [ + null, + null, + null + ] + } + ], + "groups": [ + { + "id": 1, + "title": "Loaders: checkpoint & CLIP", + "bounding": [ + 80, + 50, + 485.721654232725, + 527.2848777754299 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 2, + "title": "CLIP encode: conditioning", + "bounding": [ + 600, + 60, + 470, + 510 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 3, + "title": "User inputs: prompt & duration", + "bounding": [ + -400, + 10, + 430, + 740 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 7, + "title": "Reprompt: full branch (template + LLM)", + "bounding": [ + 60, + 780, + 1630, + 1360 + ], + "color": "#444", + "flags": {} + }, + { + "id": 4, + "title": "Reprompt: JSON extract & template fills", + "bounding": [ + 120, + 820, + 1520, + 650 + ], + "color": "#444", + "flags": {} + }, + { + "id": 5, + "title": "Helpers: duration to string", + "bounding": [ + 1340, + 1180, + 280, + 250 + ], + "color": "#444", + "flags": {} + }, + { + "id": 6, + "title": "Reprompt: Qwen TextGenerate", + "bounding": [ + 680, + 1510, + 960, + 614.65625 + ], + "color": "#444", + "flags": {} + }, + { + "id": 8, + "title": "Audio generation: Stable Audio", + "bounding": [ + 60, + 10, + 1627.3616782294932, + 737.0545987464304 + ], + "color": "#3f789e", + "flags": {} + } + ], + "links": [ + { + "id": 35, + "origin_id": 26, + "origin_slot": 0, + "target_id": 7, + "target_slot": 0, + "type": "CLIP" + }, + { + "id": 13, + "origin_id": 3, + "origin_slot": 0, + 
"target_id": 12, + "target_slot": 0, + "type": "LATENT" + }, + { + "id": 39, + "origin_id": 25, + "origin_slot": 2, + "target_id": 12, + "target_slot": 1, + "type": "VAE" + }, + { + "id": 50, + "origin_id": 36, + "origin_slot": 0, + "target_id": 11, + "target_slot": 0, + "type": "FLOAT" + }, + { + "id": 30, + "origin_id": 25, + "origin_slot": 0, + "target_id": 3, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 4, + "origin_id": 6, + "origin_slot": 0, + "target_id": 3, + "target_slot": 1, + "type": "CONDITIONING" + }, + { + "id": 6, + "origin_id": 7, + "origin_slot": 0, + "target_id": 3, + "target_slot": 2, + "type": "CONDITIONING" + }, + { + "id": 12, + "origin_id": 11, + "origin_slot": 0, + "target_id": 3, + "target_slot": 3, + "type": "LATENT" + }, + { + "id": 34, + "origin_id": 26, + "origin_slot": 0, + "target_id": 6, + "target_slot": 0, + "type": "CLIP" + }, + { + "id": 49, + "origin_id": 34, + "origin_slot": 0, + "target_id": 6, + "target_slot": 1, + "type": "STRING" + }, + { + "id": 47, + "origin_id": 31, + "origin_slot": 0, + "target_id": 34, + "target_slot": 0, + "type": "STRING" + }, + { + "id": 46, + "origin_id": 28, + "origin_slot": 0, + "target_id": 34, + "target_slot": 1, + "type": "STRING" + }, + { + "id": 48, + "origin_id": 35, + "origin_slot": 0, + "target_id": 34, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 56, + "origin_id": 36, + "origin_slot": 0, + "target_id": 41, + "target_slot": 0, + "type": "FLOAT" + }, + { + "id": 57, + "origin_id": 41, + "origin_slot": 1, + "target_id": 42, + "target_slot": 0, + "type": "INT" + }, + { + "id": 52, + "origin_id": 38, + "origin_slot": 0, + "target_id": 39, + "target_slot": 0, + "type": "STRING" + }, + { + "id": 53, + "origin_id": 31, + "origin_slot": 0, + "target_id": 39, + "target_slot": 2, + "type": "STRING" + }, + { + "id": 40, + "origin_id": 29, + "origin_slot": 0, + "target_id": 28, + "target_slot": 0, + "type": "CLIP" + }, + { + "id": 60, + "origin_id": 40, + "origin_slot": 0, + 
"target_id": 28, + "target_slot": 4, + "type": "STRING" + }, + { + "id": 65, + "origin_id": 43, + "origin_slot": 0, + "target_id": 49, + "target_slot": 1, + "type": "STRING" + }, + { + "id": 59, + "origin_id": 39, + "origin_slot": 0, + "target_id": 40, + "target_slot": 0, + "type": "STRING" + }, + { + "id": 58, + "origin_id": 42, + "origin_slot": 0, + "target_id": 40, + "target_slot": 2, + "type": "STRING" + }, + { + "id": 66, + "origin_id": 49, + "origin_slot": 0, + "target_id": 38, + "target_slot": 2, + "type": "STRING" + }, + { + "id": 27, + "origin_id": 12, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "AUDIO" + }, + { + "id": 68, + "origin_id": -10, + "origin_slot": 0, + "target_id": 31, + "target_slot": 0, + "type": "STRING" + }, + { + "id": 76, + "origin_id": -10, + "origin_slot": 2, + "target_id": 3, + "target_slot": 4, + "type": "INT" + }, + { + "id": 78, + "origin_id": -10, + "origin_slot": 4, + "target_id": 43, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 79, + "origin_id": -10, + "origin_slot": 5, + "target_id": 25, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 80, + "origin_id": -10, + "origin_slot": 6, + "target_id": 26, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 81, + "origin_id": -10, + "origin_slot": 7, + "target_id": 29, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 82, + "origin_id": -10, + "origin_slot": 1, + "target_id": 36, + "target_slot": 0, + "type": "FLOAT" + }, + { + "id": 83, + "origin_id": -10, + "origin_slot": 3, + "target_id": 35, + "target_slot": 0, + "type": "BOOLEAN" + }, + { + "id": 84, + "origin_id": 28, + "origin_slot": 0, + "target_id": 54, + "target_slot": 0, + "type": "STRING" + } + ], + "extra": {}, + "category": "Audio/Music generation", + "description": "Generates music, instrument loops, sound effects, and one-shots from text using Stable Audio 3 Medium, with optional Qwen 3.5 category-based prompt expansion (Music, Instrument, SFX, One-shot)." 
+ } + ] + }, + "extra": {} +} \ No newline at end of file diff --git a/blueprints/Canny to Image (Z-Image-Turbo).json b/blueprints/Canny to Image (Z-Image-Turbo).json index 14deb64cc..903d372b1 100644 --- a/blueprints/Canny to Image (Z-Image-Turbo).json +++ b/blueprints/Canny to Image (Z-Image-Turbo).json @@ -1553,7 +1553,7 @@ "VHS_MetadataImage": true, "VHS_KeepIntermediate": true }, - "category": "Image generation and editing/Canny to image", + "category": "Image generation and editing/Conditioned", "description": "Generates an image from a Canny edge map using Z-Image-Turbo, with text conditioning." } ] diff --git a/blueprints/Canny to Video (LTX 2.0).json b/blueprints/Canny to Video (LTX 2.0).json index a9682c8a4..ed602b521 100644 --- a/blueprints/Canny to Video (LTX 2.0).json +++ b/blueprints/Canny to Video (LTX 2.0).json @@ -3600,7 +3600,7 @@ "extra": { "workflowRendererVersion": "LG" }, - "category": "Video generation and editing/Canny to video", + "category": "Video generation and editing/Conditioned", "description": "Generates video from Canny edge maps using LTX-2, with optional synchronized audio." } ] diff --git a/blueprints/ControlNet (Z-Image-Turbo).json b/blueprints/ControlNet (Z-Image-Turbo).json index fbec95a97..160ee11e2 100644 --- a/blueprints/ControlNet (Z-Image-Turbo).json +++ b/blueprints/ControlNet (Z-Image-Turbo).json @@ -1401,7 +1401,7 @@ "extra": { "workflowRendererVersion": "LG" }, - "category": "Image generation and editing/ControlNet", + "category": "Image generation and editing/Conditioned", "description": "Generates images from a text prompt and ControlNet conditioning (e.g. depth, canny) using Z-Image-Turbo." 
} ] diff --git a/blueprints/Depth to Image (Z-Image-Turbo).json b/blueprints/Depth to Image (Z-Image-Turbo).json index fe9ef0f72..2790827a3 100644 --- a/blueprints/Depth to Image (Z-Image-Turbo).json +++ b/blueprints/Depth to Image (Z-Image-Turbo).json @@ -1579,7 +1579,7 @@ "VHS_MetadataImage": true, "VHS_KeepIntermediate": true }, - "category": "Image generation and editing/Depth to image", + "category": "Image generation and editing/Conditioned", "description": "Generates an image from a depth map using Z-Image-Turbo with text conditioning." }, { diff --git a/blueprints/Depth to Video (ltx 2.0).json b/blueprints/Depth to Video (ltx 2.0).json index bd51e4476..56912de51 100644 --- a/blueprints/Depth to Video (ltx 2.0).json +++ b/blueprints/Depth to Video (ltx 2.0).json @@ -4233,7 +4233,7 @@ "extra": { "workflowRendererVersion": "LG" }, - "category": "Video generation and editing/Depth to video", + "category": "Video generation and editing/Conditioned", "description": "Generates depth-controlled video with LTX-2: motion and structure follow a depth-reference video alongside text prompting, optional first-frame image conditioning, with optional synchronized audio." }, { diff --git a/blueprints/First-Last-Frame to Video (LTX-2.3).json b/blueprints/First-Last-Frame to Video (LTX-2.3).json index f509aefe0..4cae2dc24 100644 --- a/blueprints/First-Last-Frame to Video (LTX-2.3).json +++ b/blueprints/First-Last-Frame to Video (LTX-2.3).json @@ -3350,7 +3350,7 @@ } ], "extra": {}, - "category": "Video generation and editing/First-Last-Frame to Video", + "category": "Video generation and editing/Conditioned", "description": "Generates a video interpolating between first and last keyframes using LTX-2.3." 
} ] diff --git a/blueprints/First-Last-Frame to Video.json b/blueprints/First-Last-Frame to Video.json index 84dfafbcd..d76e1e045 100644 --- a/blueprints/First-Last-Frame to Video.json +++ b/blueprints/First-Last-Frame to Video.json @@ -3350,7 +3350,7 @@ } ], "extra": {}, - "category": "Video generation and editing/First-Last-Frame to Video", + "category": "Video generation and editing/FLF2V", "description": "Generates a video that interpolates between the first and last keyframes using LTX-2.3, including optional audio." } ] diff --git a/blueprints/Geometry Estimation (MoGe).json b/blueprints/Geometry Estimation (MoGe).json new file mode 100644 index 000000000..e6f08bf71 --- /dev/null +++ b/blueprints/Geometry Estimation (MoGe).json @@ -0,0 +1,1266 @@ +{ + "revision": 0, + "last_node_id": 67, + "last_link_id": 0, + "nodes": [ + { + "id": 67, + "type": "936dfaf2-575a-48b5-9e0c-df391319d11f", + "pos": [ + -3950, + 5000 + ], + "size": [ + 430, + 480 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "source_image", + "name": "source_image", + "type": "IMAGE", + "link": null + }, + { + "localized_name": "inference_resolution", + "name": "inference_resolution", + "type": "INT", + "widget": { + "name": "inference_resolution" + }, + "link": null + }, + { + "localized_name": "inference_batch_size", + "name": "inference_batch_size", + "type": "INT", + "widget": { + "name": "inference_batch_size" + }, + "link": null + }, + { + "localized_name": "mesh_frame_index", + "name": "mesh_frame_index", + "type": "INT", + "widget": { + "name": "mesh_frame_index" + }, + "link": null + }, + { + "localized_name": "mesh_decimation", + "name": "mesh_decimation", + "type": "INT", + "widget": { + "name": "mesh_decimation" + }, + "link": null + }, + { + "localized_name": "mesh_gap_threshold", + "name": "mesh_gap_threshold", + "type": "FLOAT", + "widget": { + "name": "mesh_gap_threshold" + }, + "link": null + }, + { + "localized_name": "mesh_texture", + 
"name": "mesh_texture", + "type": "BOOLEAN", + "widget": { + "name": "mesh_texture" + }, + "link": null + }, + { + "localized_name": "moge_model", + "name": "moge_model", + "type": "COMBO", + "widget": { + "name": "moge_model" + }, + "link": null + }, + { + "label": "auto_resize_input", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "mesh", + "name": "mesh", + "type": "MESH", + "links": [] + }, + { + "localized_name": "normal_opengl", + "name": "normal_opengl", + "type": "IMAGE", + "links": [] + }, + { + "localized_name": "normal_directx", + "name": "normal_directx", + "type": "IMAGE", + "links": [] + } + ], + "properties": { + "proxyWidgets": [ + [ + "55", + "resolution_level" + ], + [ + "55", + "batch_size" + ], + [ + "54", + "batch_index" + ], + [ + "54", + "decimation" + ], + [ + "54", + "discontinuity_threshold" + ], + [ + "54", + "texture" + ], + [ + "58", + "model_name" + ], + [ + "66", + "switch" + ] + ], + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [], + "title": "Geometry Estimation (MoGe)" + } + ], + "links": [], + "version": 0.4, + "definitions": { + "subgraphs": [ + { + "id": "936dfaf2-575a-48b5-9e0c-df391319d11f", + "version": 1, + "state": { + "lastGroupId": 1, + "lastNodeId": 69, + "lastLinkId": 91, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Geometry Estimation (MoGe)", + "inputNode": { + "id": -10, + "bounding": [ + -5130, + 5320, + 167.337890625, + 228 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + -3090, + 4966, + 131.51953125, + 108 + ] + }, + "inputs": [ + { + "id": "cc8ce79d-ba20-4a25-a51c-c2afcd35e520", + "name": "source_image", + "type": "IMAGE", + "linkIds": [ + 48, + 55, + 56, + 82 + ], + "localized_name": "source_image", 
+ "pos": [ + -4986.662109375, + 5344 + ] + }, + { + "id": "06eefa21-8e60-49f3-9a34-35b081f4ae52", + "name": "inference_resolution", + "type": "INT", + "linkIds": [ + 73 + ], + "localized_name": "inference_resolution", + "pos": [ + -4986.662109375, + 5364 + ] + }, + { + "id": "616638fe-f603-4d10-bae9-fc87c134380f", + "name": "inference_batch_size", + "type": "INT", + "linkIds": [ + 74 + ], + "localized_name": "inference_batch_size", + "pos": [ + -4986.662109375, + 5384 + ] + }, + { + "id": "fcacfca9-7927-4c38-94da-8ab22256325f", + "name": "mesh_frame_index", + "type": "INT", + "linkIds": [ + 75 + ], + "localized_name": "mesh_frame_index", + "pos": [ + -4986.662109375, + 5404 + ] + }, + { + "id": "acbfe7f9-1b69-42c1-8614-4ccf54b28d4e", + "name": "mesh_decimation", + "type": "INT", + "linkIds": [ + 76 + ], + "localized_name": "mesh_decimation", + "pos": [ + -4986.662109375, + 5424 + ] + }, + { + "id": "cd20f9a7-3a0a-4c4c-98d7-96f423867b87", + "name": "mesh_gap_threshold", + "type": "FLOAT", + "linkIds": [ + 77 + ], + "localized_name": "mesh_gap_threshold", + "pos": [ + -4986.662109375, + 5444 + ] + }, + { + "id": "6f5c15f7-7f77-4fc9-b47b-3514467b06b6", + "name": "mesh_texture", + "type": "BOOLEAN", + "linkIds": [ + 78 + ], + "localized_name": "mesh_texture", + "pos": [ + -4986.662109375, + 5464 + ] + }, + { + "id": "65694805-186e-4181-a721-df8b5af49d31", + "name": "moge_model", + "type": "COMBO", + "linkIds": [ + 79 + ], + "localized_name": "moge_model", + "pos": [ + -4986.662109375, + 5484 + ] + }, + { + "id": "badf1be1-53c6-4fc1-b5cd-79ad3daf1674", + "name": "switch", + "type": "BOOLEAN", + "linkIds": [ + 83 + ], + "label": "auto_resize_input", + "pos": [ + -4986.662109375, + 5504 + ] + } + ], + "outputs": [ + { + "id": "3c616ea0-9a4c-4cff-a405-662320229df0", + "name": "mesh", + "type": "MESH", + "linkIds": [ + 34 + ], + "localized_name": "mesh", + "pos": [ + -3066, + 4990 + ] + }, + { + "id": "ff85a763-b7f7-4bcc-9b1d-a4eaf55ad2f9", + "name": "normal_opengl", + 
"type": "IMAGE", + "linkIds": [ + 62 + ], + "localized_name": "normal_opengl", + "pos": [ + -3066, + 5010 + ] + }, + { + "id": "26b3f88a-0ba0-4d4d-9c7d-0ad76106c844", + "name": "normal_directx", + "type": "IMAGE", + "linkIds": [ + 63 + ], + "localized_name": "normal_directx", + "pos": [ + -3066, + 5030 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 54, + "type": "MoGePointMapToMesh", + "pos": [ + -3440, + 5220 + ], + "size": [ + 290, + 200 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "moge_geometry", + "name": "moge_geometry", + "type": "MOGE_GEOMETRY", + "link": 33 + }, + { + "localized_name": "batch_index", + "name": "batch_index", + "type": "INT", + "widget": { + "name": "batch_index" + }, + "link": 75 + }, + { + "localized_name": "decimation", + "name": "decimation", + "type": "INT", + "widget": { + "name": "decimation" + }, + "link": 76 + }, + { + "localized_name": "discontinuity_threshold", + "name": "discontinuity_threshold", + "type": "FLOAT", + "widget": { + "name": "discontinuity_threshold" + }, + "link": 77 + }, + { + "localized_name": "texture", + "name": "texture", + "type": "BOOLEAN", + "widget": { + "name": "texture" + }, + "link": 78 + } + ], + "outputs": [ + { + "localized_name": "MESH", + "name": "MESH", + "type": "MESH", + "links": [ + 34 + ] + } + ], + "properties": { + "Node name for S&R": "MoGePointMapToMesh", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 0, + 1, + 0.04, + true + ] + }, + { + "id": 55, + "type": "MoGeInference", + "pos": [ + -3790, + 5180 + ], + "size": [ + 270, + 230 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "moge_model", + "name": "moge_model", + "type": "MOGE_MODEL", + "link": 58 + }, + { + "localized_name": "image", + "name": "image", + 
"type": "IMAGE", + "link": 81 + }, + { + "localized_name": "resolution_level", + "name": "resolution_level", + "type": "INT", + "widget": { + "name": "resolution_level" + }, + "link": 73 + }, + { + "localized_name": "fov_x_degrees", + "name": "fov_x_degrees", + "type": "FLOAT", + "widget": { + "name": "fov_x_degrees" + }, + "link": null + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "widget": { + "name": "batch_size" + }, + "link": 74 + }, + { + "localized_name": "force_projection", + "name": "force_projection", + "type": "BOOLEAN", + "widget": { + "name": "force_projection" + }, + "link": null + }, + { + "localized_name": "apply_mask", + "name": "apply_mask", + "type": "BOOLEAN", + "widget": { + "name": "apply_mask" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "moge_geometry", + "name": "moge_geometry", + "type": "MOGE_GEOMETRY", + "links": [ + 33, + 59, + 60 + ] + } + ], + "properties": { + "Node name for S&R": "MoGeInference", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 9, + 0, + 4, + true, + true + ] + }, + { + "id": 58, + "type": "LoadMoGeModel", + "pos": [ + -4180, + 4910 + ], + "size": [ + 270, + 140 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "model_name", + "name": "model_name", + "type": "COMBO", + "widget": { + "name": "model_name" + }, + "link": 79 + } + ], + "outputs": [ + { + "localized_name": "MOGE_MODEL", + "name": "MOGE_MODEL", + "type": "MOGE_MODEL", + "links": [ + 58 + ] + } + ], + "properties": { + "Node name for S&R": "LoadMoGeModel", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "models": [ + { + "name": "moge_2_vitl_normal_fp16.safetensors", + "url": 
"https://huggingface.co/Comfy-Org/MoGe/resolve/main/geometry_estimation/moge_2_vitl_normal_fp16.safetensors", + "directory": "geometry_estimation" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "moge_2_vitl_normal_fp16.safetensors" + ] + }, + { + "id": 59, + "type": "ComfyMathExpression", + "pos": [ + -4720, + 4910 + ], + "size": [ + 400, + 200 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "label": "a", + "localized_name": "values.a", + "name": "values.a", + "type": "FLOAT,INT,BOOLEAN", + "link": 49 + }, + { + "label": "b", + "localized_name": "values.b", + "name": "values.b", + "shape": 7, + "type": "FLOAT,INT,BOOLEAN", + "link": null + }, + { + "localized_name": "expression", + "name": "expression", + "type": "STRING", + "widget": { + "name": "expression" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "links": null + }, + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": null + }, + { + "localized_name": "BOOL", + "name": "BOOL", + "type": "BOOLEAN", + "links": [ + 53 + ] + } + ], + "properties": { + "Node name for S&R": "ComfyMathExpression", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "a > 2048" + ] + }, + { + "id": 60, + "type": "GetImageSize", + "pos": [ + -4980, + 4910 + ], + "size": [ + 230, + 160 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 48 + } + ], + "outputs": [ + { + "localized_name": "width", + "name": "width", + "type": "INT", + "links": [ + 49 + ] + }, + { + "localized_name": "height", + 
"name": "height", + "type": "INT", + "links": null + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "links": null + } + ], + "properties": { + "Node name for S&R": "GetImageSize", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 61, + "type": "ResizeImagesByLongerEdge", + "pos": [ + -4650, + 5210 + ], + "size": [ + 310, + 110 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "link": 55 + }, + { + "localized_name": "longer_edge", + "name": "longer_edge", + "type": "INT", + "widget": { + "name": "longer_edge" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "links": [ + 54 + ] + } + ], + "properties": { + "Node name for S&R": "ResizeImagesByLongerEdge", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 2048 + ] + }, + { + "id": 62, + "type": "ComfySwitchNode", + "pos": [ + -4180, + 5120 + ], + "size": [ + 270, + 130 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "localized_name": "on_false", + "name": "on_false", + "type": "*", + "link": 56 + }, + { + "localized_name": "on_true", + "name": "on_true", + "type": "*", + "link": 54 + }, + { + "localized_name": "switch", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": 53 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "*", + "links": [ + 80 + ] + } + ], + "properties": { + "Node name for S&R": "ComfySwitchNode", + "cnr_id": "comfy-core", + "ver": "0.21.1", + 
"enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + false + ] + }, + { + "id": 63, + "type": "MoGeRender", + "pos": [ + -3430, + 4890 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "localized_name": "moge_geometry", + "name": "moge_geometry", + "type": "MOGE_GEOMETRY", + "link": 59 + }, + { + "localized_name": "output", + "name": "output", + "type": "COMBO", + "widget": { + "name": "output" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 62 + ] + } + ], + "properties": { + "Node name for S&R": "MoGeRender", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "normal_opengl" + ] + }, + { + "id": 64, + "type": "MoGeRender", + "pos": [ + -3430, + 5050 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "localized_name": "moge_geometry", + "name": "moge_geometry", + "type": "MOGE_GEOMETRY", + "link": 60 + }, + { + "localized_name": "output", + "name": "output", + "type": "COMBO", + "widget": { + "name": "output" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 63 + ] + } + ], + "properties": { + "Node name for S&R": "MoGeRender", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "normal_directx" + ] + }, + { + "id": 66, + "type": "ComfySwitchNode", + "pos": [ + -4160, + 5340 + ], + "size": [ + 270, + 130 + 
], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "localized_name": "on_false", + "name": "on_false", + "type": "*", + "link": 82 + }, + { + "localized_name": "on_true", + "name": "on_true", + "type": "*", + "link": 80 + }, + { + "localized_name": "switch", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": 83 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "*", + "links": [ + 81 + ] + } + ], + "properties": { + "Node name for S&R": "ComfySwitchNode", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + true + ] + } + ], + "groups": [ + { + "id": 1, + "title": "auto_resize_if_width_gt_2048", + "bounding": [ + -5000, + 4840, + 690, + 280 + ], + "color": "#3f789e", + "flags": {} + } + ], + "links": [ + { + "id": 33, + "origin_id": 55, + "origin_slot": 0, + "target_id": 54, + "target_slot": 0, + "type": "MOGE_GEOMETRY" + }, + { + "id": 58, + "origin_id": 58, + "origin_slot": 0, + "target_id": 55, + "target_slot": 0, + "type": "MOGE_MODEL" + }, + { + "id": 49, + "origin_id": 60, + "origin_slot": 0, + "target_id": 59, + "target_slot": 0, + "type": "INT" + }, + { + "id": 54, + "origin_id": 61, + "origin_slot": 0, + "target_id": 62, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 53, + "origin_id": 59, + "origin_slot": 2, + "target_id": 62, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 59, + "origin_id": 55, + "origin_slot": 0, + "target_id": 63, + "target_slot": 0, + "type": "MOGE_GEOMETRY" + }, + { + "id": 60, + "origin_id": 55, + "origin_slot": 0, + "target_id": 64, + "target_slot": 0, + "type": "MOGE_GEOMETRY" + }, + { + "id": 48, + "origin_id": -10, + "origin_slot": 0, + "target_id": 60, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 55, + "origin_id": -10, + 
"origin_slot": 0, + "target_id": 61, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 56, + "origin_id": -10, + "origin_slot": 0, + "target_id": 62, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 34, + "origin_id": 54, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "MESH" + }, + { + "id": 62, + "origin_id": 63, + "origin_slot": 0, + "target_id": -20, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 63, + "origin_id": 64, + "origin_slot": 0, + "target_id": -20, + "target_slot": 2, + "type": "IMAGE" + }, + { + "id": 73, + "origin_id": -10, + "origin_slot": 1, + "target_id": 55, + "target_slot": 2, + "type": "INT" + }, + { + "id": 74, + "origin_id": -10, + "origin_slot": 2, + "target_id": 55, + "target_slot": 4, + "type": "INT" + }, + { + "id": 75, + "origin_id": -10, + "origin_slot": 3, + "target_id": 54, + "target_slot": 1, + "type": "INT" + }, + { + "id": 76, + "origin_id": -10, + "origin_slot": 4, + "target_id": 54, + "target_slot": 2, + "type": "INT" + }, + { + "id": 77, + "origin_id": -10, + "origin_slot": 5, + "target_id": 54, + "target_slot": 3, + "type": "FLOAT" + }, + { + "id": 78, + "origin_id": -10, + "origin_slot": 6, + "target_id": 54, + "target_slot": 4, + "type": "BOOLEAN" + }, + { + "id": 79, + "origin_id": -10, + "origin_slot": 7, + "target_id": 58, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 80, + "origin_id": 62, + "origin_slot": 0, + "target_id": 66, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 81, + "origin_id": 66, + "origin_slot": 0, + "target_id": 55, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 82, + "origin_id": -10, + "origin_slot": 0, + "target_id": 66, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 83, + "origin_id": -10, + "origin_slot": 8, + "target_id": 66, + "target_slot": 2, + "type": "BOOLEAN" + } + ], + "category": "3D/Geometry Estimation", + "description": "Estimates 3D scene geometry from an input image using MoGe, outputting a mesh plus OpenGL and 
DirectX normal maps.", + "extra": {} + } + ] + }, + "extra": {} +} \ No newline at end of file diff --git a/blueprints/Image Captioning (gemini).json b/blueprints/Image Captioning (gemini).json index 2fc5d6746..9005e5191 100644 --- a/blueprints/Image Captioning (gemini).json +++ b/blueprints/Image Captioning (gemini).json @@ -310,9 +310,9 @@ "extra": { "workflowRendererVersion": "LG" }, - "category": "Text generation/Image Captioning", + "category": "Image Tools", "description": "Generates descriptive captions for images using Google's Gemini multimodal LLM." } ] } -} +} \ No newline at end of file diff --git a/blueprints/Image to Depth Map (Lotus).json b/blueprints/Image Depth Estimation (Lotus Depth).json similarity index 92% rename from blueprints/Image to Depth Map (Lotus).json rename to blueprints/Image Depth Estimation (Lotus Depth).json index 12f10ba5b..8aa338d0d 100644 --- a/blueprints/Image to Depth Map (Lotus).json +++ b/blueprints/Image Depth Estimation (Lotus Depth).json @@ -1,19 +1,18 @@ { - "id": "6af0a6c1-0161-4528-8685-65776e838d44", "revision": 0, - "last_node_id": 75, - "last_link_id": 245, + "last_node_id": 76, + "last_link_id": 0, "nodes": [ { - "id": 75, - "type": "488652fd-6edf-4d06-8f9f-4d84d3a34eaf", + "id": 76, + "type": "96338968-1242-4f02-b6a1-d496af4bcffe", "pos": [ - 600, - 830 + 670, + 1280 ], "size": [ 400, - 110 + 201.3125 ], "flags": {}, "order": 0, @@ -59,47 +58,44 @@ "links": [] } ], + "title": "Image Depth Estimation (Lotus Depth)", "properties": { "proxyWidgets": [ [ - "-1", + "28", "sigma" ], [ - "-1", + "10", "unet_name" ], [ - "-1", + "14", "vae_name" ] ], "cnr_id": "comfy-core", "ver": "0.14.1" }, - "widgets_values": [ - 999.0000000000002, - "lotus-depth-d-v1-1.safetensors", - "vae-ft-mse-840000-ema-pruned.safetensors" - ] + "widgets_values": [] } ], "links": [], - "groups": [], + "version": 0.4, "definitions": { "subgraphs": [ { - "id": "488652fd-6edf-4d06-8f9f-4d84d3a34eaf", + "id": "96338968-1242-4f02-b6a1-d496af4bcffe", 
"version": 1, "state": { "lastGroupId": 1, - "lastNodeId": 75, + "lastNodeId": 76, "lastLinkId": 245, "lastRerouteId": 0 }, "revision": 0, "config": {}, - "name": "Image to Depth Map (Lotus)", + "name": "Image Depth Estimation (Lotus Depth)", "inputNode": { "id": -10, "bounding": [ @@ -191,12 +187,12 @@ "id": 10, "type": "UNETLoader", "pos": [ - 108.05555555555557, - -253.05555555555557 + 110, + -250 ], "size": [ - 254.93706597222226, - 82 + 260, + 90 ], "flags": {}, "order": 4, @@ -234,9 +230,9 @@ } ], "properties": { + "Node name for S&R": "UNETLoader", "cnr_id": "comfy-core", "ver": "0.3.34", - "Node name for S&R": "UNETLoader", "models": [ { "name": "lotus-depth-d-v1-1.safetensors", @@ -255,12 +251,12 @@ "id": 18, "type": "DisableNoise", "pos": [ - 607.0641494069639, - -268.33337840371513 + 610, + -270 ], "size": [ - 175, - 33.333333333333336 + 180, + 40 ], "flags": {}, "order": 0, @@ -278,26 +274,25 @@ } ], "properties": { + "Node name for S&R": "DisableNoise", "cnr_id": "comfy-core", "ver": "0.3.34", - "Node name for S&R": "DisableNoise", "widget_ue_connectable": {} - }, - "widgets_values": [] + } }, { - "id": 23, + "id": 74, "type": "VAEEncode", "pos": [ 620, 160 ], "size": [ - 175, + 180, 50 ], "flags": {}, - "order": 10, + "order": 11, "mode": 0, "inputs": [ { @@ -325,12 +320,11 @@ } ], "properties": { + "Node name for S&R": "VAEEncode", "cnr_id": "comfy-core", "ver": "0.3.34", - "Node name for S&R": "VAEEncode", "widget_ue_connectable": {} - }, - "widgets_values": [] + } }, { "id": 21, @@ -341,7 +335,7 @@ ], "size": [ 210, - 58 + 60 ], "flags": {}, "order": 1, @@ -369,9 +363,9 @@ } ], "properties": { + "Node name for S&R": "KSamplerSelect", "cnr_id": "comfy-core", "ver": "0.3.34", - "Node name for S&R": "KSamplerSelect", "widget_ue_connectable": {} }, "widgets_values": [ @@ -386,7 +380,7 @@ -170 ], "size": [ - 175, + 180, 50 ], "flags": {}, @@ -418,12 +412,11 @@ } ], "properties": { + "Node name for S&R": "BasicGuider", "cnr_id": "comfy-core", "ver": 
"0.3.34", - "Node name for S&R": "BasicGuider", "widget_ue_connectable": {} - }, - "widgets_values": [] + } }, { "id": 16, @@ -433,8 +426,8 @@ -130 ], "size": [ - 295.99609375, - 271.65798611111114 + 300, + 280 ], "flags": {}, "order": 6, @@ -490,12 +483,11 @@ } ], "properties": { + "Node name for S&R": "SamplerCustomAdvanced", "cnr_id": "comfy-core", "ver": "0.3.34", - "Node name for S&R": "SamplerCustomAdvanced", "widget_ue_connectable": {} - }, - "widgets_values": [] + } }, { "id": 28, @@ -506,10 +498,10 @@ ], "size": [ 210, - 58 + 60 ], "flags": {}, - "order": 11, + "order": 10, "mode": 0, "inputs": [ { @@ -540,9 +532,9 @@ } ], "properties": { + "Node name for S&R": "SetFirstSigma", "cnr_id": "comfy-core", "ver": "0.3.34", - "Node name for S&R": "SetFirstSigma", "widget_ue_connectable": {} }, "widgets_values": [ @@ -557,7 +549,7 @@ -120 ], "size": [ - 175, + 180, 50 ], "flags": {}, @@ -589,12 +581,11 @@ } ], "properties": { + "Node name for S&R": "VAEDecode", "cnr_id": "comfy-core", "ver": "0.3.34", - "Node name for S&R": "VAEDecode", "widget_ue_connectable": {} - }, - "widgets_values": [] + } }, { "id": 22, @@ -604,8 +595,8 @@ -220 ], "size": [ - 175, - 33.333333333333336 + 180, + 40 ], "flags": {}, "order": 9, @@ -630,12 +621,11 @@ } ], "properties": { + "Node name for S&R": "ImageInvert", "cnr_id": "comfy-core", "ver": "0.3.34", - "Node name for S&R": "ImageInvert", "widget_ue_connectable": {} - }, - "widgets_values": [] + } }, { "id": 14, @@ -645,8 +635,8 @@ -90 ], "size": [ - 254.93706597222226, - 58 + 260, + 60 ], "flags": {}, "order": 5, @@ -675,9 +665,9 @@ } ], "properties": { + "Node name for S&R": "VAELoader", "cnr_id": "comfy-core", "ver": "0.3.34", - "Node name for S&R": "VAELoader", "models": [ { "name": "vae-ft-mse-840000-ema-pruned.safetensors", @@ -692,15 +682,15 @@ ] }, { - "id": 68, + "id": 75, "type": "LotusConditioning", "pos": [ 400, -150 ], "size": [ - 175, - 33.333333333333336 + 180, + 40 ], "flags": {}, "order": 2, @@ -718,12 +708,11 @@ 
} ], "properties": { + "Node name for S&R": "LotusConditioning", "cnr_id": "comfy-core", "ver": "0.3.34", - "Node name for S&R": "LotusConditioning", "widget_ue_connectable": {} - }, - "widgets_values": [] + } }, { "id": 20, @@ -734,7 +723,7 @@ ], "size": [ 210, - 106 + 110 ], "flags": {}, "order": 8, @@ -786,9 +775,9 @@ } ], "properties": { + "Node name for S&R": "BasicScheduler", "cnr_id": "comfy-core", "ver": "0.3.34", - "Node name for S&R": "BasicScheduler", "widget_ue_connectable": {} }, "widgets_values": [ @@ -850,7 +839,7 @@ }, { "id": 201, - "origin_id": 23, + "origin_id": 74, "origin_slot": 0, "target_id": 16, "target_slot": 4, @@ -866,7 +855,7 @@ }, { "id": 238, - "origin_id": 68, + "origin_id": 75, "origin_slot": 0, "target_id": 19, "target_slot": 1, @@ -892,7 +881,7 @@ "id": 38, "origin_id": 14, "origin_slot": 0, - "target_id": 23, + "target_id": 74, "target_slot": 1, "type": "VAE" }, @@ -908,7 +897,7 @@ "id": 37, "origin_id": -10, "origin_slot": 0, - "target_id": 23, + "target_id": 74, "target_slot": 0, "type": "IMAGE" }, @@ -948,12 +937,11 @@ "extra": { "workflowRendererVersion": "LG" }, - "category": "Image generation and editing/Depth to image", + "category": "Conditioning & Preprocessors/Depth", "description": "Estimates a monocular depth map from an input image using the Lotus depth estimation model." 
} ] }, - "config": {}, "extra": { "ds": { "scale": 1.3589709866044692, @@ -961,8 +949,6 @@ -138.53613935617864, -786.0629126022195 ] - }, - "workflowRendererVersion": "LG" - }, - "version": 0.4 + } + } } \ No newline at end of file diff --git a/blueprints/Image Depth Estimation (MoGe).json b/blueprints/Image Depth Estimation (MoGe).json new file mode 100644 index 000000000..e2d5d1298 --- /dev/null +++ b/blueprints/Image Depth Estimation (MoGe).json @@ -0,0 +1,1154 @@ +{ + "revision": 0, + "last_node_id": 49, + "last_link_id": 0, + "nodes": [ + { + "id": 49, + "type": "ca1fac5f-abe5-4729-b7fe-2299f6630a65", + "pos": [ + -3970, + 5000 + ], + "size": [ + 430, + 330 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "source_image", + "name": "source_image", + "type": "IMAGE", + "link": null + }, + { + "localized_name": "inference_resolution", + "name": "inference_resolution", + "type": "INT", + "widget": { + "name": "inference_resolution" + }, + "link": null + }, + { + "localized_name": "inference_batch_size", + "name": "inference_batch_size", + "type": "INT", + "widget": { + "name": "inference_batch_size" + }, + "link": null + }, + { + "localized_name": "moge_model", + "name": "moge_model", + "type": "COMBO", + "widget": { + "name": "moge_model" + }, + "link": null + }, + { + "label": "auto_resize_input", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "depth_colored", + "name": "depth_colored", + "type": "IMAGE", + "links": [] + }, + { + "localized_name": "depth", + "name": "depth", + "type": "IMAGE", + "links": [] + }, + { + "name": "MASK", + "type": "MASK", + "links": [] + } + ], + "title": "Image Depth Estimation (MoGe)", + "properties": { + "proxyWidgets": [ + [ + "13", + "resolution_level" + ], + [ + "13", + "batch_size" + ], + [ + "32", + "model_name" + ], + [ + "53", + "switch" + ] + ], + "cnr_id": "comfy-core", + "ver": "0.21.1", + 
"enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [] + } + ], + "links": [], + "version": 0.4, + "definitions": { + "subgraphs": [ + { + "id": "ca1fac5f-abe5-4729-b7fe-2299f6630a65", + "version": 1, + "state": { + "lastGroupId": 1, + "lastNodeId": 69, + "lastLinkId": 90, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Image Depth Estimation (MoGe)", + "description": "Estimates monocular depth from an input image using MoGe, outputting both raw and colorized depth maps plus a mask.", + "inputNode": { + "id": -10, + "bounding": [ + -5130, + 5320, + 167.337890625, + 148 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + -3090, + 4966, + 129, + 108 + ] + }, + "inputs": [ + { + "id": "cc8ce79d-ba20-4a25-a51c-c2afcd35e520", + "name": "source_image", + "type": "IMAGE", + "linkIds": [ + 48, + 55, + 56, + 82 + ], + "localized_name": "source_image", + "pos": [ + -4986.662109375, + 5344 + ] + }, + { + "id": "06eefa21-8e60-49f3-9a34-35b081f4ae52", + "name": "inference_resolution", + "type": "INT", + "linkIds": [ + 73 + ], + "localized_name": "inference_resolution", + "pos": [ + -4986.662109375, + 5364 + ] + }, + { + "id": "616638fe-f603-4d10-bae9-fc87c134380f", + "name": "inference_batch_size", + "type": "INT", + "linkIds": [ + 74 + ], + "localized_name": "inference_batch_size", + "pos": [ + -4986.662109375, + 5384 + ] + }, + { + "id": "65694805-186e-4181-a721-df8b5af49d31", + "name": "moge_model", + "type": "COMBO", + "linkIds": [ + 79 + ], + "localized_name": "moge_model", + "pos": [ + -4986.662109375, + 5404 + ] + }, + { + "id": "badf1be1-53c6-4fc1-b5cd-79ad3daf1674", + "name": "switch", + "type": "BOOLEAN", + "linkIds": [ + 83 + ], + "label": "auto_resize_input", + "pos": [ + -4986.662109375, + 5424 + ] + } + ], + "outputs": [ + { + "id": "59c37b52-074f-49fc-9731-483f899c12c4", + "name": "depth_colored", + 
"type": "IMAGE", + "linkIds": [ + 36 + ], + "localized_name": "depth_colored", + "pos": [ + -3066, + 4990 + ] + }, + { + "id": "f583e936-da5c-4630-9901-391fa605c1f8", + "name": "depth", + "type": "IMAGE", + "linkIds": [ + 40 + ], + "localized_name": "depth", + "pos": [ + -3066, + 5010 + ] + }, + { + "id": "6845b6a1-1980-454a-9451-314f24495c1d", + "name": "MASK", + "type": "MASK", + "linkIds": [ + 86 + ], + "pos": [ + -3066, + 5030 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 13, + "type": "MoGeInference", + "pos": [ + -3790, + 5180 + ], + "size": [ + 270, + 230 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "moge_model", + "name": "moge_model", + "type": "MOGE_MODEL", + "link": 58 + }, + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 81 + }, + { + "localized_name": "resolution_level", + "name": "resolution_level", + "type": "INT", + "widget": { + "name": "resolution_level" + }, + "link": 73 + }, + { + "localized_name": "fov_x_degrees", + "name": "fov_x_degrees", + "type": "FLOAT", + "widget": { + "name": "fov_x_degrees" + }, + "link": null + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "widget": { + "name": "batch_size" + }, + "link": 74 + }, + { + "localized_name": "force_projection", + "name": "force_projection", + "type": "BOOLEAN", + "widget": { + "name": "force_projection" + }, + "link": null + }, + { + "localized_name": "apply_mask", + "name": "apply_mask", + "type": "BOOLEAN", + "widget": { + "name": "apply_mask" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "moge_geometry", + "name": "moge_geometry", + "type": "MOGE_GEOMETRY", + "links": [ + 35, + 39, + 61 + ] + } + ], + "properties": { + "Node name for S&R": "MoGeInference", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + 
"secondTabWidth": 65 + }, + "widgets_values": [ + 3, + 0, + 4, + true, + true + ] + }, + { + "id": 23, + "type": "MoGeRender", + "pos": [ + -3430, + 4870 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "moge_geometry", + "name": "moge_geometry", + "type": "MOGE_GEOMETRY", + "link": 35 + }, + { + "localized_name": "output", + "name": "output", + "type": "COMBO", + "widget": { + "name": "output" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 36 + ] + } + ], + "properties": { + "Node name for S&R": "MoGeRender", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "depth_colored" + ] + }, + { + "id": 25, + "type": "MoGeRender", + "pos": [ + -3430, + 5030 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "moge_geometry", + "name": "moge_geometry", + "type": "MOGE_GEOMETRY", + "link": 39 + }, + { + "localized_name": "output", + "name": "output", + "type": "COMBO", + "widget": { + "name": "output" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 40 + ] + } + ], + "properties": { + "Node name for S&R": "MoGeRender", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "depth" + ] + }, + { + "id": 32, + "type": "LoadMoGeModel", + "pos": [ + -4180, + 4880 + ], + "size": [ + 270, + 140 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "model_name", + "name": "model_name", + "type": 
"COMBO", + "widget": { + "name": "model_name" + }, + "link": 79 + } + ], + "outputs": [ + { + "localized_name": "MOGE_MODEL", + "name": "MOGE_MODEL", + "type": "MOGE_MODEL", + "links": [ + 58 + ] + } + ], + "properties": { + "Node name for S&R": "LoadMoGeModel", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "models": [ + { + "name": "moge_2_vitl_normal_fp16.safetensors", + "url": "https://huggingface.co/Comfy-Org/MoGe/resolve/main/geometry_estimation/moge_2_vitl_normal_fp16.safetensors", + "directory": "geometry_estimation" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "moge_2_vitl_normal_fp16.safetensors" + ] + }, + { + "id": 36, + "type": "ComfyMathExpression", + "pos": [ + -4720, + 4910 + ], + "size": [ + 400, + 200 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "label": "a", + "localized_name": "values.a", + "name": "values.a", + "type": "FLOAT,INT,BOOLEAN", + "link": 49 + }, + { + "label": "b", + "localized_name": "values.b", + "name": "values.b", + "shape": 7, + "type": "FLOAT,INT,BOOLEAN", + "link": null + }, + { + "localized_name": "expression", + "name": "expression", + "type": "STRING", + "widget": { + "name": "expression" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "links": null + }, + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": null + }, + { + "localized_name": "BOOL", + "name": "BOOL", + "type": "BOOLEAN", + "links": [ + 53 + ] + } + ], + "properties": { + "Node name for S&R": "ComfyMathExpression", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "a > 2048" + ] + }, + { + "id": 37, + 
"type": "GetImageSize", + "pos": [ + -4980, + 4910 + ], + "size": [ + 230, + 160 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 48 + } + ], + "outputs": [ + { + "localized_name": "width", + "name": "width", + "type": "INT", + "links": [ + 49 + ] + }, + { + "localized_name": "height", + "name": "height", + "type": "INT", + "links": null + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "links": null + } + ], + "properties": { + "Node name for S&R": "GetImageSize", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 40, + "type": "ResizeImagesByLongerEdge", + "pos": [ + -4650, + 5210 + ], + "size": [ + 310, + 110 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "link": 55 + }, + { + "localized_name": "longer_edge", + "name": "longer_edge", + "type": "INT", + "widget": { + "name": "longer_edge" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "links": [ + 54 + ] + } + ], + "properties": { + "Node name for S&R": "ResizeImagesByLongerEdge", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 2048 + ] + }, + { + "id": 42, + "type": "ComfySwitchNode", + "pos": [ + -4180, + 5060 + ], + "size": [ + 270, + 130 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "localized_name": "on_false", + "name": "on_false", + "type": "*", + "link": 56 + }, + { + "localized_name": "on_true", + "name": "on_true", + "type": "*", + 
"link": 54 + }, + { + "localized_name": "switch", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": 53 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "*", + "links": [ + 80 + ] + } + ], + "properties": { + "Node name for S&R": "ComfySwitchNode", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + false + ] + }, + { + "id": 45, + "type": "MoGeRender", + "pos": [ + -3430, + 5200 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "localized_name": "moge_geometry", + "name": "moge_geometry", + "type": "MOGE_GEOMETRY", + "link": 61 + }, + { + "localized_name": "output", + "name": "output", + "type": "COMBO", + "widget": { + "name": "output" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 85 + ] + } + ], + "properties": { + "Node name for S&R": "MoGeRender", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "mask" + ] + }, + { + "id": 53, + "type": "ComfySwitchNode", + "pos": [ + -4160, + 5340 + ], + "size": [ + 270, + 130 + ], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "localized_name": "on_false", + "name": "on_false", + "type": "*", + "link": 82 + }, + { + "localized_name": "on_true", + "name": "on_true", + "type": "*", + "link": 80 + }, + { + "localized_name": "switch", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": 83 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "*", + "links": [ + 81 + ] 
+ } + ], + "properties": { + "Node name for S&R": "ComfySwitchNode", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + true + ] + }, + { + "id": 68, + "type": "ImageToMask", + "pos": [ + -3420, + 5360 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 85 + }, + { + "localized_name": "channel", + "name": "channel", + "type": "COMBO", + "widget": { + "name": "channel" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "MASK", + "name": "MASK", + "type": "MASK", + "links": [ + 86 + ] + } + ], + "properties": { + "Node name for S&R": "ImageToMask" + }, + "widgets_values": [ + "red" + ] + } + ], + "groups": [ + { + "id": 1, + "title": "auto_resize_if_width_gt_2048", + "bounding": [ + -5000, + 4840, + 690, + 280 + ], + "color": "#3f789e", + "flags": {} + } + ], + "links": [ + { + "id": 58, + "origin_id": 32, + "origin_slot": 0, + "target_id": 13, + "target_slot": 0, + "type": "MOGE_MODEL" + }, + { + "id": 35, + "origin_id": 13, + "origin_slot": 0, + "target_id": 23, + "target_slot": 0, + "type": "MOGE_GEOMETRY" + }, + { + "id": 39, + "origin_id": 13, + "origin_slot": 0, + "target_id": 25, + "target_slot": 0, + "type": "MOGE_GEOMETRY" + }, + { + "id": 49, + "origin_id": 37, + "origin_slot": 0, + "target_id": 36, + "target_slot": 0, + "type": "INT" + }, + { + "id": 54, + "origin_id": 40, + "origin_slot": 0, + "target_id": 42, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 53, + "origin_id": 36, + "origin_slot": 2, + "target_id": 42, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 61, + "origin_id": 13, + "origin_slot": 0, + "target_id": 45, + "target_slot": 0, + "type": "MOGE_GEOMETRY" + }, + { + "id": 48, + "origin_id": -10, + 
"origin_slot": 0, + "target_id": 37, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 55, + "origin_id": -10, + "origin_slot": 0, + "target_id": 40, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 56, + "origin_id": -10, + "origin_slot": 0, + "target_id": 42, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 36, + "origin_id": 23, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 40, + "origin_id": 25, + "origin_slot": 0, + "target_id": -20, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 73, + "origin_id": -10, + "origin_slot": 1, + "target_id": 13, + "target_slot": 2, + "type": "INT" + }, + { + "id": 74, + "origin_id": -10, + "origin_slot": 2, + "target_id": 13, + "target_slot": 4, + "type": "INT" + }, + { + "id": 79, + "origin_id": -10, + "origin_slot": 3, + "target_id": 32, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 80, + "origin_id": 42, + "origin_slot": 0, + "target_id": 53, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 81, + "origin_id": 53, + "origin_slot": 0, + "target_id": 13, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 82, + "origin_id": -10, + "origin_slot": 0, + "target_id": 53, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 83, + "origin_id": -10, + "origin_slot": 4, + "target_id": 53, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 85, + "origin_id": 45, + "origin_slot": 0, + "target_id": 68, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 86, + "origin_id": 68, + "origin_slot": 0, + "target_id": -20, + "target_slot": 2, + "type": "MASK" + } + ], + "extra": {}, + "category": "Conditioning & Preprocessors/Depth" + } + ] + }, + "extra": {} +} \ No newline at end of file diff --git a/blueprints/Image Face Detection (Mediapipe).json b/blueprints/Image Face Detection (Mediapipe).json new file mode 100644 index 000000000..e2548d485 --- /dev/null +++ b/blueprints/Image Face Detection (Mediapipe).json @@ -0,0 +1,779 @@ +{ + 
"revision": 0, + "last_node_id": 33, + "last_link_id": 0, + "nodes": [ + { + "id": 33, + "type": "6062babb-b649-4a71-be9e-20ebce567744", + "pos": [ + -450, + 4240 + ], + "size": [ + 420, + 400 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": null + }, + { + "name": "face_landmarker", + "type": "FACE_LANDMARKER", + "link": null + }, + { + "name": "detector_variant", + "type": "COMBO", + "widget": { + "name": "detector_variant" + }, + "link": null + }, + { + "name": "num_faces", + "type": "INT", + "widget": { + "name": "num_faces" + }, + "link": null + }, + { + "label": "custom_face_oval", + "name": "regions.face_oval", + "type": "BOOLEAN", + "widget": { + "name": "regions.face_oval" + }, + "link": null + }, + { + "label": "custom_lips", + "name": "regions.lips", + "type": "BOOLEAN", + "widget": { + "name": "regions.lips" + }, + "link": null + }, + { + "label": "custom_left_eye", + "name": "regions.left_eye", + "type": "BOOLEAN", + "widget": { + "name": "regions.left_eye" + }, + "link": null + }, + { + "label": "custom_right_eye", + "name": "regions.right_eye", + "type": "BOOLEAN", + "widget": { + "name": "regions.right_eye" + }, + "link": null + }, + { + "label": "custom_irises", + "name": "regions.irises", + "type": "BOOLEAN", + "widget": { + "name": "regions.irises" + }, + "link": null + }, + { + "name": "model_name", + "type": "COMBO", + "widget": { + "name": "model_name" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "face_landmarks", + "name": "face_landmarks", + "type": "FACE_LANDMARKS", + "links": [] + }, + { + "localized_name": "bboxes", + "name": "bboxes", + "type": "BOUNDING_BOX", + "links": [] + }, + { + "label": "mask", + "name": "MASK_1", + "type": "MASK", + "links": [] + } + ], + "title": "Image Face Detection (Mediapipe)", + "properties": { + "proxyWidgets": [ + [ + "11", + "detector_variant" + ], + [ + "11", + "num_faces" + ], + [ + "20", 
+ "regions.face_oval" + ], + [ + "20", + "regions.lips" + ], + [ + "20", + "regions.left_eye" + ], + [ + "20", + "regions.right_eye" + ], + [ + "20", + "regions.irises" + ], + [ + "2", + "model_name" + ] + ], + "cnr_id": "comfy-core", + "ver": "0.22.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [] + } + ], + "links": [], + "version": 0.4, + "definitions": { + "subgraphs": [ + { + "id": "6062babb-b649-4a71-be9e-20ebce567744", + "version": 1, + "state": { + "lastGroupId": 2, + "lastNodeId": 158, + "lastLinkId": 140, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Image Face Detection (Mediapipe)", + "description": "Detects facial landmarks from an image using MediaPipe, outputting landmark data, face bounding boxes, and an optional face-region mask.", + "inputNode": { + "id": -10, + "bounding": [ + -710, + 4300, + 148.880859375, + 248 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + 140, + 4480, + 137.677734375, + 108 + ] + }, + "inputs": [ + { + "id": "705dc1ae-6dc9-4155-92df-52f816ad451e", + "name": "image", + "type": "IMAGE", + "linkIds": [ + 60 + ], + "localized_name": "image", + "pos": [ + -585.119140625, + 4324 + ] + }, + { + "id": "d6277190-732c-4604-b7cd-d3a9588bf761", + "name": "face_landmarker", + "type": "FACE_LANDMARKER", + "linkIds": [ + 74 + ], + "pos": [ + -585.119140625, + 4344 + ] + }, + { + "id": "ac473a08-6a86-42a7-b460-e70c6c5e1e2b", + "name": "detector_variant", + "type": "COMBO", + "linkIds": [ + 75 + ], + "pos": [ + -585.119140625, + 4364 + ] + }, + { + "id": "1bec2252-ca2d-496e-8a33-33a61d21f897", + "name": "num_faces", + "type": "INT", + "linkIds": [ + 76 + ], + "pos": [ + -585.119140625, + 4384 + ] + }, + { + "id": "17994fa2-0ea0-4c9b-a70a-19789c459c80", + "name": "regions.face_oval", + "type": "BOOLEAN", + "linkIds": [ + 77 + ], + "label": "custom_face_oval", + 
"pos": [ + -585.119140625, + 4404 + ] + }, + { + "id": "1c6c5893-2aee-4c37-b702-15ef2e20d863", + "name": "regions.lips", + "type": "BOOLEAN", + "linkIds": [ + 78 + ], + "label": "custom_lips", + "pos": [ + -585.119140625, + 4424 + ] + }, + { + "id": "f353fcea-4b6f-42a1-8fdd-32b3aa1e1f09", + "name": "regions.left_eye", + "type": "BOOLEAN", + "linkIds": [ + 79 + ], + "label": "custom_left_eye", + "pos": [ + -585.119140625, + 4444 + ] + }, + { + "id": "1387e121-c1fb-4522-8f0d-43459e11dd86", + "name": "regions.right_eye", + "type": "BOOLEAN", + "linkIds": [ + 80 + ], + "label": "custom_right_eye", + "pos": [ + -585.119140625, + 4464 + ] + }, + { + "id": "14acb0a0-d1f4-48f3-ba31-811b26236ef9", + "name": "regions.irises", + "type": "BOOLEAN", + "linkIds": [ + 81 + ], + "label": "custom_irises", + "pos": [ + -585.119140625, + 4484 + ] + }, + { + "id": "25a82859-87de-42c8-8431-09948665546e", + "name": "model_name", + "type": "COMBO", + "linkIds": [ + 86 + ], + "pos": [ + -585.119140625, + 4504 + ] + } + ], + "outputs": [ + { + "id": "d2ba3f92-e8b1-49c3-9590-cfad56c54cf4", + "name": "face_landmarks", + "type": "FACE_LANDMARKS", + "linkIds": [ + 44 + ], + "localized_name": "face_landmarks", + "pos": [ + 164, + 4504 + ] + }, + { + "id": "4f356bb0-d4c4-4f93-b4cf-0845a65c4e6d", + "name": "bboxes", + "type": "BOUNDING_BOX", + "linkIds": [ + 25 + ], + "localized_name": "bboxes", + "pos": [ + 164, + 4524 + ] + }, + { + "id": "f6309e1d-6397-4363-b38f-778a122abc51", + "name": "MASK_1", + "type": "MASK", + "linkIds": [ + 83 + ], + "label": "mask", + "pos": [ + 164, + 4544 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 11, + "type": "MediaPipeFaceLandmarker", + "pos": [ + -280, + 4280 + ], + "size": [ + 350, + 220 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "face_detection_model", + "name": "face_detection_model", + "type": "FACE_DETECTION_MODEL", + "link": 66 + }, + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + 
"link": 60 + }, + { + "localized_name": "detector_variant", + "name": "detector_variant", + "type": "COMBO", + "widget": { + "name": "detector_variant" + }, + "link": 75 + }, + { + "localized_name": "num_faces", + "name": "num_faces", + "type": "INT", + "widget": { + "name": "num_faces" + }, + "link": 76 + }, + { + "localized_name": "min_confidence", + "name": "min_confidence", + "type": "FLOAT", + "widget": { + "name": "min_confidence" + }, + "link": null + }, + { + "localized_name": "missing_frame_fallback", + "name": "missing_frame_fallback", + "type": "COMBO", + "widget": { + "name": "missing_frame_fallback" + }, + "link": null + }, + { + "name": "face_landmarker", + "type": "FACE_LANDMARKER", + "link": 74 + } + ], + "outputs": [ + { + "localized_name": "face_landmarks", + "name": "face_landmarks", + "type": "FACE_LANDMARKS", + "links": [ + 44, + 46 + ] + }, + { + "localized_name": "bboxes", + "name": "bboxes", + "type": "BOUNDING_BOX", + "links": [ + 25 + ] + } + ], + "properties": { + "Node name for S&R": "MediaPipeFaceLandmarker", + "cnr_id": "comfy-core", + "ver": "0.22.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "full", + 0, + 0.5, + "empty" + ] + }, + { + "id": 2, + "type": "LoadMediaPipeFaceLandmarker", + "pos": [ + -290, + 4060 + ], + "size": [ + 350, + 140 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "model_name", + "name": "model_name", + "type": "COMBO", + "widget": { + "name": "model_name" + }, + "link": 86 + } + ], + "outputs": [ + { + "localized_name": "FACE_DETECTION_MODEL", + "name": "FACE_DETECTION_MODEL", + "type": "FACE_DETECTION_MODEL", + "links": [ + 66 + ] + } + ], + "properties": { + "Node name for S&R": "LoadMediaPipeFaceLandmarker", + "cnr_id": "comfy-core", + "ver": "0.22.0", + "models": [ + { + "name": "mediapipe_face_fp32.safetensors", + 
"url": "https://huggingface.co/Comfy-Org/mediapipe/resolve/main/detection/mediapipe_face_fp32.safetensors", + "directory": "detection" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "mediapipe_face_fp32.safetensors" + ] + }, + { + "id": 20, + "type": "MediaPipeFaceMask", + "pos": [ + -290, + 4560 + ], + "size": [ + 360, + 180 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "face_landmarks", + "name": "face_landmarks", + "type": "FACE_LANDMARKS", + "link": 46 + }, + { + "localized_name": "regions", + "name": "regions", + "type": "COMFY_DYNAMICCOMBO_V3", + "widget": { + "name": "regions" + }, + "link": null + }, + { + "localized_name": "regions.face_oval", + "name": "regions.face_oval", + "type": "BOOLEAN", + "widget": { + "name": "regions.face_oval" + }, + "link": 77 + }, + { + "localized_name": "regions.lips", + "name": "regions.lips", + "type": "BOOLEAN", + "widget": { + "name": "regions.lips" + }, + "link": 78 + }, + { + "localized_name": "regions.left_eye", + "name": "regions.left_eye", + "type": "BOOLEAN", + "widget": { + "name": "regions.left_eye" + }, + "link": 79 + }, + { + "localized_name": "regions.right_eye", + "name": "regions.right_eye", + "type": "BOOLEAN", + "widget": { + "name": "regions.right_eye" + }, + "link": 80 + }, + { + "localized_name": "regions.irises", + "name": "regions.irises", + "type": "BOOLEAN", + "widget": { + "name": "regions.irises" + }, + "link": 81 + } + ], + "outputs": [ + { + "localized_name": "MASK", + "name": "MASK", + "type": "MASK", + "links": [ + 83 + ] + } + ], + "properties": { + "Node name for S&R": "MediaPipeFaceMask", + "cnr_id": "comfy-core", + "ver": "0.22.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + 
}, + "widgets_values": [ + "custom", + true, + false, + false, + false, + false + ] + } + ], + "groups": [], + "links": [ + { + "id": 66, + "origin_id": 2, + "origin_slot": 0, + "target_id": 11, + "target_slot": 0, + "type": "FACE_DETECTION_MODEL" + }, + { + "id": 46, + "origin_id": 11, + "origin_slot": 0, + "target_id": 20, + "target_slot": 0, + "type": "FACE_LANDMARKS" + }, + { + "id": 60, + "origin_id": -10, + "origin_slot": 0, + "target_id": 11, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 44, + "origin_id": 11, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "FACE_LANDMARKS" + }, + { + "id": 25, + "origin_id": 11, + "origin_slot": 1, + "target_id": -20, + "target_slot": 1, + "type": "BOUNDING_BOX" + }, + { + "id": 74, + "origin_id": -10, + "origin_slot": 1, + "target_id": 11, + "target_slot": 6, + "type": "FACE_LANDMARKER" + }, + { + "id": 75, + "origin_id": -10, + "origin_slot": 2, + "target_id": 11, + "target_slot": 2, + "type": "COMBO" + }, + { + "id": 76, + "origin_id": -10, + "origin_slot": 3, + "target_id": 11, + "target_slot": 3, + "type": "INT" + }, + { + "id": 77, + "origin_id": -10, + "origin_slot": 4, + "target_id": 20, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 78, + "origin_id": -10, + "origin_slot": 5, + "target_id": 20, + "target_slot": 3, + "type": "BOOLEAN" + }, + { + "id": 79, + "origin_id": -10, + "origin_slot": 6, + "target_id": 20, + "target_slot": 4, + "type": "BOOLEAN" + }, + { + "id": 80, + "origin_id": -10, + "origin_slot": 7, + "target_id": 20, + "target_slot": 5, + "type": "BOOLEAN" + }, + { + "id": 81, + "origin_id": -10, + "origin_slot": 8, + "target_id": 20, + "target_slot": 6, + "type": "BOOLEAN" + }, + { + "id": 83, + "origin_id": 20, + "origin_slot": 0, + "target_id": -20, + "target_slot": 2, + "type": "MASK" + }, + { + "id": 86, + "origin_id": -10, + "origin_slot": 9, + "target_id": 2, + "target_slot": 0, + "type": "COMBO" + } + ], + "extra": {}, + "category": "Conditioning & 
Preprocessors/Face Detection" + } + ] + }, + "extra": {} +} \ No newline at end of file diff --git a/blueprints/Image Segmentation (SAM3).json b/blueprints/Image Segmentation (SAM3).json index b405bf623..a2ef40ac8 100644 --- a/blueprints/Image Segmentation (SAM3).json +++ b/blueprints/Image Segmentation (SAM3).json @@ -703,7 +703,7 @@ } ], "extra": {}, - "category": "Image Tools/Image Segmentation", + "category": "Conditioning & Preprocessors/Segmentation & Mask", "description": "Segments images into masks using Meta SAM3 from text prompts, points, or boxes." } ] diff --git a/blueprints/Image Upscale(Z-image-Turbo).json b/blueprints/Image Upscale(Z-image-Turbo).json index bd803a0b1..25d2838a8 100644 --- a/blueprints/Image Upscale(Z-image-Turbo).json +++ b/blueprints/Image Upscale(Z-image-Turbo).json @@ -1302,7 +1302,7 @@ "extra": { "workflowRendererVersion": "LG" }, - "category": "Image generation and editing/Enhance", + "category": "Image generation and editing/Upscale", "description": "Upscales images to higher resolution using Z-Image-Turbo." 
} ] @@ -1312,4 +1312,4 @@ "workflowRendererVersion": "LG" }, "version": 0.4 -} +} \ No newline at end of file diff --git a/blueprints/Image to Pose Map (SDPose Multi-Person).json b/blueprints/Image to Pose Map (SDPose Multi-Person).json new file mode 100644 index 000000000..38df20775 --- /dev/null +++ b/blueprints/Image to Pose Map (SDPose Multi-Person).json @@ -0,0 +1,1206 @@ +{ + "revision": 0, + "last_node_id": 675, + "last_link_id": 0, + "nodes": [ + { + "id": 675, + "type": "01b6a731-fb78-4070-9a38-c87146da9604", + "pos": [ + -2480, + 3400 + ], + "size": [ + 370, + 590.625 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "localized_name": "input", + "name": "input", + "type": "IMAGE,MASK", + "link": null + }, + { + "label": "resize_target_longer_size", + "name": "resize_type.longer_size", + "type": "INT", + "widget": { + "name": "resize_type.longer_size" + }, + "link": null + }, + { + "name": "scale_method", + "type": "COMBO", + "widget": { + "name": "scale_method" + }, + "link": null + }, + { + "name": "draw_body", + "type": "BOOLEAN", + "widget": { + "name": "draw_body" + }, + "link": null + }, + { + "name": "draw_hands", + "type": "BOOLEAN", + "widget": { + "name": "draw_hands" + }, + "link": null + }, + { + "name": "draw_face", + "type": "BOOLEAN", + "widget": { + "name": "draw_face" + }, + "link": null + }, + { + "name": "draw_feet", + "type": "BOOLEAN", + "widget": { + "name": "draw_feet" + }, + "link": null + }, + { + "name": "stick_width", + "type": "INT", + "widget": { + "name": "stick_width" + }, + "link": null + }, + { + "name": "face_point_size", + "type": "INT", + "widget": { + "name": "face_point_size" + }, + "link": null + }, + { + "name": "score_threshold", + "type": "FLOAT", + "widget": { + "name": "score_threshold" + }, + "link": null + }, + { + "label": "detect_threshold", + "name": "threshold", + "type": "FLOAT", + "widget": { + "name": "threshold" + }, + "link": null + }, + { + "label": "detect_class", + "name": 
"class_name", + "type": "COMBO", + "widget": { + "name": "class_name" + }, + "link": null + }, + { + "name": "max_detections", + "type": "INT", + "widget": { + "name": "max_detections" + }, + "link": null + }, + { + "name": "ckpt_name", + "type": "COMBO", + "widget": { + "name": "ckpt_name" + }, + "link": null + }, + { + "name": "unet_name", + "type": "COMBO", + "widget": { + "name": "unet_name" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [] + }, + { + "name": "keypoints", + "type": "POSE_KEYPOINT", + "links": null + }, + { + "name": "bboxes", + "type": "BOUNDING_BOX", + "links": [] + } + ], + "properties": { + "proxyWidgets": [ + [ + "674", + "resize_type.longer_size" + ], + [ + "674", + "scale_method" + ], + [ + "672", + "draw_body" + ], + [ + "672", + "draw_hands" + ], + [ + "672", + "draw_face" + ], + [ + "672", + "draw_feet" + ], + [ + "672", + "stick_width" + ], + [ + "672", + "face_point_size" + ], + [ + "672", + "score_threshold" + ], + [ + "678", + "threshold" + ], + [ + "678", + "class_name" + ], + [ + "678", + "max_detections" + ], + [ + "673", + "ckpt_name" + ], + [ + "677", + "unet_name" + ] + ], + "cnr_id": "comfy-core", + "ver": "0.15.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [], + "title": "Image to Pose Map (SDPose Multi-Person)" + } + ], + "links": [], + "version": 0.4, + "definitions": { + "subgraphs": [ + { + "id": "01b6a731-fb78-4070-9a38-c87146da9604", + "version": 1, + "state": { + "lastGroupId": 2, + "lastNodeId": 691, + "lastLinkId": 1740, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Image to Pose Map (SDPose Multi-Person)", + "inputNode": { + "id": -10, + "bounding": [ + -3350, + 3410, + 190.8984375, + 348 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + -1840, + 3570, + 128, 
+ 108 + ] + }, + "inputs": [ + { + "id": "e24699c3-1356-4634-9eb4-19bb58e5c0b0", + "name": "input", + "type": "IMAGE,MASK", + "linkIds": [ + 1700 + ], + "localized_name": "input", + "pos": [ + -3183.1015625, + 3434 + ] + }, + { + "id": "088eefc1-cd8a-4573-993f-9e4da008a12d", + "name": "resize_type.longer_size", + "type": "INT", + "linkIds": [ + 1704 + ], + "label": "resize_target_longer_size", + "pos": [ + -3183.1015625, + 3454 + ] + }, + { + "id": "b6449bd3-73d4-41c8-b81f-cf8d33f76a2e", + "name": "scale_method", + "type": "COMBO", + "linkIds": [ + 1705 + ], + "pos": [ + -3183.1015625, + 3474 + ] + }, + { + "id": "4cff52ad-ed07-4c97-8803-fcbd89554fd0", + "name": "draw_body", + "type": "BOOLEAN", + "linkIds": [ + 1706 + ], + "pos": [ + -3183.1015625, + 3494 + ] + }, + { + "id": "7af63dce-f7df-4d7e-8215-d7c7f60bf81c", + "name": "draw_hands", + "type": "BOOLEAN", + "linkIds": [ + 1707 + ], + "pos": [ + -3183.1015625, + 3514 + ] + }, + { + "id": "af3a9bce-61f9-4aca-b530-9f65e028b35e", + "name": "draw_face", + "type": "BOOLEAN", + "linkIds": [ + 1708 + ], + "pos": [ + -3183.1015625, + 3534 + ] + }, + { + "id": "4620f6a3-2c85-4b79-ad8f-35d0326b568f", + "name": "draw_feet", + "type": "BOOLEAN", + "linkIds": [ + 1709 + ], + "pos": [ + -3183.1015625, + 3554 + ] + }, + { + "id": "fee5d0c9-8d4b-4934-81d8-ba2206dc56cb", + "name": "stick_width", + "type": "INT", + "linkIds": [ + 1710 + ], + "pos": [ + -3183.1015625, + 3574 + ] + }, + { + "id": "aafdd060-ba81-4324-a9cc-b656e1ebc133", + "name": "face_point_size", + "type": "INT", + "linkIds": [ + 1711 + ], + "pos": [ + -3183.1015625, + 3594 + ] + }, + { + "id": "514c5503-f9e6-4d23-b1ae-1d3291acb2a3", + "name": "score_threshold", + "type": "FLOAT", + "linkIds": [ + 1712 + ], + "pos": [ + -3183.1015625, + 3614 + ] + }, + { + "id": "4eb3e4ea-7a36-4511-8483-0d12aadd32f7", + "name": "threshold", + "type": "FLOAT", + "linkIds": [ + 1718 + ], + "label": "detect_threshold", + "pos": [ + -3183.1015625, + 3634 + ] + }, + { + "id": 
"c76a7a05-81e6-4b17-a9e0-85f47a5844f2", + "name": "class_name", + "type": "COMBO", + "linkIds": [ + 1719 + ], + "label": "detect_class", + "pos": [ + -3183.1015625, + 3654 + ] + }, + { + "id": "4417e988-6e80-4236-be31-4c179037f5a2", + "name": "max_detections", + "type": "INT", + "linkIds": [ + 1720 + ], + "pos": [ + -3183.1015625, + 3674 + ] + }, + { + "id": "7d7c4a0b-0d1b-4c98-942b-f90548d2a492", + "name": "ckpt_name", + "type": "COMBO", + "linkIds": [ + 1721 + ], + "pos": [ + -3183.1015625, + 3694 + ] + }, + { + "id": "4d75122c-2c14-452a-98fe-d1545d3e012a", + "name": "unet_name", + "type": "COMBO", + "linkIds": [ + 1722 + ], + "pos": [ + -3183.1015625, + 3714 + ] + } + ], + "outputs": [ + { + "id": "f05ed8cc-9403-4f14-8085-4364b06f8a48", + "name": "IMAGE", + "type": "IMAGE", + "linkIds": [ + 1701 + ], + "localized_name": "IMAGE", + "pos": [ + -1816, + 3594 + ] + }, + { + "id": "4b64118e-3cef-4eeb-9dad-4cd09cfd63a2", + "name": "keypoints", + "type": "POSE_KEYPOINT", + "linkIds": [ + 1725 + ], + "pos": [ + -1816, + 3614 + ] + }, + { + "id": "a27f7e34-dcbc-4fb0-a4e1-2c5fc423ca5f", + "name": "bboxes", + "type": "BOUNDING_BOX", + "linkIds": [ + 1726 + ], + "pos": [ + -1816, + 3634 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 671, + "type": "SDPoseKeypointExtractor", + "pos": [ + -2550, + 3080 + ], + "size": [ + 270, + 180 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 1696 + }, + { + "localized_name": "vae", + "name": "vae", + "type": "VAE", + "link": 1697 + }, + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 1698 + }, + { + "localized_name": "bboxes", + "name": "bboxes", + "shape": 7, + "type": "BOUNDING_BOX", + "link": 1717 + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "widget": { + "name": "batch_size" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "keypoints", + "name": 
"keypoints", + "type": "POSE_KEYPOINT", + "links": [ + 1699, + 1725 + ] + } + ], + "properties": { + "Node name for S&R": "SDPoseKeypointExtractor", + "cnr_id": "comfy-core", + "ver": "0.15.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 16 + ] + }, + { + "id": 674, + "type": "ResizeImageMaskNode", + "pos": [ + -2970, + 3580 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "input", + "name": "input", + "type": "IMAGE,MASK", + "link": 1700 + }, + { + "localized_name": "resize_type", + "name": "resize_type", + "type": "COMFY_DYNAMICCOMBO_V3", + "widget": { + "name": "resize_type" + }, + "link": null + }, + { + "localized_name": "resize_type.longer_size", + "name": "resize_type.longer_size", + "type": "INT", + "widget": { + "name": "resize_type.longer_size" + }, + "link": 1704 + }, + { + "localized_name": "scale_method", + "name": "scale_method", + "type": "COMBO", + "widget": { + "name": "scale_method" + }, + "link": 1705 + } + ], + "outputs": [ + { + "localized_name": "resized", + "name": "resized", + "type": "*", + "links": [ + 1698, + 1716 + ] + } + ], + "properties": { + "Node name for S&R": "ResizeImageMaskNode", + "cnr_id": "comfy-core", + "ver": "0.15.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "scale longer dimension", + 1024, + "lanczos" + ] + }, + { + "id": 672, + "type": "SDPoseDrawKeypoints", + "pos": [ + -2540, + 3590 + ], + "size": [ + 270, + 280 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "keypoints", + "name": "keypoints", + "type": "POSE_KEYPOINT", + "link": 1699 + }, + { + "localized_name": "draw_body", + "name": "draw_body", + "type": 
"BOOLEAN", + "widget": { + "name": "draw_body" + }, + "link": 1706 + }, + { + "localized_name": "draw_hands", + "name": "draw_hands", + "type": "BOOLEAN", + "widget": { + "name": "draw_hands" + }, + "link": 1707 + }, + { + "localized_name": "draw_face", + "name": "draw_face", + "type": "BOOLEAN", + "widget": { + "name": "draw_face" + }, + "link": 1708 + }, + { + "localized_name": "draw_feet", + "name": "draw_feet", + "type": "BOOLEAN", + "widget": { + "name": "draw_feet" + }, + "link": 1709 + }, + { + "localized_name": "stick_width", + "name": "stick_width", + "type": "INT", + "widget": { + "name": "stick_width" + }, + "link": 1710 + }, + { + "localized_name": "face_point_size", + "name": "face_point_size", + "type": "INT", + "widget": { + "name": "face_point_size" + }, + "link": 1711 + }, + { + "localized_name": "score_threshold", + "name": "score_threshold", + "type": "FLOAT", + "widget": { + "name": "score_threshold" + }, + "link": 1712 + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 1701 + ] + } + ], + "properties": { + "Node name for S&R": "SDPoseDrawKeypoints", + "cnr_id": "comfy-core", + "ver": "0.15.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + true, + true, + true, + true, + 4, + 2, + 0.5 + ] + }, + { + "id": 673, + "type": "CheckpointLoaderSimple", + "pos": [ + -3040, + 3080 + ], + "size": [ + 390, + 190 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "ckpt_name", + "name": "ckpt_name", + "type": "COMBO", + "widget": { + "name": "ckpt_name" + }, + "link": 1721 + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": "MODEL", + "links": [ + 1696 + ] + }, + { + "localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "links": [] + }, + { + "localized_name": "VAE", + "name": 
"VAE", + "type": "VAE", + "links": [ + 1697 + ] + } + ], + "properties": { + "Node name for S&R": "CheckpointLoaderSimple", + "cnr_id": "comfy-core", + "ver": "0.15.0", + "models": [ + { + "name": "sdpose_wholebody_fp16.safetensors", + "url": "https://huggingface.co/Comfy-Org/SDPose/resolve/main/checkpoints/sdpose_wholebody_fp16.safetensors", + "directory": "checkpoints" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "sdpose_wholebody_fp16.safetensors" + ] + }, + { + "id": 677, + "type": "UNETLoader", + "pos": [ + -3030, + 3330 + ], + "size": [ + 370, + 140 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "localized_name": "unet_name", + "name": "unet_name", + "type": "COMBO", + "widget": { + "name": "unet_name" + }, + "link": 1722 + }, + { + "localized_name": "weight_dtype", + "name": "weight_dtype", + "type": "COMBO", + "widget": { + "name": "weight_dtype" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": "MODEL", + "links": [ + 1715 + ] + } + ], + "properties": { + "Node name for S&R": "UNETLoader", + "cnr_id": "comfy-core", + "ver": "0.14.1", + "models": [ + { + "name": "rt_detr_v4-x-hgnet_fp16.safetensors", + "url": "https://huggingface.co/Comfy-Org/SDPose/resolve/main/diffusion_models/rt_detr_v4-x-hgnet_fp16.safetensors", + "directory": "diffusion_models" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "rt_detr_v4-x-hgnet_fp16.safetensors", + "default" + ] + }, + { + "id": 678, + "type": "RTDETR_detect", + "pos": [ + -2540, + 3320 + ], + "size": [ + 270, + 200 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "label": "model", + "localized_name": "model", + 
"name": "model", + "type": "MODEL", + "link": 1715 + }, + { + "label": "image", + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 1716 + }, + { + "localized_name": "threshold", + "name": "threshold", + "type": "FLOAT", + "widget": { + "name": "threshold" + }, + "link": 1718 + }, + { + "localized_name": "class_name", + "name": "class_name", + "type": "COMBO", + "widget": { + "name": "class_name" + }, + "link": 1719 + }, + { + "localized_name": "max_detections", + "name": "max_detections", + "type": "INT", + "widget": { + "name": "max_detections" + }, + "link": 1720 + } + ], + "outputs": [ + { + "localized_name": "bboxes", + "name": "bboxes", + "type": "BOUNDING_BOX", + "links": [ + 1717, + 1726 + ] + } + ], + "properties": { + "Node name for S&R": "RTDETR_detect", + "cnr_id": "comfy-core", + "ver": "0.15.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 0.5, + "person", + 1 + ] + } + ], + "groups": [], + "links": [ + { + "id": 1696, + "origin_id": 673, + "origin_slot": 0, + "target_id": 671, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 1697, + "origin_id": 673, + "origin_slot": 2, + "target_id": 671, + "target_slot": 1, + "type": "VAE" + }, + { + "id": 1698, + "origin_id": 674, + "origin_slot": 0, + "target_id": 671, + "target_slot": 2, + "type": "IMAGE" + }, + { + "id": 1699, + "origin_id": 671, + "origin_slot": 0, + "target_id": 672, + "target_slot": 0, + "type": "POSE_KEYPOINT" + }, + { + "id": 1700, + "origin_id": -10, + "origin_slot": 0, + "target_id": 674, + "target_slot": 0, + "type": "IMAGE,MASK" + }, + { + "id": 1701, + "origin_id": 672, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 1704, + "origin_id": -10, + "origin_slot": 1, + "target_id": 674, + "target_slot": 2, + "type": "INT" + }, + { + "id": 1705, + "origin_id": -10, + 
"origin_slot": 2, + "target_id": 674, + "target_slot": 3, + "type": "COMBO" + }, + { + "id": 1706, + "origin_id": -10, + "origin_slot": 3, + "target_id": 672, + "target_slot": 1, + "type": "BOOLEAN" + }, + { + "id": 1707, + "origin_id": -10, + "origin_slot": 4, + "target_id": 672, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 1708, + "origin_id": -10, + "origin_slot": 5, + "target_id": 672, + "target_slot": 3, + "type": "BOOLEAN" + }, + { + "id": 1709, + "origin_id": -10, + "origin_slot": 6, + "target_id": 672, + "target_slot": 4, + "type": "BOOLEAN" + }, + { + "id": 1710, + "origin_id": -10, + "origin_slot": 7, + "target_id": 672, + "target_slot": 5, + "type": "INT" + }, + { + "id": 1711, + "origin_id": -10, + "origin_slot": 8, + "target_id": 672, + "target_slot": 6, + "type": "INT" + }, + { + "id": 1712, + "origin_id": -10, + "origin_slot": 9, + "target_id": 672, + "target_slot": 7, + "type": "FLOAT" + }, + { + "id": 1715, + "origin_id": 677, + "origin_slot": 0, + "target_id": 678, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 1716, + "origin_id": 674, + "origin_slot": 0, + "target_id": 678, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 1717, + "origin_id": 678, + "origin_slot": 0, + "target_id": 671, + "target_slot": 3, + "type": "BOUNDING_BOX" + }, + { + "id": 1718, + "origin_id": -10, + "origin_slot": 10, + "target_id": 678, + "target_slot": 2, + "type": "FLOAT" + }, + { + "id": 1719, + "origin_id": -10, + "origin_slot": 11, + "target_id": 678, + "target_slot": 3, + "type": "COMBO" + }, + { + "id": 1720, + "origin_id": -10, + "origin_slot": 12, + "target_id": 678, + "target_slot": 4, + "type": "INT" + }, + { + "id": 1721, + "origin_id": -10, + "origin_slot": 13, + "target_id": 673, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 1722, + "origin_id": -10, + "origin_slot": 14, + "target_id": 677, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 1725, + "origin_id": 671, + "origin_slot": 0, + "target_id": -20, + 
"target_slot": 1, + "type": "POSE_KEYPOINT" + }, + { + "id": 1726, + "origin_id": 678, + "origin_slot": 0, + "target_id": -20, + "target_slot": 2, + "type": "BOUNDING_BOX" + } + ], + "extra": { + "workflowRendererVersion": "LG" + }, + "category": "Conditioning & Preprocessors/Pose", + "description": "Detects multiple people in an image and outputs per-person pose keypoints, skeleton renders, and bounding boxes using SDPose." + } + ] + }, + "extra": {} +} \ No newline at end of file diff --git a/blueprints/Image to Pose Map (SDPose-OOD).json b/blueprints/Image to Pose Map (SDPose-OOD).json new file mode 100644 index 000000000..76ee9ff4e --- /dev/null +++ b/blueprints/Image to Pose Map (SDPose-OOD).json @@ -0,0 +1,888 @@ +{ + "revision": 0, + "last_node_id": 675, + "last_link_id": 0, + "nodes": [ + { + "id": 675, + "type": "01b6a731-fb78-4070-9a38-c87146da9604", + "pos": [ + -2480, + 3400 + ], + "size": [ + 360, + 433.3125 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "input", + "name": "input", + "type": "IMAGE,MASK", + "link": null + }, + { + "label": "resize_target_longer_size", + "name": "resize_type.longer_size", + "type": "INT", + "widget": { + "name": "resize_type.longer_size" + }, + "link": null + }, + { + "name": "scale_method", + "type": "COMBO", + "widget": { + "name": "scale_method" + }, + "link": null + }, + { + "name": "draw_body", + "type": "BOOLEAN", + "widget": { + "name": "draw_body" + }, + "link": null + }, + { + "name": "draw_hands", + "type": "BOOLEAN", + "widget": { + "name": "draw_hands" + }, + "link": null + }, + { + "name": "draw_face", + "type": "BOOLEAN", + "widget": { + "name": "draw_face" + }, + "link": null + }, + { + "name": "draw_feet", + "type": "BOOLEAN", + "widget": { + "name": "draw_feet" + }, + "link": null + }, + { + "name": "stick_width", + "type": "INT", + "widget": { + "name": "stick_width" + }, + "link": null + }, + { + "name": "face_point_size", + "type": "INT", + "widget": { + "name": 
"face_point_size" + }, + "link": null + }, + { + "name": "score_threshold", + "type": "FLOAT", + "widget": { + "name": "score_threshold" + }, + "link": null + }, + { + "name": "ckpt_name", + "type": "COMBO", + "widget": { + "name": "ckpt_name" + }, + "link": null + }, + { + "name": "bboxes", + "shape": 7, + "type": "BOUNDING_BOX", + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [] + }, + { + "name": "keypoints", + "type": "POSE_KEYPOINT", + "links": null + } + ], + "properties": { + "proxyWidgets": [ + [ + "674", + "resize_type.longer_size" + ], + [ + "674", + "scale_method" + ], + [ + "672", + "draw_body" + ], + [ + "672", + "draw_hands" + ], + [ + "672", + "draw_face" + ], + [ + "672", + "draw_feet" + ], + [ + "672", + "stick_width" + ], + [ + "672", + "face_point_size" + ], + [ + "672", + "score_threshold" + ], + [ + "673", + "ckpt_name" + ] + ], + "cnr_id": "comfy-core", + "ver": "0.15.1", + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [], + "title": "Image to Pose Map (SDPose-OOD)" + } + ], + "links": [], + "version": 0.4, + "definitions": { + "subgraphs": [ + { + "id": "01b6a731-fb78-4070-9a38-c87146da9604", + "version": 1, + "state": { + "lastGroupId": 0, + "lastNodeId": 676, + "lastLinkId": 1715, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Image to Pose Map (SDPose-OOD)", + "inputNode": { + "id": -10, + "bounding": [ + -3290, + 3590, + 190.8984375, + 288 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + -1756.2451602089645, + 3366, + 128, + 88 + ] + }, + "inputs": [ + { + "id": "e24699c3-1356-4634-9eb4-19bb58e5c0b0", + "name": "input", + "type": "IMAGE,MASK", + "linkIds": [ + 1700 + ], + "localized_name": "input", + "pos": [ + -3123.1015625, + 3614 + ] + }, + { + "id": "088eefc1-cd8a-4573-993f-9e4da008a12d", + "name": "resize_type.longer_size", + "type": "INT", + 
"linkIds": [ + 1704 + ], + "label": "resize_target_longer_size", + "pos": [ + -3123.1015625, + 3634 + ] + }, + { + "id": "b6449bd3-73d4-41c8-b81f-cf8d33f76a2e", + "name": "scale_method", + "type": "COMBO", + "linkIds": [ + 1705 + ], + "pos": [ + -3123.1015625, + 3654 + ] + }, + { + "id": "4cff52ad-ed07-4c97-8803-fcbd89554fd0", + "name": "draw_body", + "type": "BOOLEAN", + "linkIds": [ + 1706 + ], + "pos": [ + -3123.1015625, + 3674 + ] + }, + { + "id": "7af63dce-f7df-4d7e-8215-d7c7f60bf81c", + "name": "draw_hands", + "type": "BOOLEAN", + "linkIds": [ + 1707 + ], + "pos": [ + -3123.1015625, + 3694 + ] + }, + { + "id": "af3a9bce-61f9-4aca-b530-9f65e028b35e", + "name": "draw_face", + "type": "BOOLEAN", + "linkIds": [ + 1708 + ], + "pos": [ + -3123.1015625, + 3714 + ] + }, + { + "id": "4620f6a3-2c85-4b79-ad8f-35d0326b568f", + "name": "draw_feet", + "type": "BOOLEAN", + "linkIds": [ + 1709 + ], + "pos": [ + -3123.1015625, + 3734 + ] + }, + { + "id": "fee5d0c9-8d4b-4934-81d8-ba2206dc56cb", + "name": "stick_width", + "type": "INT", + "linkIds": [ + 1710 + ], + "pos": [ + -3123.1015625, + 3754 + ] + }, + { + "id": "aafdd060-ba81-4324-a9cc-b656e1ebc133", + "name": "face_point_size", + "type": "INT", + "linkIds": [ + 1711 + ], + "pos": [ + -3123.1015625, + 3774 + ] + }, + { + "id": "514c5503-f9e6-4d23-b1ae-1d3291acb2a3", + "name": "score_threshold", + "type": "FLOAT", + "linkIds": [ + 1712 + ], + "pos": [ + -3123.1015625, + 3794 + ] + }, + { + "id": "ae46de61-2cc6-483e-8ee9-87e4144a2ffa", + "name": "ckpt_name", + "type": "COMBO", + "linkIds": [ + 1713 + ], + "pos": [ + -3123.1015625, + 3814 + ] + }, + { + "id": "41bec0c6-dffa-4c78-9289-ee678715ae54", + "name": "bboxes", + "type": "BOUNDING_BOX", + "linkIds": [ + 1714 + ], + "pos": [ + -3123.1015625, + 3834 + ] + } + ], + "outputs": [ + { + "id": "f05ed8cc-9403-4f14-8085-4364b06f8a48", + "name": "IMAGE", + "type": "IMAGE", + "linkIds": [ + 1701 + ], + "localized_name": "IMAGE", + "pos": [ + -1732.2451602089645, + 3390 + ] + }, 
+ { + "id": "29a6584e-4685-4986-8ffd-e6d8539953fd", + "name": "keypoints", + "type": "POSE_KEYPOINT", + "linkIds": [ + 1715 + ], + "pos": [ + -1732.2451602089645, + 3410 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 671, + "type": "SDPoseKeypointExtractor", + "pos": [ + -2470, + 3250 + ], + "size": [ + 270, + 180 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 1696 + }, + { + "localized_name": "vae", + "name": "vae", + "type": "VAE", + "link": 1697 + }, + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 1698 + }, + { + "localized_name": "bboxes", + "name": "bboxes", + "shape": 7, + "type": "BOUNDING_BOX", + "link": 1714 + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "widget": { + "name": "batch_size" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "keypoints", + "name": "keypoints", + "type": "POSE_KEYPOINT", + "links": [ + 1699, + 1715 + ] + } + ], + "properties": { + "Node name for S&R": "SDPoseKeypointExtractor", + "cnr_id": "comfy-core", + "ver": "0.15.0", + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + 16 + ] + }, + { + "id": 674, + "type": "ResizeImageMaskNode", + "pos": [ + -2960, + 3490 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "input", + "name": "input", + "type": "IMAGE,MASK", + "link": 1700 + }, + { + "localized_name": "resize_type", + "name": "resize_type", + "type": "COMFY_DYNAMICCOMBO_V3", + "widget": { + "name": "resize_type" + }, + "link": null + }, + { + "localized_name": "resize_type.longer_size", + "name": "resize_type.longer_size", + "type": "INT", + "widget": { + "name": "resize_type.longer_size" + }, + "link": 1704 + }, + { + "localized_name": "scale_method", + "name": "scale_method", + "type": 
"COMBO", + "widget": { + "name": "scale_method" + }, + "link": 1705 + } + ], + "outputs": [ + { + "localized_name": "resized", + "name": "resized", + "type": "*", + "links": [ + 1698 + ] + } + ], + "properties": { + "Node name for S&R": "ResizeImageMaskNode", + "cnr_id": "comfy-core", + "ver": "0.15.0", + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + "scale longer dimension", + 1024, + "area" + ] + }, + { + "id": 672, + "type": "SDPoseDrawKeypoints", + "pos": [ + -2120, + 3260 + ], + "size": [ + 270, + 280 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "keypoints", + "name": "keypoints", + "type": "POSE_KEYPOINT", + "link": 1699 + }, + { + "localized_name": "draw_body", + "name": "draw_body", + "type": "BOOLEAN", + "widget": { + "name": "draw_body" + }, + "link": 1706 + }, + { + "localized_name": "draw_hands", + "name": "draw_hands", + "type": "BOOLEAN", + "widget": { + "name": "draw_hands" + }, + "link": 1707 + }, + { + "localized_name": "draw_face", + "name": "draw_face", + "type": "BOOLEAN", + "widget": { + "name": "draw_face" + }, + "link": 1708 + }, + { + "localized_name": "draw_feet", + "name": "draw_feet", + "type": "BOOLEAN", + "widget": { + "name": "draw_feet" + }, + "link": 1709 + }, + { + "localized_name": "stick_width", + "name": "stick_width", + "type": "INT", + "widget": { + "name": "stick_width" + }, + "link": 1710 + }, + { + "localized_name": "face_point_size", + "name": "face_point_size", + "type": "INT", + "widget": { + "name": "face_point_size" + }, + "link": 1711 + }, + { + "localized_name": "score_threshold", + "name": "score_threshold", + "type": "FLOAT", + "widget": { + "name": "score_threshold" + }, + "link": 1712 + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 1701 + ] + } + ], + "properties": { + "Node name for S&R": "SDPoseDrawKeypoints", + "cnr_id": 
"comfy-core", + "ver": "0.15.0", + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + true, + true, + true, + true, + 4, + 2, + 0.5 + ] + }, + { + "id": 673, + "type": "CheckpointLoaderSimple", + "pos": [ + -2960, + 3250 + ], + "size": [ + 390, + 190 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "ckpt_name", + "name": "ckpt_name", + "type": "COMBO", + "widget": { + "name": "ckpt_name" + }, + "link": 1713 + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": "MODEL", + "links": [ + 1696 + ] + }, + { + "localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "links": [] + }, + { + "localized_name": "VAE", + "name": "VAE", + "type": "VAE", + "links": [ + 1697 + ] + } + ], + "properties": { + "Node name for S&R": "CheckpointLoaderSimple", + "cnr_id": "comfy-core", + "ver": "0.15.0", + "models": [ + { + "name": "sdpose_wholebody_fp16.safetensors", + "url": "https://huggingface.co/Comfy-Org/SDPose/resolve/main/checkpoints/sdpose_wholebody_fp16.safetensors", + "directory": "checkpoints" + } + ], + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + "sdpose_wholebody_fp16.safetensors" + ] + } + ], + "groups": [], + "links": [ + { + "id": 1696, + "origin_id": 673, + "origin_slot": 0, + "target_id": 671, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 1697, + "origin_id": 673, + "origin_slot": 2, + "target_id": 671, + "target_slot": 1, + "type": "VAE" + }, + { + "id": 1698, + "origin_id": 674, + "origin_slot": 0, + "target_id": 671, + "target_slot": 2, + "type": "IMAGE" + }, + { + "id": 1699, + "origin_id": 671, + "origin_slot": 0, + "target_id": 672, + "target_slot": 0, + "type": "POSE_KEYPOINT" + }, + { + "id": 1700, + "origin_id": -10, + "origin_slot": 0, + "target_id": 674, + "target_slot": 0, + "type": "IMAGE,MASK" + }, + { 
+ "id": 1701, + "origin_id": 672, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 1704, + "origin_id": -10, + "origin_slot": 1, + "target_id": 674, + "target_slot": 2, + "type": "INT" + }, + { + "id": 1705, + "origin_id": -10, + "origin_slot": 2, + "target_id": 674, + "target_slot": 3, + "type": "COMBO" + }, + { + "id": 1706, + "origin_id": -10, + "origin_slot": 3, + "target_id": 672, + "target_slot": 1, + "type": "BOOLEAN" + }, + { + "id": 1707, + "origin_id": -10, + "origin_slot": 4, + "target_id": 672, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 1708, + "origin_id": -10, + "origin_slot": 5, + "target_id": 672, + "target_slot": 3, + "type": "BOOLEAN" + }, + { + "id": 1709, + "origin_id": -10, + "origin_slot": 6, + "target_id": 672, + "target_slot": 4, + "type": "BOOLEAN" + }, + { + "id": 1710, + "origin_id": -10, + "origin_slot": 7, + "target_id": 672, + "target_slot": 5, + "type": "INT" + }, + { + "id": 1711, + "origin_id": -10, + "origin_slot": 8, + "target_id": 672, + "target_slot": 6, + "type": "INT" + }, + { + "id": 1712, + "origin_id": -10, + "origin_slot": 9, + "target_id": 672, + "target_slot": 7, + "type": "FLOAT" + }, + { + "id": 1713, + "origin_id": -10, + "origin_slot": 10, + "target_id": 673, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 1714, + "origin_id": -10, + "origin_slot": 11, + "target_id": 671, + "target_slot": 3, + "type": "BOUNDING_BOX" + }, + { + "id": 1715, + "origin_id": 671, + "origin_slot": 0, + "target_id": -20, + "target_slot": 1, + "type": "POSE_KEYPOINT" + } + ], + "extra": { + "workflowRendererVersion": "LG" + }, + "category": "Conditioning & Preprocessors/Pose", + "description": "Extracts human pose keypoints and stick-figure visuals from an image using SDPose-OOD, with optional bounding-box input per subject." 
+ } + ] + }, + "extra": { + "ue_links": [] + } +} \ No newline at end of file diff --git a/blueprints/Merge Videos.json b/blueprints/Merge Videos.json new file mode 100644 index 000000000..689e6ec16 --- /dev/null +++ b/blueprints/Merge Videos.json @@ -0,0 +1,1219 @@ +{ + "revision": 0, + "last_node_id": 26, + "last_link_id": 0, + "nodes": [ + { + "id": 26, + "type": "32e6dbcc-e2d7-45c0-a245-fc74b8271dfb", + "pos": [ + -980, + 480 + ], + "size": [ + 290, + 190 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "label": "base_video", + "localized_name": "clip_to_resize", + "name": "clip_to_resize", + "type": "VIDEO", + "link": null + }, + { + "label": "second_video", + "localized_name": "base_video", + "name": "base_video", + "type": "VIDEO", + "link": null + }, + { + "label": "pad_second_video", + "localized_name": "pad_second_video", + "name": "pad_second_video", + "type": "BOOLEAN", + "widget": { + "name": "pad_second_video" + }, + "link": null + }, + { + "name": "interpolation", + "type": "COMBO", + "widget": { + "name": "interpolation" + }, + "link": null + }, + { + "name": "padding_color", + "type": "COMBO", + "widget": { + "name": "padding_color" + }, + "link": null + }, + { + "label": "drop_audio", + "localized_name": "drop_audio", + "name": "drop_audio", + "type": "BOOLEAN", + "widget": { + "name": "drop_audio" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "merged_video", + "name": "merged_video", + "type": "VIDEO", + "links": [] + } + ], + "properties": { + "proxyWidgets": [ + [ + "28", + "value" + ], + [ + "6", + "interpolation" + ], + [ + "6", + "padding_color" + ], + [ + "11", + "value" + ] + ], + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [], + "title": "Merge Videos" + } + ], + "links": [], + "version": 0.4, + "definitions": { 
+ "subgraphs": [ + { + "id": "32e6dbcc-e2d7-45c0-a245-fc74b8271dfb", + "version": 1, + "state": { + "lastGroupId": 2, + "lastNodeId": 34, + "lastLinkId": 75, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Merge Videos", + "inputNode": { + "id": -10, + "bounding": [ + -1990, + 700, + 152.5546875, + 168 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + 1210, + 614, + 128, + 68 + ] + }, + "inputs": [ + { + "id": "2fb09e41-c5fa-4654-b9d2-569b59626ec4", + "name": "clip_to_resize", + "type": "VIDEO", + "linkIds": [ + 50 + ], + "localized_name": "clip_to_resize", + "label": "base_video", + "pos": [ + -1861.4453125, + 724 + ] + }, + { + "id": "017f8d09-7900-4dc9-b95c-0cab31bcde7d", + "name": "base_video", + "type": "VIDEO", + "linkIds": [ + 51 + ], + "localized_name": "base_video", + "label": "second_video", + "pos": [ + -1861.4453125, + 744 + ] + }, + { + "id": "a39894ce-1785-4037-b39c-b40d2e470c43", + "name": "pad_second_video", + "type": "BOOLEAN", + "linkIds": [ + 59 + ], + "localized_name": "pad_second_video", + "label": "pad_second_video", + "pos": [ + -1861.4453125, + 764 + ] + }, + { + "id": "b4fb86cb-8d87-4193-8533-88a57df50e18", + "name": "interpolation", + "type": "COMBO", + "linkIds": [ + 60 + ], + "pos": [ + -1861.4453125, + 784 + ] + }, + { + "id": "2413a2e2-cfdc-4d1d-9e2e-81e7acdf35e3", + "name": "padding_color", + "type": "COMBO", + "linkIds": [ + 62 + ], + "pos": [ + -1861.4453125, + 804 + ] + }, + { + "id": "338b1e09-0efb-424f-949b-e730a0aa8527", + "name": "drop_audio", + "type": "BOOLEAN", + "linkIds": [ + 63 + ], + "localized_name": "drop_audio", + "label": "drop_audio", + "pos": [ + -1861.4453125, + 824 + ] + } + ], + "outputs": [ + { + "id": "be99efc6-7fb3-4059-93d0-136dc8cc8faf", + "name": "merged_video", + "type": "VIDEO", + "linkIds": [ + 16 + ], + "localized_name": "merged_video", + "pos": [ + 1234, + 638 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 11, + "type": "PrimitiveBoolean", + "pos": [ + -990, + 
1230 + ], + "size": [ + 270, + 80 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "BOOLEAN", + "widget": { + "name": "value" + }, + "link": 63 + } + ], + "outputs": [ + { + "localized_name": "BOOLEAN", + "name": "BOOLEAN", + "type": "BOOLEAN", + "links": [ + 14 + ] + } + ], + "properties": { + "Node name for S&R": "PrimitiveBoolean", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + false + ] + }, + { + "id": 10, + "type": "EmptyAudio", + "pos": [ + -990, + 1060 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "duration", + "name": "duration", + "type": "FLOAT", + "widget": { + "name": "duration" + }, + "link": null + }, + { + "localized_name": "sample_rate", + "name": "sample_rate", + "type": "INT", + "widget": { + "name": "sample_rate" + }, + "link": null + }, + { + "localized_name": "channels", + "name": "channels", + "type": "INT", + "widget": { + "name": "channels" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "AUDIO", + "name": "AUDIO", + "type": "AUDIO", + "links": [ + 22 + ] + } + ], + "properties": { + "Node name for S&R": "EmptyAudio", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 60, + 44100, + 2 + ] + }, + { + "id": 3, + "type": "ComfySwitchNode", + "pos": [ + -370, + 1010 + ], + "size": [ + 270, + 130 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "on_false", + "name": "on_false", + "type": "*", + "link": 21 + }, + { + "localized_name": "on_true", + "name": "on_true", + "type": 
"*", + "link": 22 + }, + { + "localized_name": "switch", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": 14 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "*", + "links": [ + 12 + ] + } + ], + "properties": { + "Node name for S&R": "ComfySwitchNode", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + false + ] + }, + { + "id": 6, + "type": "ResizeAndPadImage", + "pos": [ + -400, + 440 + ], + "size": [ + 270, + 210 + ], + "flags": {}, + "order": 4, + "mode": 0, + "showAdvanced": true, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 39 + }, + { + "localized_name": "target_width", + "name": "target_width", + "type": "INT", + "widget": { + "name": "target_width" + }, + "link": 4 + }, + { + "localized_name": "target_height", + "name": "target_height", + "type": "INT", + "widget": { + "name": "target_height" + }, + "link": 5 + }, + { + "localized_name": "padding_color", + "name": "padding_color", + "type": "COMBO", + "widget": { + "name": "padding_color" + }, + "link": 62 + }, + { + "localized_name": "interpolation", + "name": "interpolation", + "type": "COMBO", + "widget": { + "name": "interpolation" + }, + "link": 60 + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 75 + ] + } + ], + "properties": { + "Node name for S&R": "ResizeAndPadImage", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 512, + 512, + "white", + "lanczos" + ] + }, + { + "id": 8, + "type": "CreateVideo", + "pos": [ + 880, + 280 + ], + "size": 
[ + 270, + 110 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "link": 19 + }, + { + "localized_name": "audio", + "name": "audio", + "shape": 7, + "type": "AUDIO", + "link": 12 + }, + { + "localized_name": "fps", + "name": "fps", + "type": "FLOAT", + "widget": { + "name": "fps" + }, + "link": 15 + } + ], + "outputs": [ + { + "localized_name": "VIDEO", + "name": "VIDEO", + "type": "VIDEO", + "links": [ + 16 + ] + } + ], + "properties": { + "Node name for S&R": "CreateVideo", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 30 + ] + }, + { + "id": 9, + "type": "AudioMerge", + "pos": [ + -990, + 890 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "localized_name": "audio1", + "name": "audio1", + "type": "AUDIO", + "link": 9 + }, + { + "localized_name": "audio2", + "name": "audio2", + "type": "AUDIO", + "link": 10 + }, + { + "localized_name": "merge_method", + "name": "merge_method", + "type": "COMBO", + "widget": { + "name": "merge_method" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "AUDIO", + "name": "AUDIO", + "type": "AUDIO", + "links": [ + 21 + ] + } + ], + "properties": { + "Node name for S&R": "AudioMerge", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "add" + ] + }, + { + "id": 2, + "type": "GetVideoComponents", + "pos": [ + -1590, + 460 + ], + "size": [ + 230, + 100 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "video", + "name": "video", + "type": "VIDEO", + "link": 51 + } + ], + 
"outputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "links": [ + 39, + 54 + ] + }, + { + "localized_name": "audio", + "name": "audio", + "type": "AUDIO", + "links": [ + 9 + ] + }, + { + "localized_name": "fps", + "name": "fps", + "type": "FLOAT", + "links": null + } + ], + "properties": { + "Node name for S&R": "GetVideoComponents", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 27, + "type": "ComfySwitchNode", + "pos": [ + 60, + 70 + ], + "size": [ + 280, + 130 + ], + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "localized_name": "on_false", + "name": "on_false", + "type": "*", + "link": 54 + }, + { + "localized_name": "on_true", + "name": "on_true", + "type": "*", + "link": 75 + }, + { + "localized_name": "switch", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": 56 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "*", + "links": [ + 55 + ] + } + ], + "properties": { + "Node name for S&R": "ComfySwitchNode", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + false + ] + }, + { + "id": 1, + "type": "GetVideoComponents", + "pos": [ + -1600, + 30 + ], + "size": [ + 230, + 100 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "video", + "name": "video", + "type": "VIDEO", + "link": 50 + } + ], + "outputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "links": [ + 3, + 17 + ] + }, + { + "localized_name": "audio", + "name": "audio", + "type": "AUDIO", + "links": [ + 10 + ] + }, + { + "localized_name": "fps", 
+ "name": "fps", + "type": "FLOAT", + "links": [ + 15 + ] + } + ], + "properties": { + "Node name for S&R": "GetVideoComponents", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 7, + "type": "GetImageSize", + "pos": [ + -1000, + 480 + ], + "size": [ + 260, + 110 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 3 + } + ], + "outputs": [ + { + "localized_name": "width", + "name": "width", + "type": "INT", + "links": [ + 4 + ] + }, + { + "localized_name": "height", + "name": "height", + "type": "INT", + "links": [ + 5 + ] + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "links": null + } + ], + "properties": { + "Node name for S&R": "GetImageSize", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 28, + "type": "PrimitiveBoolean", + "pos": [ + -1590, + 190 + ], + "size": [ + 270, + 80 + ], + "flags": {}, + "order": 11, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "BOOLEAN", + "widget": { + "name": "value" + }, + "link": 59 + } + ], + "outputs": [ + { + "localized_name": "BOOLEAN", + "name": "BOOLEAN", + "type": "BOOLEAN", + "links": [ + 56 + ] + } + ], + "properties": { + "Node name for S&R": "PrimitiveBoolean", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + false + ] + }, + { + "id": 13, + "type": "BatchImagesNode", + "pos": [ + 530, + 10 + ], + 
"size": [ + 230, + 120 + ], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "label": "image0", + "localized_name": "images.image0", + "name": "images.image0", + "type": "IMAGE", + "link": 17 + }, + { + "label": "image1", + "localized_name": "images.image1", + "name": "images.image1", + "shape": 7, + "type": "IMAGE", + "link": 55 + }, + { + "label": "image2", + "localized_name": "images.image2", + "name": "images.image2", + "shape": 7, + "type": "IMAGE", + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 19 + ] + } + ], + "properties": { + "Node name for S&R": "BatchImagesNode", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + } + ], + "groups": [ + { + "id": 1, + "title": "Audio", + "bounding": [ + -1000, + 820, + 915, + 496 + ], + "color": "#3f789e", + "flags": {} + } + ], + "links": [ + { + "id": 21, + "origin_id": 9, + "origin_slot": 0, + "target_id": 3, + "target_slot": 0, + "type": "AUDIO" + }, + { + "id": 22, + "origin_id": 10, + "origin_slot": 0, + "target_id": 3, + "target_slot": 1, + "type": "AUDIO" + }, + { + "id": 14, + "origin_id": 11, + "origin_slot": 0, + "target_id": 3, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 9, + "origin_id": 2, + "origin_slot": 1, + "target_id": 9, + "target_slot": 0, + "type": "AUDIO" + }, + { + "id": 10, + "origin_id": 1, + "origin_slot": 1, + "target_id": 9, + "target_slot": 1, + "type": "AUDIO" + }, + { + "id": 39, + "origin_id": 2, + "origin_slot": 0, + "target_id": 6, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 4, + "origin_id": 7, + "origin_slot": 0, + "target_id": 6, + "target_slot": 1, + "type": "INT" + }, + { + "id": 5, + "origin_id": 7, + "origin_slot": 1, + "target_id": 6, + "target_slot": 2, + "type": "INT" + }, + { + "id": 3, + 
"origin_id": 1, + "origin_slot": 0, + "target_id": 7, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 17, + "origin_id": 1, + "origin_slot": 0, + "target_id": 13, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 19, + "origin_id": 13, + "origin_slot": 0, + "target_id": 8, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 12, + "origin_id": 3, + "origin_slot": 0, + "target_id": 8, + "target_slot": 1, + "type": "AUDIO" + }, + { + "id": 15, + "origin_id": 1, + "origin_slot": 2, + "target_id": 8, + "target_slot": 2, + "type": "FLOAT" + }, + { + "id": 16, + "origin_id": 8, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "VIDEO" + }, + { + "id": 50, + "origin_id": -10, + "origin_slot": 0, + "target_id": 1, + "target_slot": 0, + "type": "VIDEO" + }, + { + "id": 51, + "origin_id": -10, + "origin_slot": 1, + "target_id": 2, + "target_slot": 0, + "type": "VIDEO" + }, + { + "id": 54, + "origin_id": 2, + "origin_slot": 0, + "target_id": 27, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 55, + "origin_id": 27, + "origin_slot": 0, + "target_id": 13, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 56, + "origin_id": 28, + "origin_slot": 0, + "target_id": 27, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 59, + "origin_id": -10, + "origin_slot": 2, + "target_id": 28, + "target_slot": 0, + "type": "BOOLEAN" + }, + { + "id": 60, + "origin_id": -10, + "origin_slot": 3, + "target_id": 6, + "target_slot": 4, + "type": "COMBO" + }, + { + "id": 62, + "origin_id": -10, + "origin_slot": 4, + "target_id": 6, + "target_slot": 3, + "type": "COMBO" + }, + { + "id": 63, + "origin_id": -10, + "origin_slot": 5, + "target_id": 11, + "target_slot": 0, + "type": "BOOLEAN" + }, + { + "id": 75, + "origin_id": 6, + "origin_slot": 0, + "target_id": 27, + "target_slot": 1, + "type": "IMAGE" + } + ], + "extra": {}, + "category": "Video Tools", + "description": "Concatenates two videos end-to-end with optional resize, letterbox padding, and 
audio merge or drop." + } + ] + }, + "extra": {} +} \ No newline at end of file diff --git a/blueprints/Pose to Image (Z-Image-Turbo).json b/blueprints/Pose to Image (Z-Image-Turbo).json index 5c2749efe..92ee80907 100644 --- a/blueprints/Pose to Image (Z-Image-Turbo).json +++ b/blueprints/Pose to Image (Z-Image-Turbo).json @@ -1298,7 +1298,7 @@ "VHS_MetadataImage": true, "VHS_KeepIntermediate": true }, - "category": "Image generation and editing/Pose to image", + "category": "Image generation and editing/Conditioned", "description": "Generates an image from pose keypoints using Z-Image-Turbo with text conditioning." } ] diff --git a/blueprints/Pose to Video (LTX 2.0).json b/blueprints/Pose to Video (LTX 2.0).json index 1ce49351a..04eb69972 100644 --- a/blueprints/Pose to Video (LTX 2.0).json +++ b/blueprints/Pose to Video (LTX 2.0).json @@ -3870,7 +3870,7 @@ "extra": { "workflowRendererVersion": "LG" }, - "category": "Video generation and editing/Pose to video", + "category": "Video generation and editing/Conditioned", "description": "Generates video from pose reference frames using LTX-2, with optional synchronized audio." } ] diff --git a/blueprints/Prompt Enhance.json b/blueprints/Prompt Enhance.json index e260b1203..e3a77a73b 100644 --- a/blueprints/Prompt Enhance.json +++ b/blueprints/Prompt Enhance.json @@ -270,7 +270,7 @@ "extra": { "workflowRendererVersion": "LG" }, - "category": "Text generation/Prompt enhance", + "category": "Text Tools", "description": "Expands short text prompts into detailed descriptions using a text generation model for better generation quality." 
} ] diff --git a/blueprints/Remove Background (BiRefNet).json b/blueprints/Remove Background (BiRefNet).json index 732a4adc4..9ec441e51 100644 --- a/blueprints/Remove Background (BiRefNet).json +++ b/blueprints/Remove Background (BiRefNet).json @@ -389,7 +389,7 @@ } ], "extra": {}, - "category": "Image generation and editing/Background Removal" + "category": "Image Tools/Background Removal" } ] }, diff --git a/blueprints/Select Per-Line Text by Index.json b/blueprints/Select Per-Line Text by Index.json new file mode 100644 index 000000000..8a4020d50 --- /dev/null +++ b/blueprints/Select Per-Line Text by Index.json @@ -0,0 +1,485 @@ +{ + "revision": 0, + "last_node_id": 10, + "last_link_id": 0, + "nodes": [ + { + "id": 10, + "type": "3fb7557a-470d-4983-9d8c-6d5caa9788f0", + "pos": [ + -250, + 8590 + ], + "size": [ + 280, + 360 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "text_per_line", + "name": "text_per_line", + "type": "STRING", + "widget": { + "name": "text_per_line" + }, + "link": null + }, + { + "localized_name": "index", + "name": "index", + "type": "INT", + "widget": { + "name": "index" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "selected_line", + "name": "selected_line", + "type": "STRING", + "links": [] + } + ], + "properties": { + "proxyWidgets": [ + [ + "2", + "string" + ], + [ + "3", + "value" + ] + ], + "cnr_id": "comfy-core", + "ver": "0.19.0", + "ue_properties": { + "widget_ue_connectable": {}, + "input_ue_unconnectable": {} + } + }, + "widgets_values": [], + "title": "Select Per-Line Text by Index" + } + ], + "links": [], + "version": 0.4, + "definitions": { + "subgraphs": [ + { + "id": "3fb7557a-470d-4983-9d8c-6d5caa9788f0", + "version": 1, + "state": { + "lastGroupId": 0, + "lastNodeId": 10, + "lastLinkId": 14, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Select Per-Line Text by Index", + "inputNode": { + "id": -10, + "bounding": [ + -990, + 8595, + 128, + 
88 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + 710, + 8585, + 128, + 68 + ] + }, + "inputs": [ + { + "id": "75417d82-a934-4ac9-b667-d8dcd5a3bfb3", + "name": "text_per_line", + "type": "STRING", + "linkIds": [ + 13 + ], + "localized_name": "text_per_line", + "pos": [ + -886, + 8619 + ] + }, + { + "id": "46e69a73-1804-4ca6-9175-31445bf0be96", + "name": "index", + "type": "INT", + "linkIds": [ + 14 + ], + "localized_name": "index", + "pos": [ + -886, + 8639 + ] + } + ], + "outputs": [ + { + "id": "e34e8ad1-84d2-4bd2-a460-eb7de6067c10", + "name": "selected_line", + "type": "STRING", + "linkIds": [ + 10 + ], + "localized_name": "selected_line", + "pos": [ + 734, + 8609 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 1, + "type": "PreviewAny", + "pos": [ + -500, + 8400 + ], + "size": [ + 230, + 180 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "source", + "name": "source", + "type": "*", + "link": 1 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 6 + ] + } + ], + "properties": { + "Node name for S&R": "PreviewAny", + "cnr_id": "comfy-core", + "ver": "0.19.0", + "ue_properties": { + "widget_ue_connectable": {}, + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + null, + null, + null + ] + }, + { + "id": 2, + "type": "RegexExtract", + "pos": [ + -240, + 8740 + ], + "size": [ + 470, + 460 + ], + "flags": {}, + "order": 1, + "mode": 0, + "showAdvanced": false, + "inputs": [ + { + "localized_name": "string", + "name": "string", + "type": "STRING", + "widget": { + "name": "string" + }, + "link": 13 + }, + { + "localized_name": "regex_pattern", + "name": "regex_pattern", + "type": "STRING", + "widget": { + "name": "regex_pattern" + }, + "link": 9 + }, + { + "localized_name": "mode", + "name": "mode", + "type": "COMBO", + "widget": { + "name": "mode" + }, + "link": null + }, + { + "localized_name": "case_insensitive", + "name": 
"case_insensitive", + "type": "BOOLEAN", + "widget": { + "name": "case_insensitive" + }, + "link": null + }, + { + "localized_name": "multiline", + "name": "multiline", + "type": "BOOLEAN", + "widget": { + "name": "multiline" + }, + "link": null + }, + { + "localized_name": "dotall", + "name": "dotall", + "type": "BOOLEAN", + "widget": { + "name": "dotall" + }, + "link": null + }, + { + "localized_name": "group_index", + "name": "group_index", + "type": "INT", + "widget": { + "name": "group_index" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 10 + ] + } + ], + "properties": { + "Node name for S&R": "RegexExtract", + "cnr_id": "comfy-core", + "ver": "0.19.0", + "ue_properties": { + "widget_ue_connectable": {}, + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + "", + "", + "First Group", + false, + false, + false, + 1 + ] + }, + { + "id": 3, + "type": "PrimitiveInt", + "pos": [ + -810, + 8400 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "INT", + "widget": { + "name": "value" + }, + "link": 14 + } + ], + "outputs": [ + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 1 + ] + } + ], + "title": "Int (line index)", + "properties": { + "Node name for S&R": "Int (line index)", + "cnr_id": "comfy-core", + "ver": "0.19.0", + "ue_properties": { + "widget_ue_connectable": {}, + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + 0, + "fixed" + ] + }, + { + "id": 8, + "type": "StringReplace", + "pos": [ + -240, + 8400 + ], + "size": [ + 400, + 280 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "string", + "name": "string", + "type": "STRING", + "widget": { + "name": "string" + }, + "link": null + }, + { + "localized_name": "find", + "name": "find", + "type": "STRING", + "widget": { + "name": "find" 
+ }, + "link": null + }, + { + "localized_name": "replace", + "name": "replace", + "type": "STRING", + "widget": { + "name": "replace" + }, + "link": 6 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 9 + ] + } + ], + "properties": { + "Node name for S&R": "StringReplace", + "cnr_id": "comfy-core", + "ver": "0.19.0", + "ue_properties": { + "widget_ue_connectable": {}, + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + "^(?:[^\\n]*\\n){index}([^\\n]*)(?:\\n|$)", + "index", + "" + ] + } + ], + "groups": [], + "links": [ + { + "id": 1, + "origin_id": 3, + "origin_slot": 0, + "target_id": 1, + "target_slot": 0, + "type": "INT" + }, + { + "id": 9, + "origin_id": 8, + "origin_slot": 0, + "target_id": 2, + "target_slot": 1, + "type": "STRING" + }, + { + "id": 6, + "origin_id": 1, + "origin_slot": 0, + "target_id": 8, + "target_slot": 2, + "type": "STRING" + }, + { + "id": 10, + "origin_id": 2, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "STRING" + }, + { + "id": 13, + "origin_id": -10, + "origin_slot": 0, + "target_id": 2, + "target_slot": 0, + "type": "STRING" + }, + { + "id": 14, + "origin_id": -10, + "origin_slot": 1, + "target_id": 3, + "target_slot": 0, + "type": "INT" + } + ], + "extra": {}, + "category": "Text Tools", + "description": "Selects one line from multiline text by zero-based index for batch or list-driven prompt workflows." 
+ } + ] + }, + "extra": { + "ue_links": [], + "links_added_by_ue": [] + } +} \ No newline at end of file diff --git a/blueprints/Split Image Grid to Tiles.json b/blueprints/Split Image Grid to Tiles.json new file mode 100644 index 000000000..d1f6e40ef --- /dev/null +++ b/blueprints/Split Image Grid to Tiles.json @@ -0,0 +1,714 @@ +{ + "revision": 0, + "last_node_id": 251, + "last_link_id": 0, + "nodes": [ + { + "id": 251, + "type": "609e1fd1-b731-4b78-89ac-d19b1156b025", + "pos": [ + -1490, + 130 + ], + "size": [ + 230, + 164 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "source_image", + "name": "source_image", + "type": "IMAGE", + "link": null + }, + { + "localized_name": "columns", + "name": "columns", + "type": "INT", + "widget": { + "name": "columns" + }, + "link": null + }, + { + "localized_name": "rows", + "name": "rows", + "type": "INT", + "widget": { + "name": "rows" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "tiles", + "name": "tiles", + "type": "IMAGE", + "links": [] + } + ], + "properties": { + "proxyWidgets": [ + [ + "228", + "value" + ], + [ + "252", + "value" + ] + ], + "cnr_id": "comfy-core", + "ver": "0.20.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [], + "title": "Split Image Grid to Tiles" + } + ], + "links": [], + "version": 0.4, + "definitions": { + "subgraphs": [ + { + "id": "609e1fd1-b731-4b78-89ac-d19b1156b025", + "version": 1, + "state": { + "lastGroupId": 9, + "lastNodeId": 252, + "lastLinkId": 429, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Split Image Grid to Tiles", + "inputNode": { + "id": -10, + "bounding": [ + -1690, + 260, + 128, + 108 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + -510, + 590, + 128, + 68 + ] + }, + "inputs": [ + { + "id": "866ac798-cfbc-450a-b755-e704f86404d9", + 
"name": "source_image", + "type": "IMAGE", + "linkIds": [ + 386, + 389 + ], + "localized_name": "source_image", + "pos": [ + -1586, + 284 + ] + }, + { + "id": "bc37b1f8-8ab2-4f19-bd00-75d4fbc4feb3", + "name": "columns", + "type": "INT", + "linkIds": [ + 427 + ], + "localized_name": "columns", + "pos": [ + -1586, + 304 + ] + }, + { + "id": "d45915da-e848-43dd-9ccc-e3161e9c99d9", + "name": "rows", + "type": "INT", + "linkIds": [ + 428 + ], + "localized_name": "rows", + "pos": [ + -1586, + 324 + ] + } + ], + "outputs": [ + { + "id": "18bc780f-064b-4038-87c6-67dba71deb08", + "name": "tiles", + "type": "IMAGE", + "linkIds": [ + 394 + ], + "localized_name": "tiles", + "shape": 6, + "pos": [ + -486, + 614 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 225, + "type": "SplitImageToTileList", + "pos": [ + -1010, + 620 + ], + "size": [ + 290, + 170 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 386 + }, + { + "localized_name": "tile_width", + "name": "tile_width", + "type": "INT", + "widget": { + "name": "tile_width" + }, + "link": 403 + }, + { + "localized_name": "tile_height", + "name": "tile_height", + "type": "INT", + "widget": { + "name": "tile_height" + }, + "link": 404 + }, + { + "localized_name": "overlap", + "name": "overlap", + "type": "INT", + "widget": { + "name": "overlap" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "shape": 6, + "type": "IMAGE", + "links": [ + 394 + ] + } + ], + "properties": { + "Node name for S&R": "SplitImageToTileList", + "cnr_id": "comfy-core", + "ver": "0.20.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 1024, + 1024, + 0 + ] + }, + { + "id": 231, + "type": "ComfyMathExpression", + "pos": [ + -1080, + 330 + ], + "size": [ + 370, + 190 + 
], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "label": "a", + "localized_name": "values.a", + "name": "values.a", + "type": "FLOAT,INT,BOOLEAN", + "link": 390 + }, + { + "label": "b", + "localized_name": "values.b", + "name": "values.b", + "shape": 7, + "type": "FLOAT,INT,BOOLEAN", + "link": 429 + }, + { + "label": "c", + "localized_name": "values.c", + "name": "values.c", + "shape": 7, + "type": "FLOAT,INT,BOOLEAN", + "link": null + }, + { + "localized_name": "expression", + "name": "expression", + "type": "STRING", + "widget": { + "name": "expression" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "links": null + }, + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 404 + ] + }, + { + "localized_name": "BOOL", + "name": "BOOL", + "type": "BOOLEAN", + "links": null + } + ], + "title": "Math Expression (Height)", + "properties": { + "Node name for S&R": "ComfyMathExpression", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + "max(1, (int(a) + int(b) - 1) // int(b))" + ] + }, + { + "id": 229, + "type": "ComfyMathExpression", + "pos": [ + -1090, + -30 + ], + "size": [ + 370, + 190 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "label": "a", + "localized_name": "values.a", + "name": "values.a", + "type": "FLOAT,INT,BOOLEAN", + "link": 387 + }, + { + "label": "b", + "localized_name": "values.b", + "name": "values.b", + "shape": 7, + "type": "FLOAT,INT,BOOLEAN", + "link": 388 + }, + { + "label": "c", + "localized_name": "values.c", + "name": "values.c", + "shape": 7, + "type": "FLOAT,INT,BOOLEAN", + "link": null + }, + { + "localized_name": 
"expression", + "name": "expression", + "type": "STRING", + "widget": { + "name": "expression" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "links": null + }, + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 403 + ] + }, + { + "localized_name": "BOOL", + "name": "BOOL", + "type": "BOOLEAN", + "links": null + } + ], + "title": "Math Expression (Width)", + "properties": { + "Node name for S&R": "ComfyMathExpression", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + "max(1, (int(a) + int(b) - 1) // int(b))" + ] + }, + { + "id": 228, + "type": "PrimitiveInt", + "pos": [ + -1380, + 90 + ], + "size": [ + 230, + 110 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "INT", + "widget": { + "name": "value" + }, + "link": 427 + } + ], + "outputs": [ + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 388 + ] + } + ], + "title": "Int (grid columns)", + "properties": { + "Node name for S&R": "Int (grid columns)", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + 2, + "fixed" + ] + }, + { + "id": 230, + "type": "GetImageSize", + "pos": [ + -1380, + 290 + ], + "size": [ + 230, + 100 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", 
+ "type": "IMAGE", + "link": 389 + } + ], + "outputs": [ + { + "localized_name": "width", + "name": "width", + "type": "INT", + "links": [ + 387 + ] + }, + { + "localized_name": "height", + "name": "height", + "type": "INT", + "links": [ + 390 + ] + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "links": null + } + ], + "properties": { + "Node name for S&R": "GetImageSize", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + } + }, + { + "id": 252, + "type": "PrimitiveInt", + "pos": [ + -1380, + 470 + ], + "size": [ + 230, + 110 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "INT", + "widget": { + "name": "value" + }, + "link": 428 + } + ], + "outputs": [ + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 429 + ] + } + ], + "title": "Int (grid rows)", + "properties": { + "Node name for S&R": "Int (grid rows)", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + 3, + "fixed" + ] + } + ], + "groups": [], + "links": [ + { + "id": 403, + "origin_id": 229, + "origin_slot": 1, + "target_id": 225, + "target_slot": 1, + "type": "INT" + }, + { + "id": 404, + "origin_id": 231, + "origin_slot": 1, + "target_id": 225, + "target_slot": 2, + "type": "INT" + }, + { + "id": 390, + "origin_id": 230, + "origin_slot": 1, + "target_id": 231, + "target_slot": 0, + "type": "INT" + }, + { + 
"id": 387, + "origin_id": 230, + "origin_slot": 0, + "target_id": 229, + "target_slot": 0, + "type": "INT" + }, + { + "id": 388, + "origin_id": 228, + "origin_slot": 0, + "target_id": 229, + "target_slot": 1, + "type": "INT" + }, + { + "id": 386, + "origin_id": -10, + "origin_slot": 0, + "target_id": 225, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 389, + "origin_id": -10, + "origin_slot": 0, + "target_id": 230, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 394, + "origin_id": 225, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 427, + "origin_id": -10, + "origin_slot": 1, + "target_id": 228, + "target_slot": 0, + "type": "INT" + }, + { + "id": 428, + "origin_id": -10, + "origin_slot": 2, + "target_id": 252, + "target_slot": 0, + "type": "INT" + }, + { + "id": 429, + "origin_id": 252, + "origin_slot": 0, + "target_id": 231, + "target_slot": 1, + "type": "INT" + } + ], + "extra": {}, + "category": "Image Tools/Crop", + "description": "Splits an image into a configurable columns×rows grid of equal tiles for tiled generation or processing." 
+ } + ] + }, + "extra": {} +} \ No newline at end of file diff --git a/blueprints/Text to Image (Anima).json b/blueprints/Text to Image (Anima).json new file mode 100644 index 000000000..787908ca9 --- /dev/null +++ b/blueprints/Text to Image (Anima).json @@ -0,0 +1,1085 @@ +{ + "revision": 0, + "last_node_id": 60, + "last_link_id": 0, + "nodes": [ + { + "id": 60, + "type": "a3c0dab6-b250-4585-a0f9-8fb8b074fb2f", + "pos": [ + -10, + 130 + ], + "size": [ + 500, + 640 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "label": "prompt", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + }, + "link": null + }, + { + "name": "width", + "type": "INT", + "widget": { + "name": "width" + }, + "link": null + }, + { + "name": "height", + "type": "INT", + "widget": { + "name": "height" + }, + "link": null + }, + { + "name": "steps", + "type": "INT", + "widget": { + "name": "steps" + }, + "link": null + }, + { + "name": "cfg", + "type": "FLOAT", + "widget": { + "name": "cfg" + }, + "link": null + }, + { + "name": "seed", + "type": "INT", + "widget": { + "name": "seed" + }, + "link": null + }, + { + "name": "unet_name", + "type": "COMBO", + "widget": { + "name": "unet_name" + }, + "link": null + }, + { + "name": "clip_name", + "type": "COMBO", + "widget": { + "name": "clip_name" + }, + "link": null + }, + { + "name": "vae_name", + "type": "COMBO", + "widget": { + "name": "vae_name" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [] + } + ], + "properties": { + "proxyWidgets": [ + [ + "11", + "text" + ], + [ + "28", + "width" + ], + [ + "28", + "height" + ], + [ + "19", + "steps" + ], + [ + "19", + "cfg" + ], + [ + "19", + "seed" + ], + [ + "44", + "unet_name" + ], + [ + "45", + "clip_name" + ], + [ + "15", + "vae_name" + ] + ], + "cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + 
"secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [], + "title": "Text to Image (Anima)" + } + ], + "links": [], + "version": 0.4, + "definitions": { + "subgraphs": [ + { + "id": "a3c0dab6-b250-4585-a0f9-8fb8b074fb2f", + "version": 1, + "state": { + "lastGroupId": 3, + "lastNodeId": 70, + "lastLinkId": 104, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Text to Image (Anima)", + "inputNode": { + "id": -10, + "bounding": [ + -330, + 530, + 120, + 220 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + 1229.9999873482075, + 505, + 120, + 60 + ] + }, + "inputs": [ + { + "id": "4693f350-6ba0-446d-80d4-3038c661d26c", + "name": "text", + "type": "STRING", + "linkIds": [ + 95 + ], + "label": "prompt", + "pos": [ + -230, + 550 + ] + }, + { + "id": "4a7886a9-4ed7-49bb-afc2-977bb78a303d", + "name": "width", + "type": "INT", + "linkIds": [ + 96 + ], + "pos": [ + -230, + 570 + ] + }, + { + "id": "f6c04461-d29e-49e3-8790-07bb662bbbfe", + "name": "height", + "type": "INT", + "linkIds": [ + 97 + ], + "pos": [ + -230, + 590 + ] + }, + { + "id": "7a24f998-3808-4837-8bff-52304ad09fb6", + "name": "steps", + "type": "INT", + "linkIds": [ + 98 + ], + "pos": [ + -230, + 610 + ] + }, + { + "id": "aaa99698-b222-40fe-b946-28067528a85c", + "name": "cfg", + "type": "FLOAT", + "linkIds": [ + 99 + ], + "pos": [ + -230, + 630 + ] + }, + { + "id": "053df9ae-7311-4816-aa23-7fa13c656ced", + "name": "seed", + "type": "INT", + "linkIds": [ + 100 + ], + "pos": [ + -230, + 650 + ] + }, + { + "id": "c59194ea-015c-41a7-8edd-ae7ffc220b63", + "name": "unet_name", + "type": "COMBO", + "linkIds": [ + 101 + ], + "pos": [ + -230, + 670 + ] + }, + { + "id": "e655aa3b-2db7-4e25-9ea2-61550fa7ae2d", + "name": "clip_name", + "type": "COMBO", + "linkIds": [ + 102 + ], + "pos": [ + -230, + 690 + ] + }, + { + "id": "94965a7a-74dd-4f5a-87e3-9f87995d554f", + "name": "vae_name", + "type": "COMBO", + "linkIds": [ + 103 + ], + "pos": [ + 
-230, + 710 + ] + } + ], + "outputs": [ + { + "id": "ef85ac0a-2152-4232-bfa1-929cfc913718", + "name": "IMAGE", + "type": "IMAGE", + "linkIds": [ + 82 + ], + "localized_name": "IMAGE", + "pos": [ + 1249.9999873482075, + 525 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 45, + "type": "CLIPLoader", + "pos": [ + -60, + 380 + ], + "size": [ + 310, + 150 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "localized_name": "clip_name", + "name": "clip_name", + "type": "COMBO", + "widget": { + "name": "clip_name" + }, + "link": 102 + }, + { + "localized_name": "type", + "name": "type", + "type": "COMBO", + "widget": { + "name": "type" + }, + "link": null + }, + { + "localized_name": "device", + "name": "device", + "shape": 7, + "type": "COMBO", + "widget": { + "name": "device" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "links": [ + 80, + 81 + ] + } + ], + "properties": { + "Node name for S&R": "CLIPLoader", + "cnr_id": "comfy-core", + "ver": "0.11.0", + "models": [ + { + "name": "qwen_3_06b_base.safetensors", + "url": "https://huggingface.co/circlestone-labs/Anima/resolve/main/split_files/text_encoders/qwen_3_06b_base.safetensors", + "directory": "text_encoders" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "qwen_3_06b_base.safetensors", + "stable_diffusion", + "default" + ] + }, + { + "id": 15, + "type": "VAELoader", + "pos": [ + -50, + 610 + ], + "size": [ + 310, + 100 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "vae_name", + "name": "vae_name", + "type": "COMBO", + "widget": { + "name": "vae_name" + }, + "link": 103 + } + ], + "outputs": [ + { + "localized_name": "VAE", + "name": "VAE", + "type": "VAE", + "links": [ + 11 + ] + } + ], + "properties": { + "Node name for S&R": 
"VAELoader", + "cnr_id": "comfy-core", + "ver": "0.3.40", + "models": [ + { + "name": "qwen_image_vae.safetensors", + "url": "https://huggingface.co/circlestone-labs/Anima/resolve/main/split_files/vae/qwen_image_vae.safetensors", + "directory": "vae" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "qwen_image_vae.safetensors" + ] + }, + { + "id": 8, + "type": "VAEDecode", + "pos": [ + 880, + 840 + ], + "size": [ + 230, + 90 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "samples", + "name": "samples", + "type": "LATENT", + "link": 10 + }, + { + "localized_name": "vae", + "name": "vae", + "type": "VAE", + "link": 11 + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "slot_index": 0, + "links": [ + 82 + ] + } + ], + "properties": { + "Node name for S&R": "VAEDecode", + "cnr_id": "comfy-core", + "ver": "0.3.40", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 28, + "type": "EmptyLatentImage", + "pos": [ + -50, + 830 + ], + "size": [ + 310, + 150 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "localized_name": "width", + "name": "width", + "type": "INT", + "widget": { + "name": "width" + }, + "link": 96 + }, + { + "localized_name": "height", + "name": "height", + "type": "INT", + "widget": { + "name": "height" + }, + "link": 97 + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "widget": { + "name": "batch_size" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "LATENT", + "name": "LATENT", + "type": "LATENT", + "links": [ + 78 + ] + } + ], + "properties": { + "Node name for S&R": "EmptyLatentImage", + "cnr_id": "comfy-core", 
+ "ver": "0.3.40", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 1024, + 1024, + 1 + ] + }, + { + "id": 12, + "type": "CLIPTextEncode", + "pos": [ + 330, + 830 + ], + "size": [ + 490, + 140 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "clip", + "name": "clip", + "type": "CLIP", + "link": 81 + }, + { + "localized_name": "text", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "CONDITIONING", + "name": "CONDITIONING", + "type": "CONDITIONING", + "links": [ + 40 + ] + } + ], + "title": "CLIP Text Encode (Negative Prompt)", + "properties": { + "Node name for S&R": "CLIPTextEncode", + "cnr_id": "comfy-core", + "ver": "0.3.65", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "worst quality, low quality, score_1, score_2, score_3, blurry, jpeg artifacts, sepia" + ], + "color": "#223", + "bgcolor": "#335" + }, + { + "id": 19, + "type": "KSampler", + "pos": [ + 870, + 120 + ], + "size": [ + 300, + 620 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 79 + }, + { + "localized_name": "positive", + "name": "positive", + "type": "CONDITIONING", + "link": 39 + }, + { + "localized_name": "negative", + "name": "negative", + "type": "CONDITIONING", + "link": 40 + }, + { + "localized_name": "latent_image", + "name": "latent_image", + "type": "LATENT", + "link": 78 + }, + { + "localized_name": "seed", + "name": "seed", + "type": "INT", + "widget": { + "name": "seed" + }, + "link": 100 + }, + { + "localized_name": "steps", + "name": "steps", + "type": "INT", + "widget": { + 
"name": "steps" + }, + "link": 98 + }, + { + "localized_name": "cfg", + "name": "cfg", + "type": "FLOAT", + "widget": { + "name": "cfg" + }, + "link": 99 + }, + { + "localized_name": "sampler_name", + "name": "sampler_name", + "type": "COMBO", + "widget": { + "name": "sampler_name" + }, + "link": null + }, + { + "localized_name": "scheduler", + "name": "scheduler", + "type": "COMBO", + "widget": { + "name": "scheduler" + }, + "link": null + }, + { + "localized_name": "denoise", + "name": "denoise", + "type": "FLOAT", + "widget": { + "name": "denoise" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "LATENT", + "name": "LATENT", + "type": "LATENT", + "slot_index": 0, + "links": [ + 10 + ] + } + ], + "properties": { + "Node name for S&R": "KSampler", + "cnr_id": "comfy-core", + "ver": "0.3.40", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 0, + "fixed", + 30, + 4, + "er_sde", + "simple", + 1 + ] + }, + { + "id": 11, + "type": "CLIPTextEncode", + "pos": [ + 320, + 170 + ], + "size": [ + 490, + 610 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "clip", + "name": "clip", + "type": "CLIP", + "link": 80 + }, + { + "localized_name": "text", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + }, + "link": 95 + } + ], + "outputs": [ + { + "localized_name": "CONDITIONING", + "name": "CONDITIONING", + "type": "CONDITIONING", + "links": [ + 39 + ] + } + ], + "title": "CLIP Text Encode (Positive Prompt)", + "properties": { + "Node name for S&R": "CLIPTextEncode", + "cnr_id": "comfy-core", + "ver": "0.3.65", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "" + ], + "color": "#232", + "bgcolor": "#353" + }, + { + 
"id": 44, + "type": "UNETLoader", + "pos": [ + -50, + 170 + ], + "size": [ + 310, + 130 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "localized_name": "unet_name", + "name": "unet_name", + "type": "COMBO", + "widget": { + "name": "unet_name" + }, + "link": 101 + }, + { + "localized_name": "weight_dtype", + "name": "weight_dtype", + "type": "COMBO", + "widget": { + "name": "weight_dtype" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": "MODEL", + "links": [ + 79 + ] + } + ], + "properties": { + "Node name for S&R": "UNETLoader", + "cnr_id": "comfy-core", + "ver": "0.11.0", + "models": [ + { + "name": "anima-base-v1.0.safetensors", + "url": "https://huggingface.co/circlestone-labs/Anima/resolve/main/split_files/diffusion_models/anima-base-v1.0.safetensors", + "directory": "diffusion_models" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "anima-base-v1.0.safetensors", + "default" + ] + } + ], + "groups": [ + { + "id": 1, + "title": "Model", + "bounding": [ + -80, + 80, + 360, + 640 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 2, + "title": "Image Size(1MP)", + "bounding": [ + -80, + 750, + 360, + 240 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 3, + "title": "Prompt", + "bounding": [ + 300, + 80, + 530, + 910 + ], + "color": "#3f789e", + "flags": {} + } + ], + "links": [ + { + "id": 10, + "origin_id": 19, + "origin_slot": 0, + "target_id": 8, + "target_slot": 0, + "type": "LATENT" + }, + { + "id": 11, + "origin_id": 15, + "origin_slot": 0, + "target_id": 8, + "target_slot": 1, + "type": "VAE" + }, + { + "id": 81, + "origin_id": 45, + "origin_slot": 0, + "target_id": 12, + "target_slot": 0, + "type": "CLIP" + }, + { + "id": 79, + "origin_id": 44, + "origin_slot": 0, + "target_id": 19, + "target_slot": 0, + 
"type": "MODEL" + }, + { + "id": 39, + "origin_id": 11, + "origin_slot": 0, + "target_id": 19, + "target_slot": 1, + "type": "CONDITIONING" + }, + { + "id": 40, + "origin_id": 12, + "origin_slot": 0, + "target_id": 19, + "target_slot": 2, + "type": "CONDITIONING" + }, + { + "id": 78, + "origin_id": 28, + "origin_slot": 0, + "target_id": 19, + "target_slot": 3, + "type": "LATENT" + }, + { + "id": 80, + "origin_id": 45, + "origin_slot": 0, + "target_id": 11, + "target_slot": 0, + "type": "CLIP" + }, + { + "id": 82, + "origin_id": 8, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 95, + "origin_id": -10, + "origin_slot": 0, + "target_id": 11, + "target_slot": 1, + "type": "STRING" + }, + { + "id": 96, + "origin_id": -10, + "origin_slot": 1, + "target_id": 28, + "target_slot": 0, + "type": "INT" + }, + { + "id": 97, + "origin_id": -10, + "origin_slot": 2, + "target_id": 28, + "target_slot": 1, + "type": "INT" + }, + { + "id": 98, + "origin_id": -10, + "origin_slot": 3, + "target_id": 19, + "target_slot": 5, + "type": "INT" + }, + { + "id": 99, + "origin_id": -10, + "origin_slot": 4, + "target_id": 19, + "target_slot": 6, + "type": "FLOAT" + }, + { + "id": 100, + "origin_id": -10, + "origin_slot": 5, + "target_id": 19, + "target_slot": 4, + "type": "INT" + }, + { + "id": 101, + "origin_id": -10, + "origin_slot": 6, + "target_id": 44, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 102, + "origin_id": -10, + "origin_slot": 7, + "target_id": 45, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 103, + "origin_id": -10, + "origin_slot": 8, + "target_id": 15, + "target_slot": 0, + "type": "COMBO" + } + ], + "extra": {}, + "category": "Image generation and editing/Text to image" + } + ] + }, + "extra": {} +} \ No newline at end of file diff --git a/blueprints/Video Captioning (Gemini).json b/blueprints/Video Captioning (Gemini).json index 7642b23c1..54a7d6e78 100644 --- a/blueprints/Video Captioning (Gemini).json +++ 
b/blueprints/Video Captioning (Gemini).json @@ -307,9 +307,9 @@ "extra": { "workflowRendererVersion": "LG" }, - "category": "Text generation/Video Captioning", + "category": "Video Tools", "description": "Generates descriptive captions for video input using Google's Gemini multimodal LLM." } ] } -} +} \ No newline at end of file diff --git a/blueprints/Video Depth Estimation (MoGe).json b/blueprints/Video Depth Estimation (MoGe).json new file mode 100644 index 000000000..025e20cda --- /dev/null +++ b/blueprints/Video Depth Estimation (MoGe).json @@ -0,0 +1,1226 @@ +{ + "revision": 0, + "last_node_id": 72, + "last_link_id": 0, + "nodes": [ + { + "id": 72, + "type": "7ff83f68-6848-47a8-aa43-9036ca6c46e8", + "pos": [ + -4440, + 4550 + ], + "size": [ + 430, + 330 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "inference_resolution", + "name": "inference_resolution", + "type": "INT", + "widget": { + "name": "inference_resolution" + }, + "link": null + }, + { + "localized_name": "inference_batch_size", + "name": "inference_batch_size", + "type": "INT", + "widget": { + "name": "inference_batch_size" + }, + "link": null + }, + { + "localized_name": "moge_model", + "name": "moge_model", + "type": "COMBO", + "widget": { + "name": "moge_model" + }, + "link": null + }, + { + "label": "auto_resize_input", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": null + }, + { + "name": "video", + "type": "VIDEO", + "link": null + } + ], + "outputs": [ + { + "localized_name": "depth_colored", + "name": "depth_colored", + "type": "IMAGE", + "links": [] + }, + { + "localized_name": "depth", + "name": "depth", + "type": "IMAGE", + "links": [] + }, + { + "name": "MASK", + "type": "MASK", + "links": [] + } + ], + "properties": { + "proxyWidgets": [ + [ + "13", + "resolution_level" + ], + [ + "13", + "batch_size" + ], + [ + "32", + "model_name" + ], + [ + "53", + "switch" + ] + ], + "enableTabs": false, + "tabWidth": 
65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.21.1" + }, + "widgets_values": [], + "title": "Video Depth Estimation (MoGe)" + } + ], + "links": [], + "version": 0.4, + "definitions": { + "subgraphs": [ + { + "id": "7ff83f68-6848-47a8-aa43-9036ca6c46e8", + "version": 1, + "state": { + "lastGroupId": 1, + "lastNodeId": 72, + "lastLinkId": 96, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Video Depth Estimation (MoGe)", + "inputNode": { + "id": -10, + "bounding": [ + -5320, + 5320, + 167.337890625, + 148 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + -3090, + 4966, + 129, + 108 + ] + }, + "inputs": [ + { + "id": "06eefa21-8e60-49f3-9a34-35b081f4ae52", + "name": "inference_resolution", + "type": "INT", + "linkIds": [ + 73 + ], + "localized_name": "inference_resolution", + "pos": [ + -5176.662109375, + 5344 + ] + }, + { + "id": "616638fe-f603-4d10-bae9-fc87c134380f", + "name": "inference_batch_size", + "type": "INT", + "linkIds": [ + 74 + ], + "localized_name": "inference_batch_size", + "pos": [ + -5176.662109375, + 5364 + ] + }, + { + "id": "65694805-186e-4181-a721-df8b5af49d31", + "name": "moge_model", + "type": "COMBO", + "linkIds": [ + 79 + ], + "localized_name": "moge_model", + "pos": [ + -5176.662109375, + 5384 + ] + }, + { + "id": "badf1be1-53c6-4fc1-b5cd-79ad3daf1674", + "name": "switch", + "type": "BOOLEAN", + "linkIds": [ + 83 + ], + "label": "auto_resize_input", + "pos": [ + -5176.662109375, + 5404 + ] + }, + { + "id": "749bad18-d00a-4ec4-a5ff-e45b1d0cf089", + "name": "video", + "type": "VIDEO", + "linkIds": [ + 91 + ], + "pos": [ + -5176.662109375, + 5424 + ] + } + ], + "outputs": [ + { + "id": "59c37b52-074f-49fc-9731-483f899c12c4", + "name": "depth_colored", + "type": "IMAGE", + "linkIds": [ + 36 + ], + "localized_name": "depth_colored", + "pos": [ + -3066, + 4990 + ] + }, + { + "id": 
"f583e936-da5c-4630-9901-391fa605c1f8", + "name": "depth", + "type": "IMAGE", + "linkIds": [ + 40 + ], + "localized_name": "depth", + "pos": [ + -3066, + 5010 + ] + }, + { + "id": "6845b6a1-1980-454a-9451-314f24495c1d", + "name": "MASK", + "type": "MASK", + "linkIds": [ + 86 + ], + "pos": [ + -3066, + 5030 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 13, + "type": "MoGeInference", + "pos": [ + -3790, + 5180 + ], + "size": [ + 270, + 230 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "moge_model", + "name": "moge_model", + "type": "MOGE_MODEL", + "link": 58 + }, + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 81 + }, + { + "localized_name": "resolution_level", + "name": "resolution_level", + "type": "INT", + "widget": { + "name": "resolution_level" + }, + "link": 73 + }, + { + "localized_name": "fov_x_degrees", + "name": "fov_x_degrees", + "type": "FLOAT", + "widget": { + "name": "fov_x_degrees" + }, + "link": null + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "widget": { + "name": "batch_size" + }, + "link": 74 + }, + { + "localized_name": "force_projection", + "name": "force_projection", + "type": "BOOLEAN", + "widget": { + "name": "force_projection" + }, + "link": null + }, + { + "localized_name": "apply_mask", + "name": "apply_mask", + "type": "BOOLEAN", + "widget": { + "name": "apply_mask" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "moge_geometry", + "name": "moge_geometry", + "type": "MOGE_GEOMETRY", + "links": [ + 35, + 39, + 61 + ] + } + ], + "properties": { + "Node name for S&R": "MoGeInference", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.21.1" + }, + "widgets_values": [ + 9, + 0, + 4, + true, + true + ] + }, + { + "id": 23, + "type": "MoGeRender", + 
"pos": [ + -3430, + 4870 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "moge_geometry", + "name": "moge_geometry", + "type": "MOGE_GEOMETRY", + "link": 35 + }, + { + "localized_name": "output", + "name": "output", + "type": "COMBO", + "widget": { + "name": "output" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 36 + ] + } + ], + "properties": { + "Node name for S&R": "MoGeRender", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.21.1" + }, + "widgets_values": [ + "depth_colored" + ] + }, + { + "id": 25, + "type": "MoGeRender", + "pos": [ + -3430, + 5030 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "moge_geometry", + "name": "moge_geometry", + "type": "MOGE_GEOMETRY", + "link": 39 + }, + { + "localized_name": "output", + "name": "output", + "type": "COMBO", + "widget": { + "name": "output" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 40 + ] + } + ], + "properties": { + "Node name for S&R": "MoGeRender", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.21.1" + }, + "widgets_values": [ + "depth" + ] + }, + { + "id": 32, + "type": "LoadMoGeModel", + "pos": [ + -4180, + 4880 + ], + "size": [ + 270, + 140 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "model_name", + "name": "model_name", + "type": "COMBO", + "widget": { + "name": "model_name" + }, + "link": 79 + } + ], + "outputs": [ + { + "localized_name": "MOGE_MODEL", + 
"name": "MOGE_MODEL", + "type": "MOGE_MODEL", + "links": [ + 58 + ] + } + ], + "properties": { + "Node name for S&R": "LoadMoGeModel", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.21.1", + "models": [ + { + "name": "moge_2_vitl_normal_fp16.safetensors", + "url": "https://huggingface.co/Comfy-Org/MoGe/resolve/main/geometry_estimation/moge_2_vitl_normal_fp16.safetensors", + "directory": "geometry_estimation" + } + ] + }, + "widgets_values": [ + "moge_2_vitl_normal_fp16.safetensors" + ] + }, + { + "id": 36, + "type": "ComfyMathExpression", + "pos": [ + -4720, + 4910 + ], + "size": [ + 400, + 200 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "label": "a", + "localized_name": "values.a", + "name": "values.a", + "type": "FLOAT,INT,BOOLEAN", + "link": 49 + }, + { + "label": "b", + "localized_name": "values.b", + "name": "values.b", + "shape": 7, + "type": "FLOAT,INT,BOOLEAN", + "link": null + }, + { + "localized_name": "expression", + "name": "expression", + "type": "STRING", + "widget": { + "name": "expression" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "links": null + }, + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": null + }, + { + "localized_name": "BOOL", + "name": "BOOL", + "type": "BOOLEAN", + "links": [ + 53 + ] + } + ], + "properties": { + "Node name for S&R": "ComfyMathExpression", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.21.1" + }, + "widgets_values": [ + "a > 2048" + ] + }, + { + "id": 37, + "type": "GetImageSize", + "pos": [ + -4980, + 4910 + ], + "size": [ + 230, + 160 + ], + "flags": {}, + "order": 5, + "mode": 0, + 
"inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 92 + } + ], + "outputs": [ + { + "localized_name": "width", + "name": "width", + "type": "INT", + "links": [ + 49 + ] + }, + { + "localized_name": "height", + "name": "height", + "type": "INT", + "links": null + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "links": null + } + ], + "properties": { + "Node name for S&R": "GetImageSize", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.21.1" + } + }, + { + "id": 40, + "type": "ResizeImagesByLongerEdge", + "pos": [ + -4650, + 5210 + ], + "size": [ + 310, + 110 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "link": 93 + }, + { + "localized_name": "longer_edge", + "name": "longer_edge", + "type": "INT", + "widget": { + "name": "longer_edge" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "links": [ + 54 + ] + } + ], + "properties": { + "Node name for S&R": "ResizeImagesByLongerEdge", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.21.1" + }, + "widgets_values": [ + 2048 + ] + }, + { + "id": 42, + "type": "ComfySwitchNode", + "pos": [ + -4180, + 5060 + ], + "size": [ + 270, + 130 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "localized_name": "on_false", + "name": "on_false", + "type": "*", + "link": 94 + }, + { + "localized_name": "on_true", + "name": "on_true", + "type": "*", + "link": 54 + }, + { + "localized_name": "switch", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + 
"link": 53 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "*", + "links": [ + 80 + ] + } + ], + "properties": { + "Node name for S&R": "ComfySwitchNode", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.21.1" + }, + "widgets_values": [ + false + ] + }, + { + "id": 45, + "type": "MoGeRender", + "pos": [ + -3430, + 5200 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "localized_name": "moge_geometry", + "name": "moge_geometry", + "type": "MOGE_GEOMETRY", + "link": 61 + }, + { + "localized_name": "output", + "name": "output", + "type": "COMBO", + "widget": { + "name": "output" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 85 + ] + } + ], + "properties": { + "Node name for S&R": "MoGeRender", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.21.1" + }, + "widgets_values": [ + "mask" + ] + }, + { + "id": 53, + "type": "ComfySwitchNode", + "pos": [ + -4160, + 5340 + ], + "size": [ + 270, + 130 + ], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "localized_name": "on_false", + "name": "on_false", + "type": "*", + "link": 95 + }, + { + "localized_name": "on_true", + "name": "on_true", + "type": "*", + "link": 80 + }, + { + "localized_name": "switch", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": 83 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "*", + "links": [ + 81 + ] + } + ], + "properties": { + "Node name for S&R": "ComfySwitchNode", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + 
"hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.21.1" + }, + "widgets_values": [ + true + ] + }, + { + "id": 68, + "type": "ImageToMask", + "pos": [ + -3420, + 5360 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 85 + }, + { + "localized_name": "channel", + "name": "channel", + "type": "COMBO", + "widget": { + "name": "channel" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "MASK", + "name": "MASK", + "type": "MASK", + "links": [ + 86 + ] + } + ], + "properties": { + "Node name for S&R": "ImageToMask", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.22.0" + }, + "widgets_values": [ + "red" + ] + }, + { + "id": 70, + "type": "GetVideoComponents", + "pos": [ + -4920, + 5490 + ], + "size": [ + 230, + 120 + ], + "flags": {}, + "order": 11, + "mode": 0, + "inputs": [ + { + "localized_name": "video", + "name": "video", + "type": "VIDEO", + "link": 91 + } + ], + "outputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "links": [ + 92, + 93, + 94, + 95 + ] + }, + { + "localized_name": "audio", + "name": "audio", + "type": "AUDIO", + "links": null + }, + { + "localized_name": "fps", + "name": "fps", + "type": "FLOAT", + "links": null + } + ], + "properties": { + "Node name for S&R": "GetVideoComponents", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.22.0" + } + } + ], + "groups": [ + { + "id": 1, + "title": "auto_resize_if_width_gt_2048", + "bounding": [ + -5000, + 4840, + 690, + 280 + ], + 
"color": "#3f789e", + "flags": {} + } + ], + "links": [ + { + "id": 58, + "origin_id": 32, + "origin_slot": 0, + "target_id": 13, + "target_slot": 0, + "type": "MOGE_MODEL" + }, + { + "id": 35, + "origin_id": 13, + "origin_slot": 0, + "target_id": 23, + "target_slot": 0, + "type": "MOGE_GEOMETRY" + }, + { + "id": 39, + "origin_id": 13, + "origin_slot": 0, + "target_id": 25, + "target_slot": 0, + "type": "MOGE_GEOMETRY" + }, + { + "id": 49, + "origin_id": 37, + "origin_slot": 0, + "target_id": 36, + "target_slot": 0, + "type": "INT" + }, + { + "id": 54, + "origin_id": 40, + "origin_slot": 0, + "target_id": 42, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 53, + "origin_id": 36, + "origin_slot": 2, + "target_id": 42, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 61, + "origin_id": 13, + "origin_slot": 0, + "target_id": 45, + "target_slot": 0, + "type": "MOGE_GEOMETRY" + }, + { + "id": 36, + "origin_id": 23, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 40, + "origin_id": 25, + "origin_slot": 0, + "target_id": -20, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 73, + "origin_id": -10, + "origin_slot": 0, + "target_id": 13, + "target_slot": 2, + "type": "INT" + }, + { + "id": 74, + "origin_id": -10, + "origin_slot": 1, + "target_id": 13, + "target_slot": 4, + "type": "INT" + }, + { + "id": 79, + "origin_id": -10, + "origin_slot": 2, + "target_id": 32, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 80, + "origin_id": 42, + "origin_slot": 0, + "target_id": 53, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 81, + "origin_id": 53, + "origin_slot": 0, + "target_id": 13, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 83, + "origin_id": -10, + "origin_slot": 3, + "target_id": 53, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 85, + "origin_id": 45, + "origin_slot": 0, + "target_id": 68, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 86, + "origin_id": 68, + 
"origin_slot": 0, + "target_id": -20, + "target_slot": 2, + "type": "MASK" + }, + { + "id": 91, + "origin_id": -10, + "origin_slot": 4, + "target_id": 70, + "target_slot": 0, + "type": "VIDEO" + }, + { + "id": 92, + "origin_id": 70, + "origin_slot": 0, + "target_id": 37, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 93, + "origin_id": 70, + "origin_slot": 0, + "target_id": 40, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 94, + "origin_id": 70, + "origin_slot": 0, + "target_id": 42, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 95, + "origin_id": 70, + "origin_slot": 0, + "target_id": 53, + "target_slot": 0, + "type": "IMAGE" + } + ], + "extra": {}, + "category": "Conditioning & Preprocessors/Depth", + "description": "Estimates monocular depth from an input video using MoGe, outputting both raw and colorized depth maps plus a mask." + } + ] + }, + "extra": {} +} \ No newline at end of file diff --git a/blueprints/Video Face Detection (Mediapipe).json b/blueprints/Video Face Detection (Mediapipe).json new file mode 100644 index 000000000..c70352481 --- /dev/null +++ b/blueprints/Video Face Detection (Mediapipe).json @@ -0,0 +1,1109 @@ +{ + "revision": 0, + "last_node_id": 167, + "last_link_id": 0, + "nodes": [ + { + "id": 167, + "type": "ca14b151-8f5e-4386-aab7-d2ec84eaf43c", + "pos": [ + -3410, + 6100 + ], + "size": [ + 420, + 481.3125 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "name": "video", + "type": "VIDEO", + "link": null + }, + { + "label": "trim_audio", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": null + }, + { + "name": "start_time", + "type": "FLOAT", + "widget": { + "name": "start_time" + }, + "link": null + }, + { + "name": "duration", + "type": "FLOAT", + "widget": { + "name": "duration" + }, + "link": null + }, + { + "label": "face_landmarker", + "name": "face_landmarker_1", + "type": "FACE_LANDMARKER", + "link": null + }, + { + "label": "detector_variant", 
+ "name": "detector_variant_1", + "type": "COMBO", + "widget": { + "name": "detector_variant_1" + }, + "link": null + }, + { + "label": "num_faces", + "name": "num_faces_1", + "type": "INT", + "widget": { + "name": "num_faces_1" + }, + "link": null + }, + { + "label": "face_oval", + "name": "regions.face_oval", + "type": "BOOLEAN", + "widget": { + "name": "regions.face_oval" + }, + "link": null + }, + { + "label": "face_lips", + "name": "regions.lips", + "type": "BOOLEAN", + "widget": { + "name": "regions.lips" + }, + "link": null + }, + { + "label": "left_eye", + "name": "regions.left_eye", + "type": "BOOLEAN", + "widget": { + "name": "regions.left_eye" + }, + "link": null + }, + { + "label": "right_eye", + "name": "regions.right_eye_1", + "type": "BOOLEAN", + "widget": { + "name": "regions.right_eye_1" + }, + "link": null + }, + { + "label": "irises", + "name": "regions.irises_1", + "type": "BOOLEAN", + "widget": { + "name": "regions.irises_1" + }, + "link": null + }, + { + "name": "model_name", + "type": "COMBO", + "widget": { + "name": "model_name" + }, + "link": null + } + ], + "outputs": [ + { + "label": "mask", + "name": "MASK_1", + "type": "MASK", + "links": [] + }, + { + "label": "bboxes", + "name": "bboxes_1", + "type": "BOUNDING_BOX", + "links": null + }, + { + "name": "face_landmarks", + "type": "FACE_LANDMARKS", + "links": null + } + ], + "title": "Video Face Detection (Mediapipe)", + "properties": { + "proxyWidgets": [ + [ + "165", + "switch" + ], + [ + "164", + "start_time" + ], + [ + "164", + "duration" + ], + [ + "11", + "detector_variant" + ], + [ + "11", + "num_faces" + ], + [ + "20", + "regions.face_oval" + ], + [ + "20", + "regions.lips" + ], + [ + "20", + "regions.left_eye" + ], + [ + "20", + "regions.right_eye" + ], + [ + "20", + "regions.irises" + ], + [ + "2", + "model_name" + ] + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + 
"secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.22.0" + }, + "widgets_values": [] + } + ], + "links": [], + "version": 0.4, + "definitions": { + "subgraphs": [ + { + "id": "ca14b151-8f5e-4386-aab7-d2ec84eaf43c", + "version": 1, + "state": { + "lastGroupId": 2, + "lastNodeId": 167, + "lastLinkId": 168, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Video Face Detection (Mediapipe)", + "description": "Detects facial landmarks from a video using MediaPipe, outputting landmark data, face bounding boxes, and an optional face-region mask.", + "inputNode": { + "id": -10, + "bounding": [ + -1060, + 4350, + 142.587890625, + 308 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + 470, + 4460, + 137.677734375, + 108 + ] + }, + "inputs": [ + { + "id": "16e5a20f-22bc-4960-a67b-e32c64409c49", + "name": "video", + "type": "VIDEO", + "linkIds": [ + 150, + 153 + ], + "pos": [ + -941.412109375, + 4374 + ] + }, + { + "id": "cc7fc7d4-24ec-4c00-878e-1af1b6809b4b", + "name": "switch", + "type": "BOOLEAN", + "linkIds": [ + 154 + ], + "label": "trim_audio", + "pos": [ + -941.412109375, + 4394 + ] + }, + { + "id": "efa9ab9f-ca70-449c-be43-5ca60c7f0d59", + "name": "start_time", + "type": "FLOAT", + "linkIds": [ + 155 + ], + "pos": [ + -941.412109375, + 4414 + ] + }, + { + "id": "45050127-4089-4b85-bf81-73b725196c2e", + "name": "duration", + "type": "FLOAT", + "linkIds": [ + 156 + ], + "pos": [ + -941.412109375, + 4434 + ] + }, + { + "id": "239fcd3b-6324-4824-8255-98199ae58914", + "name": "face_landmarker_1", + "type": "FACE_LANDMARKER", + "linkIds": [ + 157 + ], + "label": "face_landmarker", + "pos": [ + -941.412109375, + 4454 + ] + }, + { + "id": "f79f67b9-5bcb-4cab-9101-8b9dee461bca", + "name": "detector_variant_1", + "type": "COMBO", + "linkIds": [ + 158 + ], + "label": "detector_variant", + "pos": [ + -941.412109375, + 4474 + ] + }, + { + "id": "3369790b-e730-41bf-b5b2-dc1f5fafbe11", + "name": "num_faces_1", + "type": "INT", + "linkIds": [ + 159 
+ ], + "label": "num_faces", + "pos": [ + -941.412109375, + 4494 + ] + }, + { + "id": "964f6b5f-44ac-456e-ba3a-a3039dfe0729", + "name": "regions.face_oval", + "type": "BOOLEAN", + "linkIds": [ + 160 + ], + "label": "face_oval", + "pos": [ + -941.412109375, + 4514 + ] + }, + { + "id": "d6e89b51-65a2-4f37-a561-8cec3a5040fd", + "name": "regions.lips", + "type": "BOOLEAN", + "linkIds": [ + 161 + ], + "label": "face_lips", + "pos": [ + -941.412109375, + 4534 + ] + }, + { + "id": "49f02319-ea4a-4a69-88f8-589d2ef7c97a", + "name": "regions.left_eye", + "type": "BOOLEAN", + "linkIds": [ + 162 + ], + "label": "left_eye", + "pos": [ + -941.412109375, + 4554 + ] + }, + { + "id": "89179a19-aca6-4469-a0b9-2a4bd21bceea", + "name": "regions.right_eye_1", + "type": "BOOLEAN", + "linkIds": [ + 163 + ], + "label": "right_eye", + "pos": [ + -941.412109375, + 4574 + ] + }, + { + "id": "f5667690-24b5-4df9-9210-b8610c68ff5f", + "name": "regions.irises_1", + "type": "BOOLEAN", + "linkIds": [ + 164 + ], + "label": "irises", + "pos": [ + -941.412109375, + 4594 + ] + }, + { + "id": "66c805f6-6ccd-41f9-8a77-fc934b7f4713", + "name": "model_name", + "type": "COMBO", + "linkIds": [ + 165 + ], + "pos": [ + -941.412109375, + 4614 + ] + } + ], + "outputs": [ + { + "id": "f6309e1d-6397-4363-b38f-778a122abc51", + "name": "MASK_1", + "type": "MASK", + "linkIds": [ + 83 + ], + "label": "mask", + "pos": [ + 494, + 4484 + ] + }, + { + "id": "59669f0a-b4b2-49d1-85f8-fc2a88059b1a", + "name": "bboxes_1", + "type": "BOUNDING_BOX", + "linkIds": [ + 166 + ], + "label": "bboxes", + "pos": [ + 494, + 4504 + ] + }, + { + "id": "57f66731-e106-4f8b-a0a0-aed3c620b37b", + "name": "face_landmarks", + "type": "FACE_LANDMARKS", + "linkIds": [ + 167 + ], + "pos": [ + 494, + 4524 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 11, + "type": "MediaPipeFaceLandmarker", + "pos": [ + -60, + 4380 + ], + "size": [ + 350, + 220 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": 
"face_detection_model", + "name": "face_detection_model", + "type": "FACE_DETECTION_MODEL", + "link": 66 + }, + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 149 + }, + { + "localized_name": "detector_variant", + "name": "detector_variant", + "type": "COMBO", + "widget": { + "name": "detector_variant" + }, + "link": 158 + }, + { + "localized_name": "num_faces", + "name": "num_faces", + "type": "INT", + "widget": { + "name": "num_faces" + }, + "link": 159 + }, + { + "localized_name": "min_confidence", + "name": "min_confidence", + "type": "FLOAT", + "widget": { + "name": "min_confidence" + }, + "link": null + }, + { + "localized_name": "missing_frame_fallback", + "name": "missing_frame_fallback", + "type": "COMBO", + "widget": { + "name": "missing_frame_fallback" + }, + "link": null + }, + { + "name": "face_landmarker", + "type": "FACE_LANDMARKER", + "link": 157 + } + ], + "outputs": [ + { + "localized_name": "face_landmarks", + "name": "face_landmarks", + "type": "FACE_LANDMARKS", + "links": [ + 46, + 167 + ] + }, + { + "localized_name": "bboxes", + "name": "bboxes", + "type": "BOUNDING_BOX", + "links": [ + 166 + ] + } + ], + "properties": { + "Node name for S&R": "MediaPipeFaceLandmarker", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.22.0" + }, + "widgets_values": [ + "full", + 0, + 0.5, + "empty" + ] + }, + { + "id": 2, + "type": "LoadMediaPipeFaceLandmarker", + "pos": [ + -70, + 4160 + ], + "size": [ + 350, + 140 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "model_name", + "name": "model_name", + "type": "COMBO", + "widget": { + "name": "model_name" + }, + "link": 165 + } + ], + "outputs": [ + { + "localized_name": "FACE_DETECTION_MODEL", + "name": "FACE_DETECTION_MODEL", + "type": "FACE_DETECTION_MODEL", + "links": [ + 66 + ] + } + 
], + "properties": { + "Node name for S&R": "LoadMediaPipeFaceLandmarker", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.22.0", + "models": [ + { + "name": "mediapipe_face_fp32.safetensors", + "url": "https://huggingface.co/Comfy-Org/mediapipe/resolve/main/detection/mediapipe_face_fp32.safetensors", + "directory": "detection" + } + ] + }, + "widgets_values": [ + "mediapipe_face_fp32.safetensors" + ] + }, + { + "id": 20, + "type": "MediaPipeFaceMask", + "pos": [ + -70, + 4660 + ], + "size": [ + 360, + 180 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "face_landmarks", + "name": "face_landmarks", + "type": "FACE_LANDMARKS", + "link": 46 + }, + { + "localized_name": "regions", + "name": "regions", + "type": "COMFY_DYNAMICCOMBO_V3", + "widget": { + "name": "regions" + }, + "link": null + }, + { + "localized_name": "regions.face_oval", + "name": "regions.face_oval", + "type": "BOOLEAN", + "widget": { + "name": "regions.face_oval" + }, + "link": 160 + }, + { + "localized_name": "regions.lips", + "name": "regions.lips", + "type": "BOOLEAN", + "widget": { + "name": "regions.lips" + }, + "link": 161 + }, + { + "localized_name": "regions.left_eye", + "name": "regions.left_eye", + "type": "BOOLEAN", + "widget": { + "name": "regions.left_eye" + }, + "link": 162 + }, + { + "localized_name": "regions.right_eye", + "name": "regions.right_eye", + "type": "BOOLEAN", + "widget": { + "name": "regions.right_eye" + }, + "link": 163 + }, + { + "localized_name": "regions.irises", + "name": "regions.irises", + "type": "BOOLEAN", + "widget": { + "name": "regions.irises" + }, + "link": 164 + } + ], + "outputs": [ + { + "localized_name": "MASK", + "name": "MASK", + "type": "MASK", + "links": [ + 83 + ] + } + ], + "properties": { + "Node name for S&R": "MediaPipeFaceMask", + "enableTabs": false, + 
"tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.22.0" + }, + "widgets_values": [ + "custom", + true, + false, + false, + false, + false + ] + }, + { + "id": 160, + "type": "GetVideoComponents", + "pos": [ + -420, + 4360 + ], + "size": [ + 230, + 120 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "video", + "name": "video", + "type": "VIDEO", + "link": 152 + } + ], + "outputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "links": [ + 149 + ] + }, + { + "localized_name": "audio", + "name": "audio", + "type": "AUDIO", + "links": null + }, + { + "localized_name": "fps", + "name": "fps", + "type": "FLOAT", + "links": null + } + ], + "properties": { + "Node name for S&R": "GetVideoComponents", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.22.0" + } + }, + { + "id": 164, + "type": "Video Slice", + "pos": [ + -780, + 4330 + ], + "size": [ + 270, + 170 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "localized_name": "video", + "name": "video", + "type": "VIDEO", + "link": 150 + }, + { + "localized_name": "start_time", + "name": "start_time", + "type": "FLOAT", + "widget": { + "name": "start_time" + }, + "link": 155 + }, + { + "localized_name": "duration", + "name": "duration", + "type": "FLOAT", + "widget": { + "name": "duration" + }, + "link": 156 + }, + { + "localized_name": "strict_duration", + "name": "strict_duration", + "type": "BOOLEAN", + "widget": { + "name": "strict_duration" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "VIDEO", + "name": "VIDEO", + "type": "VIDEO", + "links": [ + 151 + ] + } + ], + "properties": { + "Node name for S&R": "Video Slice", + 
"enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.22.0" + }, + "widgets_values": [ + 0, + 0, + false + ] + }, + { + "id": 165, + "type": "ComfySwitchNode", + "pos": [ + -420, + 4590 + ], + "size": [ + 270, + 130 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "localized_name": "on_false", + "name": "on_false", + "type": "*", + "link": 153 + }, + { + "localized_name": "on_true", + "name": "on_true", + "type": "*", + "link": 151 + }, + { + "localized_name": "switch", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": 154 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "*", + "links": [ + 152 + ] + } + ], + "properties": { + "Node name for S&R": "ComfySwitchNode", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.22.0" + }, + "widgets_values": [ + false + ] + } + ], + "groups": [], + "links": [ + { + "id": 66, + "origin_id": 2, + "origin_slot": 0, + "target_id": 11, + "target_slot": 0, + "type": "FACE_DETECTION_MODEL" + }, + { + "id": 46, + "origin_id": 11, + "origin_slot": 0, + "target_id": 20, + "target_slot": 0, + "type": "FACE_LANDMARKS" + }, + { + "id": 83, + "origin_id": 20, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "MASK" + }, + { + "id": 149, + "origin_id": 160, + "origin_slot": 0, + "target_id": 11, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 150, + "origin_id": -10, + "origin_slot": 0, + "target_id": 164, + "target_slot": 0, + "type": "VIDEO" + }, + { + "id": 151, + "origin_id": 164, + "origin_slot": 0, + "target_id": 165, + "target_slot": 1, + "type": "VIDEO" + }, + { + "id": 152, + "origin_id": 165, + "origin_slot": 0, + 
"target_id": 160, + "target_slot": 0, + "type": "VIDEO" + }, + { + "id": 153, + "origin_id": -10, + "origin_slot": 0, + "target_id": 165, + "target_slot": 0, + "type": "VIDEO" + }, + { + "id": 154, + "origin_id": -10, + "origin_slot": 1, + "target_id": 165, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 155, + "origin_id": -10, + "origin_slot": 2, + "target_id": 164, + "target_slot": 1, + "type": "FLOAT" + }, + { + "id": 156, + "origin_id": -10, + "origin_slot": 3, + "target_id": 164, + "target_slot": 2, + "type": "FLOAT" + }, + { + "id": 157, + "origin_id": -10, + "origin_slot": 4, + "target_id": 11, + "target_slot": 6, + "type": "FACE_LANDMARKER" + }, + { + "id": 158, + "origin_id": -10, + "origin_slot": 5, + "target_id": 11, + "target_slot": 2, + "type": "COMBO" + }, + { + "id": 159, + "origin_id": -10, + "origin_slot": 6, + "target_id": 11, + "target_slot": 3, + "type": "INT" + }, + { + "id": 160, + "origin_id": -10, + "origin_slot": 7, + "target_id": 20, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 161, + "origin_id": -10, + "origin_slot": 8, + "target_id": 20, + "target_slot": 3, + "type": "BOOLEAN" + }, + { + "id": 162, + "origin_id": -10, + "origin_slot": 9, + "target_id": 20, + "target_slot": 4, + "type": "BOOLEAN" + }, + { + "id": 163, + "origin_id": -10, + "origin_slot": 10, + "target_id": 20, + "target_slot": 5, + "type": "BOOLEAN" + }, + { + "id": 164, + "origin_id": -10, + "origin_slot": 11, + "target_id": 20, + "target_slot": 6, + "type": "BOOLEAN" + }, + { + "id": 165, + "origin_id": -10, + "origin_slot": 12, + "target_id": 2, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 166, + "origin_id": 11, + "origin_slot": 1, + "target_id": -20, + "target_slot": 1, + "type": "BOUNDING_BOX" + }, + { + "id": 167, + "origin_id": 11, + "origin_slot": 0, + "target_id": -20, + "target_slot": 2, + "type": "FACE_LANDMARKS" + } + ], + "extra": {}, + "category": "Conditioning & Preprocessors/Face Detection" + } + ] + }, + "extra": {} +} \ 
No newline at end of file diff --git a/blueprints/Video Inpaint (VOID).json b/blueprints/Video Inpaint (VOID).json new file mode 100644 index 000000000..a7cc806b5 --- /dev/null +++ b/blueprints/Video Inpaint (VOID).json @@ -0,0 +1,4340 @@ +{ + "revision": 0, + "last_node_id": 167, + "last_link_id": 0, + "nodes": [ + { + "id": 167, + "type": "c3157b75-484a-459e-b8de-57823bef5130", + "pos": [ + -430, + 690 + ], + "size": [ + 590, + 723.9375 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "label": "Source video", + "localized_name": "source_video", + "name": "source_video", + "type": "VIDEO", + "link": null + }, + { + "label": "Positive prompt (inpaint fill)", + "localized_name": "positive_prompt", + "name": "positive_prompt", + "type": "STRING", + "widget": { + "name": "positive_prompt" + }, + "link": null + }, + { + "label": "Negative prompt", + "localized_name": "negative_prompt", + "name": "negative_prompt", + "type": "STRING", + "widget": { + "name": "negative_prompt" + }, + "link": null + }, + { + "label": "SAM3 object mask prompt", + "localized_name": "sam3_text_prompt", + "name": "sam3_text_prompt", + "type": "STRING", + "widget": { + "name": "sam3_text_prompt" + }, + "link": null + }, + { + "label": "Start frame index", + "localized_name": "start_frame_index", + "name": "start_frame_index", + "type": "INT", + "widget": { + "name": "start_frame_index" + }, + "link": null + }, + { + "label": "Clip duration (seconds)", + "localized_name": "duration_seconds", + "name": "duration_seconds", + "type": "INT", + "widget": { + "name": "duration_seconds" + }, + "link": null + }, + { + "label": "Width (pass 2)", + "localized_name": "latent_width", + "name": "latent_width", + "type": "INT", + "widget": { + "name": "latent_width" + }, + "link": null + }, + { + "label": "Height (pass 2)", + "localized_name": "latent_height", + "name": "latent_height", + "type": "INT", + "widget": { + "name": "latent_height" + }, + "link": null + }, + { + "label": "Skip 
pass 2 (reuse pass 1)", + "localized_name": "skip_pass_2", + "name": "skip_pass_2", + "type": "BOOLEAN", + "widget": { + "name": "skip_pass_2" + }, + "link": null + }, + { + "label": "Noise seed", + "localized_name": "noise_seed", + "name": "noise_seed", + "type": "INT", + "widget": { + "name": "noise_seed" + }, + "link": null + }, + { + "label": "SAM3 checkpoint", + "localized_name": "sam3_checkpoint", + "name": "sam3_checkpoint", + "type": "COMBO", + "widget": { + "name": "sam3_checkpoint" + }, + "link": null + }, + { + "label": "VOID UNet — pass 1", + "localized_name": "void_unet_pass1", + "name": "void_unet_pass1", + "type": "COMBO", + "widget": { + "name": "void_unet_pass1" + }, + "link": null + }, + { + "label": "VOID UNet — pass 2", + "localized_name": "void_unet_pass2", + "name": "void_unet_pass2", + "type": "COMBO", + "widget": { + "name": "void_unet_pass2" + }, + "link": null + }, + { + "label": "Optical flow model", + "localized_name": "optical_flow_model", + "name": "optical_flow_model", + "type": "COMBO", + "widget": { + "name": "optical_flow_model" + }, + "link": null + }, + { + "label": "CLIP / T5 weights", + "localized_name": "clip_name", + "name": "clip_name", + "type": "COMBO", + "widget": { + "name": "clip_name" + }, + "link": null + }, + { + "label": "VAE weights", + "localized_name": "vae_name", + "name": "vae_name", + "type": "COMBO", + "widget": { + "name": "vae_name" + }, + "link": null + } + ], + "outputs": [ + { + "label": "Pass 1 (intermediate)", + "localized_name": "pass_1_video", + "name": "pass_1_video", + "type": "VIDEO", + "links": [] + }, + { + "label": "Pass 2 (final)", + "localized_name": "final_pass_2_video", + "name": "final_pass_2_video", + "type": "VIDEO", + "links": [] + } + ], + "properties": { + "proxyWidgets": [ + [ + "6", + "text" + ], + [ + "7", + "text" + ], + [ + "149", + "text" + ], + [ + "168", + "value" + ], + [ + "163", + "value" + ], + [ + "147", + "value" + ], + [ + "148", + "value" + ], + [ + "153", + "value" + 
], + [ + "141", + "noise_seed" + ], + [ + "149", + "ckpt_name" + ], + [ + "144", + "unet_name" + ], + [ + "143", + "unet_name" + ], + [ + "142", + "model_name" + ], + [ + "2", + "clip_name" + ], + [ + "3", + "vae_name" + ] + ] + }, + "widgets_values": [], + "title": "Video Inpaint (VOID)" + } + ], + "links": [], + "version": 0.4, + "definitions": { + "subgraphs": [ + { + "id": "c3157b75-484a-459e-b8de-57823bef5130", + "version": 1, + "state": { + "lastGroupId": 13, + "lastNodeId": 171, + "lastLinkId": 406, + "lastRerouteId": 0 + }, + "revision": 5, + "config": {}, + "name": "Video Inpaint (VOID)", + "inputNode": { + "id": -10, + "bounding": [ + -1530, + 800, + 203.1796875, + 368 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + 2030, + 710, + 166.130859375, + 88 + ] + }, + "inputs": [ + { + "id": "1865ea29-14b1-4471-b5e0-d35bba595b9c", + "name": "source_video", + "type": "VIDEO", + "linkIds": [ + 373 + ], + "localized_name": "source_video", + "label": "Source video", + "pos": [ + -1350.8203125, + 824 + ] + }, + { + "id": "f1b2b2c4-bc2e-4e72-b16c-7e560e58d2d6", + "name": "positive_prompt", + "type": "STRING", + "linkIds": [ + 377 + ], + "localized_name": "positive_prompt", + "label": "Positive prompt (inpaint fill)", + "pos": [ + -1350.8203125, + 844 + ] + }, + { + "id": "931ac4dd-3cb6-4555-a1f0-619be81d64f6", + "name": "negative_prompt", + "type": "STRING", + "linkIds": [ + 387 + ], + "localized_name": "negative_prompt", + "label": "Negative prompt", + "pos": [ + -1350.8203125, + 864 + ] + }, + { + "id": "7a0963c3-bf2f-464d-80c2-6a6c90569883", + "name": "sam3_text_prompt", + "type": "STRING", + "linkIds": [ + 388 + ], + "localized_name": "sam3_text_prompt", + "label": "SAM3 object mask prompt", + "pos": [ + -1350.8203125, + 884 + ] + }, + { + "id": "f53f340f-2031-401d-b613-157622ef336f", + "name": "start_frame_index", + "type": "INT", + "linkIds": [ + 389 + ], + "localized_name": "start_frame_index", + "label": "Start frame index", + "pos": [ + 
-1350.8203125, + 904 + ] + }, + { + "id": "d5b8704b-7c8c-4cf0-87cd-26b293f65f83", + "name": "duration_seconds", + "type": "INT", + "linkIds": [ + 390 + ], + "localized_name": "duration_seconds", + "label": "Clip duration (seconds)", + "pos": [ + -1350.8203125, + 924 + ] + }, + { + "id": "7140209f-5058-4933-ae06-438256f77f23", + "name": "latent_width", + "type": "INT", + "linkIds": [ + 391 + ], + "localized_name": "latent_width", + "label": "Width (pass 2)", + "pos": [ + -1350.8203125, + 944 + ] + }, + { + "id": "084a140a-6fa9-4676-9483-ad30e0b14947", + "name": "latent_height", + "type": "INT", + "linkIds": [ + 392 + ], + "localized_name": "latent_height", + "label": "Height (pass 2)", + "pos": [ + -1350.8203125, + 964 + ] + }, + { + "id": "a8109321-e101-4ed8-b6f3-8ad1c815f35c", + "name": "skip_pass_2", + "type": "BOOLEAN", + "linkIds": [ + 393 + ], + "localized_name": "skip_pass_2", + "label": "Skip pass 2 (reuse pass 1)", + "pos": [ + -1350.8203125, + 984 + ] + }, + { + "id": "6964ab42-0662-47f2-9c2a-96782fdcb883", + "name": "noise_seed", + "type": "INT", + "linkIds": [ + 400 + ], + "localized_name": "noise_seed", + "label": "Noise seed", + "pos": [ + -1350.8203125, + 1004 + ] + }, + { + "id": "dccde360-461d-417e-b3f5-e1a4d6cece39", + "name": "sam3_checkpoint", + "type": "COMBO", + "linkIds": [ + 401 + ], + "localized_name": "sam3_checkpoint", + "label": "SAM3 checkpoint", + "pos": [ + -1350.8203125, + 1024 + ] + }, + { + "id": "5ce0d036-be08-4539-9ec6-e923fcdb8825", + "name": "void_unet_pass1", + "type": "COMBO", + "linkIds": [ + 402 + ], + "localized_name": "void_unet_pass1", + "label": "VOID UNet — pass 1", + "pos": [ + -1350.8203125, + 1044 + ] + }, + { + "id": "c1de695a-a08a-40bc-b9e4-d156fef73cd0", + "name": "void_unet_pass2", + "type": "COMBO", + "linkIds": [ + 403 + ], + "localized_name": "void_unet_pass2", + "label": "VOID UNet — pass 2", + "pos": [ + -1350.8203125, + 1064 + ] + }, + { + "id": "99da50bc-db57-4a21-9831-0f77b3c4fe99", + "name": 
"optical_flow_model", + "type": "COMBO", + "linkIds": [ + 404 + ], + "localized_name": "optical_flow_model", + "label": "Optical flow model", + "pos": [ + -1350.8203125, + 1084 + ] + }, + { + "id": "c756ce20-cfa6-4fe0-9eb0-543d56781cb7", + "name": "clip_name", + "type": "COMBO", + "linkIds": [ + 405 + ], + "localized_name": "clip_name", + "label": "CLIP / T5 weights", + "pos": [ + -1350.8203125, + 1104 + ] + }, + { + "id": "d8eb12ad-a805-42d9-86b4-6f2c2cc5a231", + "name": "vae_name", + "type": "COMBO", + "linkIds": [ + 406 + ], + "localized_name": "vae_name", + "label": "VAE weights", + "pos": [ + -1350.8203125, + 1124 + ] + } + ], + "outputs": [ + { + "id": "a21e83df-8c95-43a3-bd73-feeea67e90cd", + "name": "pass_1_video", + "type": "VIDEO", + "linkIds": [ + 77 + ], + "localized_name": "pass_1_video", + "label": "Pass 1 (intermediate)", + "pos": [ + 2054, + 734 + ] + }, + { + "id": "02c265f3-012f-499f-a4e8-a6d6aaf72885", + "name": "final_pass_2_video", + "type": "VIDEO", + "linkIds": [ + 362 + ], + "localized_name": "final_pass_2_video", + "label": "Pass 2 (final)", + "pos": [ + 2054, + 754 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 2, + "type": "CLIPLoader", + "pos": [ + -710, + 30 + ], + "size": [ + 320, + 150 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "clip_name", + "name": "clip_name", + "type": "COMBO", + "widget": { + "name": "clip_name" + }, + "link": 405 + }, + { + "localized_name": "type", + "name": "type", + "type": "COMBO", + "widget": { + "name": "type" + }, + "link": null + }, + { + "localized_name": "device", + "name": "device", + "shape": 7, + "type": "COMBO", + "widget": { + "name": "device" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "slot_index": 0, + "links": [ + 2, + 3 + ] + } + ], + "properties": { + "Node name for S&R": "CLIPLoader", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "models": [ + { + "name": 
"t5xxl_fp16.safetensors", + "url": "https://huggingface.co/comfyanonymous/flux_text_encoders/resolve/main/t5xxl_fp16.safetensors", + "directory": "text_encoders" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "t5xxl_fp16.safetensors", + "cogvideox", + "default" + ] + }, + { + "id": 3, + "type": "VAELoader", + "pos": [ + -710, + 220 + ], + "size": [ + 320, + 90 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "vae_name", + "name": "vae_name", + "type": "COMBO", + "widget": { + "name": "vae_name" + }, + "link": 406 + } + ], + "outputs": [ + { + "localized_name": "VAE", + "name": "VAE", + "type": "VAE", + "slot_index": 0, + "links": [ + 4, + 45, + 70 + ] + } + ], + "properties": { + "Node name for S&R": "VAELoader", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "models": [ + { + "name": "cogvideox_vae.safetensors", + "url": "https://huggingface.co/Comfy-Org/void-model/resolve/main/vae/cogvideox_vae.safetensors", + "directory": "vae" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "cogvideox_vae.safetensors" + ] + }, + { + "id": 7, + "type": "CLIPTextEncode", + "pos": [ + -260, + 200 + ], + "size": [ + 590, + 180 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "localized_name": "clip", + "name": "clip", + "type": "CLIP", + "link": 3 + }, + { + "localized_name": "text", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + }, + "link": 387 + } + ], + "outputs": [ + { + "localized_name": "CONDITIONING", + "name": "CONDITIONING", + "type": "CONDITIONING", + "slot_index": 0, + "links": [ + 9 + ] + } + ], + "title": "Negative Prompt", + "properties": { + "Node name for S&R": "CLIPTextEncode", + 
"cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "" + ], + "color": "#223", + "bgcolor": "#335" + }, + { + "id": 136, + "type": "CFGGuider", + "pos": [ + 410, + 1640 + ], + "size": [ + 300, + 130 + ], + "flags": {}, + "order": 16, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 322 + }, + { + "localized_name": "positive", + "name": "positive", + "type": "CONDITIONING", + "link": 309 + }, + { + "localized_name": "negative", + "name": "negative", + "type": "CONDITIONING", + "link": 310 + }, + { + "localized_name": "cfg", + "name": "cfg", + "type": "FLOAT", + "widget": { + "name": "cfg" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "GUIDER", + "name": "GUIDER", + "type": "GUIDER", + "links": [ + 311 + ] + } + ], + "title": "CFGGuider (Pass 2 cfg=6)", + "properties": { + "Node name for S&R": "CFGGuider", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 6 + ] + }, + { + "id": 138, + "type": "BasicScheduler", + "pos": [ + 410, + 160 + ], + "size": [ + 270, + 150 + ], + "flags": {}, + "order": 18, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 324 + }, + { + "localized_name": "scheduler", + "name": "scheduler", + "type": "COMBO", + "widget": { + "name": "scheduler" + }, + "link": null + }, + { + "localized_name": "steps", + "name": "steps", + "type": "INT", + "widget": { + "name": "steps" + }, + "link": null + }, + { + "localized_name": "denoise", + "name": "denoise", + "type": "FLOAT", + "widget": { + "name": "denoise" + }, + "link": null + } + ], + 
"outputs": [ + { + "localized_name": "SIGMAS", + "name": "SIGMAS", + "type": "SIGMAS", + "links": [ + 315 + ] + } + ], + "properties": { + "Node name for S&R": "BasicScheduler", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "simple", + 30, + 1 + ] + }, + { + "id": 140, + "type": "CFGGuider", + "pos": [ + 410, + -30 + ], + "size": [ + 270, + 130 + ], + "flags": {}, + "order": 19, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 325 + }, + { + "localized_name": "positive", + "name": "positive", + "type": "CONDITIONING", + "link": 317 + }, + { + "localized_name": "negative", + "name": "negative", + "type": "CONDITIONING", + "link": 318 + }, + { + "localized_name": "cfg", + "name": "cfg", + "type": "FLOAT", + "widget": { + "name": "cfg" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "GUIDER", + "name": "GUIDER", + "type": "GUIDER", + "links": [ + 319 + ] + } + ], + "properties": { + "Node name for S&R": "CFGGuider", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 6 + ] + }, + { + "id": 141, + "type": "RandomNoise", + "pos": [ + 410, + -180 + ], + "size": [ + 270, + 90 + ], + "flags": {}, + "order": 20, + "mode": 0, + "inputs": [ + { + "localized_name": "noise_seed", + "name": "noise_seed", + "type": "INT", + "widget": { + "name": "noise_seed" + }, + "link": 400 + } + ], + "outputs": [ + { + "localized_name": "NOISE", + "name": "NOISE", + "type": "NOISE", + "links": [ + 320 + ] + } + ], + "properties": { + "Node name for S&R": "RandomNoise", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + 
"tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 43, + "fixed" + ] + }, + { + "id": 31, + "type": "VOIDWarpedNoise", + "pos": [ + 410, + 1090 + ], + "size": [ + 300, + 200 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "localized_name": "optical_flow", + "name": "optical_flow", + "type": "OPTICAL_FLOW", + "link": 321 + }, + { + "localized_name": "video", + "name": "video", + "type": "IMAGE", + "link": 72 + }, + { + "localized_name": "width", + "name": "width", + "type": "INT", + "widget": { + "name": "width" + }, + "link": 333 + }, + { + "localized_name": "height", + "name": "height", + "type": "INT", + "widget": { + "name": "height" + }, + "link": 335 + }, + { + "localized_name": "length", + "name": "length", + "type": "INT", + "widget": { + "name": "length" + }, + "link": 67 + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "widget": { + "name": "batch_size" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "warped_noise", + "name": "warped_noise", + "type": "LATENT", + "slot_index": 0, + "links": [ + 53 + ] + } + ], + "title": "Warped Noise (from Pass 1 output)", + "properties": { + "Node name for S&R": "VOIDWarpedNoise", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 672, + 384, + 45, + 1 + ] + }, + { + "id": 35, + "type": "SamplerCustomAdvanced", + "pos": [ + 870, + 1110 + ], + "size": [ + 250, + 170 + ], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "localized_name": "noise", + "name": "noise", + "type": "NOISE", + "link": 54 + }, + { + "localized_name": "guider", + "name": "guider", + "type": "GUIDER", + "link": 311 + }, + { + "localized_name": "sampler", + 
"name": "sampler", + "type": "SAMPLER", + "link": 305 + }, + { + "localized_name": "sigmas", + "name": "sigmas", + "type": "SIGMAS", + "link": 313 + }, + { + "localized_name": "latent_image", + "name": "latent_image", + "type": "LATENT", + "link": 48 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "LATENT", + "slot_index": 0, + "links": [ + 49 + ] + }, + { + "localized_name": "denoised_output", + "name": "denoised_output", + "type": "LATENT", + "slot_index": 1, + "links": [] + } + ], + "title": "Pass 2 Sample", + "properties": { + "Node name for S&R": "SamplerCustomAdvanced", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 132, + "type": "MaskPreview", + "pos": [ + 390, + 560 + ], + "size": [ + 790, + 430 + ], + "flags": {}, + "order": 15, + "mode": 4, + "inputs": [ + { + "localized_name": "mask", + "name": "mask", + "type": "MASK", + "link": 340 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "MaskPreview", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 142, + "type": "OpticalFlowLoader", + "pos": [ + -710, + 410 + ], + "size": [ + 320, + 90 + ], + "flags": {}, + "order": 21, + "mode": 0, + "inputs": [ + { + "localized_name": "model_name", + "name": "model_name", + "type": "COMBO", + "widget": { + "name": "model_name" + }, + "link": 404 + } + ], + "outputs": [ + { + "localized_name": "OPTICAL_FLOW", + "name": "OPTICAL_FLOW", + "type": "OPTICAL_FLOW", + "links": [ + 321 + ] + } + ], + "properties": { + "Node name for S&R": "OpticalFlowLoader", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "models": [ + { + "name": 
"raft_large_C_T_SKHT_V2-ff5fadd5.safetensors", + "url": "https://huggingface.co/Comfy-Org/void-model/resolve/main/optical_flow/raft_large_C_T_SKHT_V2-ff5fadd5.safetensors", + "directory": "optical_flow" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "raft_large_C_T_SKHT_V2-ff5fadd5.safetensors" + ] + }, + { + "id": 10, + "type": "VOIDInpaintConditioning", + "pos": [ + -110, + 430 + ], + "size": [ + 300, + 280 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "localized_name": "positive", + "name": "positive", + "type": "CONDITIONING", + "link": 8 + }, + { + "localized_name": "negative", + "name": "negative", + "type": "CONDITIONING", + "link": 9 + }, + { + "localized_name": "vae", + "name": "vae", + "type": "VAE", + "link": 4 + }, + { + "localized_name": "video", + "name": "video", + "type": "IMAGE", + "link": 326 + }, + { + "localized_name": "quadmask", + "name": "quadmask", + "type": "MASK", + "link": 339 + }, + { + "localized_name": "width", + "name": "width", + "type": "INT", + "widget": { + "name": "width" + }, + "link": 332 + }, + { + "localized_name": "height", + "name": "height", + "type": "INT", + "widget": { + "name": "height" + }, + "link": 334 + }, + { + "localized_name": "length", + "name": "length", + "type": "INT", + "widget": { + "name": "length" + }, + "link": 63 + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "widget": { + "name": "batch_size" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "positive", + "name": "positive", + "type": "CONDITIONING", + "slot_index": 0, + "links": [ + 309, + 317 + ] + }, + { + "localized_name": "negative", + "name": "negative", + "type": "CONDITIONING", + "slot_index": 1, + "links": [ + 310, + 318 + ] + }, + { + "localized_name": "latent", + "name": "latent", + "type": "LATENT", + 
"slot_index": 2, + "links": [ + 48, + 82 + ] + } + ], + "properties": { + "Node name for S&R": "VOIDInpaintConditioning", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 672, + 384, + 45, + 1 + ] + }, + { + "id": 32, + "type": "VOIDWarpedNoiseSource", + "pos": [ + 410, + 1350 + ], + "size": [ + 300, + 50 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "localized_name": "warped_noise", + "name": "warped_noise", + "type": "LATENT", + "link": 53 + } + ], + "outputs": [ + { + "localized_name": "NOISE", + "name": "NOISE", + "type": "NOISE", + "slot_index": 0, + "links": [ + 54 + ] + } + ], + "title": "Warped Noise → NOISE", + "properties": { + "Node name for S&R": "VOIDWarpedNoiseSource", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 137, + "type": "BasicScheduler", + "pos": [ + 410, + 1470 + ], + "size": [ + 300, + 150 + ], + "flags": {}, + "order": 17, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 323 + }, + { + "localized_name": "scheduler", + "name": "scheduler", + "type": "COMBO", + "widget": { + "name": "scheduler" + }, + "link": null + }, + { + "localized_name": "steps", + "name": "steps", + "type": "INT", + "widget": { + "name": "steps" + }, + "link": null + }, + { + "localized_name": "denoise", + "name": "denoise", + "type": "FLOAT", + "widget": { + "name": "denoise" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "SIGMAS", + "name": "SIGMAS", + "type": "SIGMAS", + "links": [ + 313 + ] + } + ], + "properties": { + "Node name for S&R": "BasicScheduler", + "cnr_id": "comfy-core", + "ver": 
"0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "simple", + 30, + 1 + ] + }, + { + "id": 134, + "type": "VOIDSampler", + "pos": [ + 410, + 1800 + ], + "size": [ + 300, + 50 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [], + "outputs": [ + { + "localized_name": "SAMPLER", + "name": "SAMPLER", + "type": "SAMPLER", + "links": [ + 305 + ] + } + ], + "properties": { + "Node name for S&R": "VOIDSampler", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 143, + "type": "UNETLoader", + "pos": [ + -710, + 550 + ], + "size": [ + 320, + 120 + ], + "flags": {}, + "order": 22, + "mode": 0, + "inputs": [ + { + "localized_name": "unet_name", + "name": "unet_name", + "type": "COMBO", + "widget": { + "name": "unet_name" + }, + "link": 403 + }, + { + "localized_name": "weight_dtype", + "name": "weight_dtype", + "type": "COMBO", + "widget": { + "name": "weight_dtype" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": "MODEL", + "links": [ + 322, + 323 + ] + } + ], + "properties": { + "Node name for S&R": "UNETLoader", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "models": [ + { + "name": "void_pass2.safetensors", + "url": "https://huggingface.co/Comfy-Org/void-model/resolve/main/diffusion_models/void_pass2.safetensors", + "directory": "diffusion_models" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "void_pass2.safetensors", + "default" + ] + }, + { + "id": 144, + "type": "UNETLoader", + "pos": [ + -720, + -150 + ], + "size": [ 
+ 320, + 120 + ], + "flags": {}, + "order": 23, + "mode": 0, + "inputs": [ + { + "localized_name": "unet_name", + "name": "unet_name", + "type": "COMBO", + "widget": { + "name": "unet_name" + }, + "link": 402 + }, + { + "localized_name": "weight_dtype", + "name": "weight_dtype", + "type": "COMBO", + "widget": { + "name": "weight_dtype" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": "MODEL", + "links": [ + 324, + 325 + ] + } + ], + "properties": { + "Node name for S&R": "UNETLoader", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "models": [ + { + "name": "void_pass1.safetensors", + "url": "https://huggingface.co/Comfy-Org/void-model/resolve/main/diffusion_models/void_pass1.safetensors", + "directory": "diffusion_models" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "void_pass1.safetensors", + "default" + ] + }, + { + "id": 46, + "type": "CreateVideo", + "pos": [ + 1230, + -20 + ], + "size": [ + 240, + 110 + ], + "flags": {}, + "order": 13, + "mode": 0, + "inputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "link": 73 + }, + { + "localized_name": "audio", + "name": "audio", + "shape": 7, + "type": "AUDIO", + "link": 355 + }, + { + "localized_name": "fps", + "name": "fps", + "type": "FLOAT", + "widget": { + "name": "fps" + }, + "link": 368 + } + ], + "outputs": [ + { + "localized_name": "VIDEO", + "name": "VIDEO", + "type": "VIDEO", + "links": [ + 77 + ] + } + ], + "properties": { + "Node name for S&R": "CreateVideo", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 30 + ] + }, + { + "id": 133, + "type": "VOIDSampler", + "pos": [ + 
410, + 370 + ], + "size": [ + 280, + 50 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [], + "outputs": [ + { + "localized_name": "SAMPLER", + "name": "SAMPLER", + "type": "SAMPLER", + "links": [ + 304 + ] + } + ], + "properties": { + "Node name for S&R": "VOIDSampler", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 49, + "type": "SamplerCustomAdvanced", + "pos": [ + 880, + -180 + ], + "size": [ + 250, + 270 + ], + "flags": {}, + "order": 14, + "mode": 0, + "inputs": [ + { + "localized_name": "noise", + "name": "noise", + "type": "NOISE", + "link": 320 + }, + { + "localized_name": "guider", + "name": "guider", + "type": "GUIDER", + "link": 319 + }, + { + "localized_name": "sampler", + "name": "sampler", + "type": "SAMPLER", + "link": 304 + }, + { + "localized_name": "sigmas", + "name": "sigmas", + "type": "SIGMAS", + "link": 315 + }, + { + "localized_name": "latent_image", + "name": "latent_image", + "type": "LATENT", + "link": 82 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "LATENT", + "links": [ + 83 + ] + }, + { + "localized_name": "denoised_output", + "name": "denoised_output", + "type": "LATENT", + "links": null + } + ], + "properties": { + "Node name for S&R": "SamplerCustomAdvanced", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 45, + "type": "VAEDecode", + "pos": [ + 1230, + -180 + ], + "size": [ + 230, + 80 + ], + "flags": {}, + "order": 12, + "mode": 0, + "inputs": [ + { + "localized_name": "samples", + "name": "samples", + "type": "LATENT", + "link": 83 + }, + { + "localized_name": "vae", + "name": "vae", + "type": "VAE", + "link": 70 + 
} + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 72, + 73, + 342 + ] + } + ], + "properties": { + "Node name for S&R": "VAEDecode", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 6, + "type": "CLIPTextEncode", + "pos": [ + -260, + -180 + ], + "size": [ + 580, + 310 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "localized_name": "clip", + "name": "clip", + "type": "CLIP", + "link": 2 + }, + { + "localized_name": "text", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + }, + "link": 377 + } + ], + "outputs": [ + { + "localized_name": "CONDITIONING", + "name": "CONDITIONING", + "type": "CONDITIONING", + "slot_index": 0, + "links": [ + 8 + ] + } + ], + "title": "Positive Prompt", + "properties": { + "Node name for S&R": "CLIPTextEncode", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "" + ], + "color": "#232", + "bgcolor": "#353" + }, + { + "id": 145, + "type": "ImageFromBatch", + "pos": [ + -410, + 850 + ], + "size": [ + 230, + 120 + ], + "flags": {}, + "order": 24, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 366 + }, + { + "localized_name": "batch_index", + "name": "batch_index", + "type": "INT", + "widget": { + "name": "batch_index" + }, + "link": 384 + }, + { + "localized_name": "length", + "name": "length", + "type": "INT", + "widget": { + "name": "length" + }, + "link": 361 + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 326, + 327, + 336 + ] + } + ], + "properties": { 
+ "Node name for S&R": "ImageFromBatch", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 0, + 197 + ] + }, + { + "id": 36, + "type": "VAEDecode", + "pos": [ + 1220, + 1110 + ], + "size": [ + 230, + 80 + ], + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "localized_name": "samples", + "name": "samples", + "type": "LATENT", + "link": 49 + }, + { + "localized_name": "vae", + "name": "vae", + "type": "VAE", + "link": 45 + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "slot_index": 0, + "links": [ + 341 + ] + } + ], + "title": "Pass 2 VAE Decode", + "properties": { + "Node name for S&R": "VAEDecode", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 149, + "type": "c3e0d783-9aa3-4e75-a94d-19937968ef86", + "pos": [ + -20, + 840 + ], + "size": [ + 290, + 370 + ], + "flags": {}, + "order": 27, + "mode": 0, + "inputs": [ + { + "label": "image", + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 336 + }, + { + "label": "object", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + }, + "link": 388 + }, + { + "name": "bboxes", + "shape": 7, + "type": "BOUNDING_BOX", + "link": null + }, + { + "name": "positive_coords", + "shape": 7, + "type": "STRING", + "link": null + }, + { + "name": "negative_coords", + "shape": 7, + "type": "STRING", + "link": null + }, + { + "name": "threshold", + "type": "FLOAT", + "widget": { + "name": "threshold" + }, + "link": null + }, + { + "name": "refine_iterations", + "type": "INT", + "widget": { + "name": "refine_iterations" + }, + "link": null + }, + { + "name": 
"individual_masks", + "type": "BOOLEAN", + "widget": { + "name": "individual_masks" + }, + "link": null + }, + { + "name": "ckpt_name", + "type": "COMBO", + "widget": { + "name": "ckpt_name" + }, + "link": 401 + } + ], + "outputs": [ + { + "localized_name": "masks", + "name": "masks", + "type": "MASK", + "links": [ + 339, + 340 + ] + }, + { + "localized_name": "bboxes", + "name": "bboxes", + "type": "BOUNDING_BOX", + "links": [] + } + ], + "properties": { + "proxyWidgets": [ + [ + "78", + "text" + ], + [ + "75", + "threshold" + ], + [ + "75", + "refine_iterations" + ], + [ + "75", + "individual_masks" + ], + [ + "77", + "ckpt_name" + ] + ], + "cnr_id": "comfy-core", + "ver": "0.19.3", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "ue_properties": { + "widget_ue_connectable": { + "text": true + }, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [] + }, + { + "id": 43, + "type": "GetImageSize", + "pos": [ + -410, + 1140 + ], + "size": [ + 230, + 160 + ], + "flags": { + "collapsed": false + }, + "order": 11, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 327 + } + ], + "outputs": [ + { + "localized_name": "width", + "name": "width", + "type": "INT", + "links": null + }, + { + "localized_name": "height", + "name": "height", + "type": "INT", + "links": null + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "links": [ + 63, + 67 + ] + } + ], + "properties": { + "Node name for S&R": "GetImageSize", + "cnr_id": "comfy-core", + "ver": "0.20.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 147, + "type": "PrimitiveInt", + "pos": [ + -570, + 1660 + ], + "size": [ + 270, + 90 + ], + 
"flags": {}, + "order": 25, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "INT", + "widget": { + "name": "value" + }, + "link": 391 + } + ], + "outputs": [ + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 332, + 333 + ] + } + ], + "title": "Int (Width)", + "properties": { + "Node name for S&R": "PrimitiveInt", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 672, + "fixed" + ] + }, + { + "id": 148, + "type": "PrimitiveInt", + "pos": [ + -570, + 1790 + ], + "size": [ + 270, + 90 + ], + "flags": {}, + "order": 26, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "INT", + "widget": { + "name": "value" + }, + "link": 392 + } + ], + "outputs": [ + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 334, + 335 + ] + } + ], + "title": "Int (Height)", + "properties": { + "Node name for S&R": "PrimitiveInt", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 384, + "fixed" + ] + }, + { + "id": 150, + "type": "ComfySwitchNode", + "pos": [ + 1510, + 1080 + ], + "size": [ + 270, + 130 + ], + "flags": {}, + "order": 28, + "mode": 0, + "inputs": [ + { + "localized_name": "on_false", + "name": "on_false", + "type": "*", + "link": 342 + }, + { + "localized_name": "on_true", + "name": "on_true", + "type": "*", + "link": 341 + }, + { + "localized_name": "switch", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": 346 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "*", + "links": [ + 363 + ] + } 
+ ], + "properties": { + "Node name for S&R": "ComfySwitchNode", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + false + ] + }, + { + "id": 153, + "type": "PrimitiveBoolean", + "pos": [ + -580, + 1440 + ], + "size": [ + 270, + 80 + ], + "flags": {}, + "order": 29, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "BOOLEAN", + "widget": { + "name": "value" + }, + "link": 393 + } + ], + "outputs": [ + { + "localized_name": "BOOLEAN", + "name": "BOOLEAN", + "type": "BOOLEAN", + "links": [ + 346 + ] + } + ], + "title": "Boolean (Skip Pass 2?)", + "properties": { + "Node name for S&R": "PrimitiveBoolean", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + false + ] + }, + { + "id": 158, + "type": "TrimAudioDuration", + "pos": [ + -10, + 1580 + ], + "size": [ + 270, + 120 + ], + "flags": {}, + "order": 30, + "mode": 0, + "inputs": [ + { + "localized_name": "audio", + "name": "audio", + "type": "AUDIO", + "link": 367 + }, + { + "localized_name": "start_index", + "name": "start_index", + "type": "FLOAT", + "widget": { + "name": "start_index" + }, + "link": 386 + }, + { + "localized_name": "duration", + "name": "duration", + "type": "FLOAT", + "widget": { + "name": "duration" + }, + "link": 385 + } + ], + "outputs": [ + { + "localized_name": "AUDIO", + "name": "AUDIO", + "type": "AUDIO", + "links": [ + 355, + 364 + ] + } + ], + "properties": { + "Node name for S&R": "TrimAudioDuration", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + 
"secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 0, + 60 + ] + }, + { + "id": 163, + "type": "PrimitiveInt", + "pos": [ + -740, + 1170 + ], + "size": [ + 230, + 90 + ], + "flags": {}, + "order": 31, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "INT", + "widget": { + "name": "value" + }, + "link": 390 + } + ], + "outputs": [ + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 360 + ] + } + ], + "title": "Int (Video duration)", + "properties": { + "Node name for S&R": "PrimitiveInt", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 5, + "fixed" + ] + }, + { + "id": 164, + "type": "ComfyMathExpression", + "pos": [ + -740, + 1300 + ], + "size": [ + 230, + 100 + ], + "flags": { + "collapsed": true + }, + "order": 32, + "mode": 0, + "inputs": [ + { + "label": "a", + "localized_name": "values.a", + "name": "values.a", + "type": "FLOAT,INT,BOOLEAN", + "link": 360 + }, + { + "label": "b", + "localized_name": "values.b", + "name": "values.b", + "shape": 7, + "type": "FLOAT,INT,BOOLEAN", + "link": 371 + }, + { + "label": "c", + "localized_name": "values.c", + "name": "values.c", + "shape": 7, + "type": "FLOAT,INT,BOOLEAN", + "link": null + }, + { + "localized_name": "expression", + "name": "expression", + "type": "STRING", + "widget": { + "name": "expression" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "links": [ + 385 + ] + }, + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 361 + ] + }, + { + "localized_name": "BOOL", + "name": "BOOL", + "type": "BOOLEAN", + "links": null + } + ], + "properties": { + "Node name for S&R": "ComfyMathExpression" + }, + "widgets_values": [ + "a * b" + ] + }, 
+ { + "id": 165, + "type": "CreateVideo", + "pos": [ + 1510, + 1270 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 33, + "mode": 0, + "inputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "link": 363 + }, + { + "localized_name": "audio", + "name": "audio", + "shape": 7, + "type": "AUDIO", + "link": 364 + }, + { + "localized_name": "fps", + "name": "fps", + "type": "FLOAT", + "widget": { + "name": "fps" + }, + "link": 372 + } + ], + "outputs": [ + { + "localized_name": "VIDEO", + "name": "VIDEO", + "type": "VIDEO", + "links": [ + 362 + ] + } + ], + "properties": { + "Node name for S&R": "CreateVideo" + }, + "widgets_values": [ + 24 + ] + }, + { + "id": 166, + "type": "GetVideoComponents", + "pos": [ + -740, + 840 + ], + "size": [ + 230, + 100 + ], + "flags": {}, + "order": 34, + "mode": 0, + "inputs": [ + { + "localized_name": "video", + "name": "video", + "type": "VIDEO", + "link": 373 + } + ], + "outputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "links": [ + 366 + ] + }, + { + "localized_name": "audio", + "name": "audio", + "type": "AUDIO", + "links": [ + 367 + ] + }, + { + "localized_name": "fps", + "name": "fps", + "type": "FLOAT", + "links": [ + 368, + 371, + 372, + 383 + ] + } + ], + "properties": { + "Node name for S&R": "GetVideoComponents" + } + }, + { + "id": 168, + "type": "PrimitiveInt", + "pos": [ + -740, + 980 + ], + "size": [ + 230, + 90 + ], + "flags": {}, + "order": 35, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "INT", + "widget": { + "name": "value" + }, + "link": 389 + } + ], + "outputs": [ + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 382 + ] + } + ], + "title": "Int (Index)", + "properties": { + "Node name for S&R": "PrimitiveInt", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send 
Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 0, + "fixed" + ] + }, + { + "id": 169, + "type": "ComfyMathExpression", + "pos": [ + -740, + 1110 + ], + "size": [ + 230, + 100 + ], + "flags": { + "collapsed": true + }, + "order": 36, + "mode": 0, + "inputs": [ + { + "label": "a", + "localized_name": "values.a", + "name": "values.a", + "type": "FLOAT,INT,BOOLEAN", + "link": 382 + }, + { + "label": "b", + "localized_name": "values.b", + "name": "values.b", + "shape": 7, + "type": "FLOAT,INT,BOOLEAN", + "link": 383 + }, + { + "label": "c", + "localized_name": "values.c", + "name": "values.c", + "shape": 7, + "type": "FLOAT,INT,BOOLEAN", + "link": null + }, + { + "localized_name": "expression", + "name": "expression", + "type": "STRING", + "widget": { + "name": "expression" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "links": [ + 386 + ] + }, + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 384 + ] + }, + { + "localized_name": "BOOL", + "name": "BOOL", + "type": "BOOLEAN", + "links": null + } + ], + "properties": { + "Node name for S&R": "ComfyMathExpression" + }, + "widgets_values": [ + "a * b" + ] + } + ], + "groups": [ + { + "id": 1, + "title": "Models", + "bounding": [ + -790, + -260, + 470, + 990 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 2, + "title": "Input videos (place files in ComfyUI/input/)", + "bounding": [ + -790, + 760, + 660, + 560 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 3, + "title": "Shared: Text & Mask Conditioning", + "bounding": [ + -290, + -260, + 640, + 990 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 4, + "title": "Pass 1: Sample (Random Noise → DDIM)", + "bounding": [ + 380, + -260, + 810, + 750 + ], + "color": "#8A8", + "flags": {} + }, + { + "id": 6, + "title": "Pass 2: Sample (Warped Noise → DDIM)", + "bounding": [ + 380, + 1020, + 810, + 880 + ], + "color": 
"#8A8", + "flags": {} + }, + { + "id": 8, + "title": "Create Mask", + "bounding": [ + -100, + 760, + 450, + 560 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 9, + "title": "Pass 1", + "bounding": [ + -730, + -220, + 360, + 210 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 10, + "title": "Pass 2", + "bounding": [ + -720, + 340, + 340, + 340 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 11, + "title": "Output Video Size", + "bounding": [ + -790, + 1580, + 660, + 320 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 12, + "title": "Skip Pass 2", + "bounding": [ + -790, + 1350, + 660, + 200 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 13, + "title": "Trim Audio", + "bounding": [ + -100, + 1350, + 450, + 550 + ], + "color": "#3f789e", + "flags": {} + } + ], + "links": [ + { + "id": 3, + "origin_id": 2, + "origin_slot": 0, + "target_id": 7, + "target_slot": 0, + "type": "CLIP" + }, + { + "id": 322, + "origin_id": 143, + "origin_slot": 0, + "target_id": 136, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 309, + "origin_id": 10, + "origin_slot": 0, + "target_id": 136, + "target_slot": 1, + "type": "CONDITIONING" + }, + { + "id": 310, + "origin_id": 10, + "origin_slot": 1, + "target_id": 136, + "target_slot": 2, + "type": "CONDITIONING" + }, + { + "id": 324, + "origin_id": 144, + "origin_slot": 0, + "target_id": 138, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 325, + "origin_id": 144, + "origin_slot": 0, + "target_id": 140, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 317, + "origin_id": 10, + "origin_slot": 0, + "target_id": 140, + "target_slot": 1, + "type": "CONDITIONING" + }, + { + "id": 318, + "origin_id": 10, + "origin_slot": 1, + "target_id": 140, + "target_slot": 2, + "type": "CONDITIONING" + }, + { + "id": 321, + "origin_id": 142, + "origin_slot": 0, + "target_id": 31, + "target_slot": 0, + "type": "OPTICAL_FLOW" + }, + { + "id": 72, + "origin_id": 45, + "origin_slot": 0, + 
"target_id": 31, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 333, + "origin_id": 147, + "origin_slot": 0, + "target_id": 31, + "target_slot": 2, + "type": "INT" + }, + { + "id": 335, + "origin_id": 148, + "origin_slot": 0, + "target_id": 31, + "target_slot": 3, + "type": "INT" + }, + { + "id": 67, + "origin_id": 43, + "origin_slot": 2, + "target_id": 31, + "target_slot": 4, + "type": "INT" + }, + { + "id": 54, + "origin_id": 32, + "origin_slot": 0, + "target_id": 35, + "target_slot": 0, + "type": "NOISE" + }, + { + "id": 311, + "origin_id": 136, + "origin_slot": 0, + "target_id": 35, + "target_slot": 1, + "type": "GUIDER" + }, + { + "id": 305, + "origin_id": 134, + "origin_slot": 0, + "target_id": 35, + "target_slot": 2, + "type": "SAMPLER" + }, + { + "id": 313, + "origin_id": 137, + "origin_slot": 0, + "target_id": 35, + "target_slot": 3, + "type": "SIGMAS" + }, + { + "id": 48, + "origin_id": 10, + "origin_slot": 2, + "target_id": 35, + "target_slot": 4, + "type": "LATENT" + }, + { + "id": 340, + "origin_id": 149, + "origin_slot": 0, + "target_id": 132, + "target_slot": 0, + "type": "MASK" + }, + { + "id": 8, + "origin_id": 6, + "origin_slot": 0, + "target_id": 10, + "target_slot": 0, + "type": "CONDITIONING" + }, + { + "id": 9, + "origin_id": 7, + "origin_slot": 0, + "target_id": 10, + "target_slot": 1, + "type": "CONDITIONING" + }, + { + "id": 4, + "origin_id": 3, + "origin_slot": 0, + "target_id": 10, + "target_slot": 2, + "type": "VAE" + }, + { + "id": 326, + "origin_id": 145, + "origin_slot": 0, + "target_id": 10, + "target_slot": 3, + "type": "IMAGE" + }, + { + "id": 339, + "origin_id": 149, + "origin_slot": 0, + "target_id": 10, + "target_slot": 4, + "type": "MASK" + }, + { + "id": 332, + "origin_id": 147, + "origin_slot": 0, + "target_id": 10, + "target_slot": 5, + "type": "INT" + }, + { + "id": 334, + "origin_id": 148, + "origin_slot": 0, + "target_id": 10, + "target_slot": 6, + "type": "INT" + }, + { + "id": 63, + "origin_id": 43, + 
"origin_slot": 2, + "target_id": 10, + "target_slot": 7, + "type": "INT" + }, + { + "id": 53, + "origin_id": 31, + "origin_slot": 0, + "target_id": 32, + "target_slot": 0, + "type": "LATENT" + }, + { + "id": 323, + "origin_id": 143, + "origin_slot": 0, + "target_id": 137, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 73, + "origin_id": 45, + "origin_slot": 0, + "target_id": 46, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 355, + "origin_id": 158, + "origin_slot": 0, + "target_id": 46, + "target_slot": 1, + "type": "AUDIO" + }, + { + "id": 368, + "origin_id": 166, + "origin_slot": 2, + "target_id": 46, + "target_slot": 2, + "type": "FLOAT" + }, + { + "id": 320, + "origin_id": 141, + "origin_slot": 0, + "target_id": 49, + "target_slot": 0, + "type": "NOISE" + }, + { + "id": 319, + "origin_id": 140, + "origin_slot": 0, + "target_id": 49, + "target_slot": 1, + "type": "GUIDER" + }, + { + "id": 304, + "origin_id": 133, + "origin_slot": 0, + "target_id": 49, + "target_slot": 2, + "type": "SAMPLER" + }, + { + "id": 315, + "origin_id": 138, + "origin_slot": 0, + "target_id": 49, + "target_slot": 3, + "type": "SIGMAS" + }, + { + "id": 82, + "origin_id": 10, + "origin_slot": 2, + "target_id": 49, + "target_slot": 4, + "type": "LATENT" + }, + { + "id": 83, + "origin_id": 49, + "origin_slot": 0, + "target_id": 45, + "target_slot": 0, + "type": "LATENT" + }, + { + "id": 70, + "origin_id": 3, + "origin_slot": 0, + "target_id": 45, + "target_slot": 1, + "type": "VAE" + }, + { + "id": 2, + "origin_id": 2, + "origin_slot": 0, + "target_id": 6, + "target_slot": 0, + "type": "CLIP" + }, + { + "id": 366, + "origin_id": 166, + "origin_slot": 0, + "target_id": 145, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 361, + "origin_id": 164, + "origin_slot": 1, + "target_id": 145, + "target_slot": 2, + "type": "INT" + }, + { + "id": 49, + "origin_id": 35, + "origin_slot": 0, + "target_id": 36, + "target_slot": 0, + "type": "LATENT" + }, + { + "id": 45, + 
"origin_id": 3, + "origin_slot": 0, + "target_id": 36, + "target_slot": 1, + "type": "VAE" + }, + { + "id": 336, + "origin_id": 145, + "origin_slot": 0, + "target_id": 149, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 327, + "origin_id": 145, + "origin_slot": 0, + "target_id": 43, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 342, + "origin_id": 45, + "origin_slot": 0, + "target_id": 150, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 341, + "origin_id": 36, + "origin_slot": 0, + "target_id": 150, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 346, + "origin_id": 153, + "origin_slot": 0, + "target_id": 150, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 367, + "origin_id": 166, + "origin_slot": 1, + "target_id": 158, + "target_slot": 0, + "type": "AUDIO" + }, + { + "id": 360, + "origin_id": 163, + "origin_slot": 0, + "target_id": 164, + "target_slot": 0, + "type": "INT" + }, + { + "id": 371, + "origin_id": 166, + "origin_slot": 2, + "target_id": 164, + "target_slot": 1, + "type": "FLOAT" + }, + { + "id": 363, + "origin_id": 150, + "origin_slot": 0, + "target_id": 165, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 364, + "origin_id": 158, + "origin_slot": 0, + "target_id": 165, + "target_slot": 1, + "type": "AUDIO" + }, + { + "id": 372, + "origin_id": 166, + "origin_slot": 2, + "target_id": 165, + "target_slot": 2, + "type": "FLOAT" + }, + { + "id": 373, + "origin_id": -10, + "origin_slot": 0, + "target_id": 166, + "target_slot": 0, + "type": "VIDEO" + }, + { + "id": 77, + "origin_id": 46, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "VIDEO" + }, + { + "id": 362, + "origin_id": 165, + "origin_slot": 0, + "target_id": -20, + "target_slot": 1, + "type": "VIDEO" + }, + { + "id": 377, + "origin_id": -10, + "origin_slot": 1, + "target_id": 6, + "target_slot": 1, + "type": "STRING" + }, + { + "id": 382, + "origin_id": 168, + "origin_slot": 0, + "target_id": 169, + "target_slot": 0, + "type": 
"INT" + }, + { + "id": 383, + "origin_id": 166, + "origin_slot": 2, + "target_id": 169, + "target_slot": 1, + "type": "FLOAT" + }, + { + "id": 384, + "origin_id": 169, + "origin_slot": 1, + "target_id": 145, + "target_slot": 1, + "type": "INT" + }, + { + "id": 385, + "origin_id": 164, + "origin_slot": 0, + "target_id": 158, + "target_slot": 2, + "type": "FLOAT" + }, + { + "id": 386, + "origin_id": 169, + "origin_slot": 0, + "target_id": 158, + "target_slot": 1, + "type": "FLOAT" + }, + { + "id": 387, + "origin_id": -10, + "origin_slot": 2, + "target_id": 7, + "target_slot": 1, + "type": "STRING" + }, + { + "id": 388, + "origin_id": -10, + "origin_slot": 3, + "target_id": 149, + "target_slot": 1, + "type": "STRING" + }, + { + "id": 389, + "origin_id": -10, + "origin_slot": 4, + "target_id": 168, + "target_slot": 0, + "type": "INT" + }, + { + "id": 390, + "origin_id": -10, + "origin_slot": 5, + "target_id": 163, + "target_slot": 0, + "type": "INT" + }, + { + "id": 391, + "origin_id": -10, + "origin_slot": 6, + "target_id": 147, + "target_slot": 0, + "type": "INT" + }, + { + "id": 392, + "origin_id": -10, + "origin_slot": 7, + "target_id": 148, + "target_slot": 0, + "type": "INT" + }, + { + "id": 393, + "origin_id": -10, + "origin_slot": 8, + "target_id": 153, + "target_slot": 0, + "type": "BOOLEAN" + }, + { + "id": 400, + "origin_id": -10, + "origin_slot": 9, + "target_id": 141, + "target_slot": 0, + "type": "INT" + }, + { + "id": 401, + "origin_id": -10, + "origin_slot": 10, + "target_id": 149, + "target_slot": 8, + "type": "COMBO" + }, + { + "id": 402, + "origin_id": -10, + "origin_slot": 11, + "target_id": 144, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 403, + "origin_id": -10, + "origin_slot": 12, + "target_id": 143, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 404, + "origin_id": -10, + "origin_slot": 13, + "target_id": 142, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 405, + "origin_id": -10, + "origin_slot": 14, + "target_id": 
2, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 406, + "origin_id": -10, + "origin_slot": 15, + "target_id": 3, + "target_slot": 0, + "type": "COMBO" + } + ], + "extra": {}, + "category": "Video generation and editing/Inpaint video", + "description": "Removes objects from video by inpainting masked regions using VOID (CogVideoX), with SAM3 text-guided segmentation and optional two-pass optical-flow refinement." + }, + { + "id": "c3e0d783-9aa3-4e75-a94d-19937968ef86", + "version": 1, + "state": { + "lastGroupId": 13, + "lastNodeId": 171, + "lastLinkId": 406, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Image Segmentation (SAM3)", + "description": "Segments images into masks using Meta SAM3 from text prompts, points, or boxes.", + "inputNode": { + "id": -10, + "bounding": [ + -2260, + -3450, + 144.369140625, + 228 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + -1130, + -3305, + 128, + 88 + ] + }, + "inputs": [ + { + "id": "a6e75fa2-162a-4af0-a2fd-1e9c899a5ab6", + "name": "image", + "type": "IMAGE", + "linkIds": [ + 264 + ], + "localized_name": "image", + "label": "image", + "pos": [ + -2139.630859375, + -3426 + ] + }, + { + "id": "3cefd304-7631-4ff6-a5a0-5a0ffb120745", + "name": "text", + "type": "STRING", + "linkIds": [ + 265 + ], + "label": "object", + "pos": [ + -2139.630859375, + -3406 + ] + }, + { + "id": "1aec91c5-d8d2-441c-928c-49c14e7e80ed", + "name": "bboxes", + "type": "BOUNDING_BOX", + "linkIds": [ + 266 + ], + "pos": [ + -2139.630859375, + -3386 + ] + }, + { + "id": "1ec7ce1a-8257-4719-8a81-60ebc8a98899", + "name": "positive_coords", + "type": "STRING", + "linkIds": [ + 267 + ], + "pos": [ + -2139.630859375, + -3366 + ] + }, + { + "id": "c65f8b87-9bd7-48be-9fc2-823431e95019", + "name": "negative_coords", + "type": "STRING", + "linkIds": [ + 268 + ], + "pos": [ + -2139.630859375, + -3346 + ] + }, + { + "id": "bb4ba35a-ccfe-4c37-98e5-d9b0d69585fb", + "name": "threshold", + "type": "FLOAT", + "linkIds": [ + 269 + 
], + "pos": [ + -2139.630859375, + -3326 + ] + }, + { + "id": "b1439668-b050-490b-a5dc-fc4052c55666", + "name": "refine_iterations", + "type": "INT", + "linkIds": [ + 270 + ], + "pos": [ + -2139.630859375, + -3306 + ] + }, + { + "id": "86e239e5-c098-4302-b54d-d42a38bc0f89", + "name": "individual_masks", + "type": "BOOLEAN", + "linkIds": [ + 271 + ], + "pos": [ + -2139.630859375, + -3286 + ] + }, + { + "id": "f9e0b9d4-b2f1-4907-a4a5-305656576706", + "name": "ckpt_name", + "type": "COMBO", + "linkIds": [ + 272 + ], + "pos": [ + -2139.630859375, + -3266 + ] + } + ], + "outputs": [ + { + "id": "ff50da09-1e59-4a58-9b7f-be1a00aa5913", + "name": "masks", + "type": "MASK", + "linkIds": [ + 231 + ], + "localized_name": "masks", + "pos": [ + -1106, + -3281 + ] + }, + { + "id": "8f622e40-8528-4078-b7d3-147e9f872194", + "name": "bboxes", + "type": "BOUNDING_BOX", + "linkIds": [ + 232 + ], + "localized_name": "bboxes", + "pos": [ + -1106, + -3261 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 75, + "type": "SAM3_Detect", + "pos": [ + -1470, + -3460 + ], + "size": [ + 270, + 260 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "label": "model", + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 237 + }, + { + "label": "image", + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 264 + }, + { + "label": "conditioning", + "localized_name": "conditioning", + "name": "conditioning", + "shape": 7, + "type": "CONDITIONING", + "link": 200 + }, + { + "label": "bboxes", + "localized_name": "bboxes", + "name": "bboxes", + "shape": 7, + "type": "BOUNDING_BOX", + "link": 266 + }, + { + "label": "positive_coords", + "localized_name": "positive_coords", + "name": "positive_coords", + "shape": 7, + "type": "STRING", + "link": 267 + }, + { + "label": "negative_coords", + "localized_name": "negative_coords", + "name": "negative_coords", + "shape": 7, + "type": "STRING", + "link": 268 + }, + { + "localized_name": 
"threshold", + "name": "threshold", + "type": "FLOAT", + "widget": { + "name": "threshold" + }, + "link": 269 + }, + { + "localized_name": "refine_iterations", + "name": "refine_iterations", + "type": "INT", + "widget": { + "name": "refine_iterations" + }, + "link": 270 + }, + { + "localized_name": "individual_masks", + "name": "individual_masks", + "type": "BOOLEAN", + "widget": { + "name": "individual_masks" + }, + "link": 271 + } + ], + "outputs": [ + { + "localized_name": "masks", + "name": "masks", + "type": "MASK", + "links": [ + 231 + ] + }, + { + "localized_name": "bboxes", + "name": "bboxes", + "type": "BOUNDING_BOX", + "links": [ + 232 + ] + } + ], + "properties": { + "Node name for S&R": "SAM3_Detect", + "cnr_id": "comfy-core", + "ver": "0.19.3", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + 0.5, + 2, + false + ] + }, + { + "id": 77, + "type": "CheckpointLoaderSimple", + "pos": [ + -1970, + -3200 + ], + "size": [ + 330, + 140 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "ckpt_name", + "name": "ckpt_name", + "type": "COMBO", + "widget": { + "name": "ckpt_name" + }, + "link": 272 + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": "MODEL", + "links": [ + 237 + ] + }, + { + "localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "links": [ + 240 + ] + }, + { + "localized_name": "VAE", + "name": "VAE", + "type": "VAE", + "links": null + } + ], + "properties": { + "Node name for S&R": "CheckpointLoaderSimple", + "cnr_id": "comfy-core", + "ver": "0.19.3", + "models": [ + { + "name": "sam3.1_multiplex_fp16.safetensors", + "url": "https://huggingface.co/Comfy-Org/sam3.1/resolve/main/checkpoints/sam3.1_multiplex_fp16.safetensors", 
+ "directory": "checkpoints" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + "sam3.1_multiplex_fp16.safetensors" + ] + }, + { + "id": 78, + "type": "CLIPTextEncode", + "pos": [ + -2000, + -3000 + ], + "size": [ + 400, + 200 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "clip", + "name": "clip", + "type": "CLIP", + "link": 240 + }, + { + "localized_name": "text", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + }, + "link": 265 + } + ], + "outputs": [ + { + "localized_name": "CONDITIONING", + "name": "CONDITIONING", + "type": "CONDITIONING", + "links": [ + 200 + ] + } + ], + "properties": { + "Node name for S&R": "CLIPTextEncode", + "cnr_id": "comfy-core", + "ver": "0.19.3", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + "" + ] + } + ], + "groups": [], + "links": [ + { + "id": 237, + "origin_id": 77, + "origin_slot": 0, + "target_id": 75, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 200, + "origin_id": 78, + "origin_slot": 0, + "target_id": 75, + "target_slot": 2, + "type": "CONDITIONING" + }, + { + "id": 240, + "origin_id": 77, + "origin_slot": 1, + "target_id": 78, + "target_slot": 0, + "type": "CLIP" + }, + { + "id": 231, + "origin_id": 75, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "MASK" + }, + { + "id": 232, + "origin_id": 75, + "origin_slot": 1, + "target_id": -20, + "target_slot": 1, + "type": "BOUNDING_BOX" + }, + { + "id": 264, + "origin_id": -10, + 
"origin_slot": 0, + "target_id": 75, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 265, + "origin_id": -10, + "origin_slot": 1, + "target_id": 78, + "target_slot": 1, + "type": "STRING" + }, + { + "id": 266, + "origin_id": -10, + "origin_slot": 2, + "target_id": 75, + "target_slot": 3, + "type": "BOUNDING_BOX" + }, + { + "id": 267, + "origin_id": -10, + "origin_slot": 3, + "target_id": 75, + "target_slot": 4, + "type": "STRING" + }, + { + "id": 268, + "origin_id": -10, + "origin_slot": 4, + "target_id": 75, + "target_slot": 5, + "type": "STRING" + }, + { + "id": 269, + "origin_id": -10, + "origin_slot": 5, + "target_id": 75, + "target_slot": 6, + "type": "FLOAT" + }, + { + "id": 270, + "origin_id": -10, + "origin_slot": 6, + "target_id": 75, + "target_slot": 7, + "type": "INT" + }, + { + "id": 271, + "origin_id": -10, + "origin_slot": 7, + "target_id": 75, + "target_slot": 8, + "type": "BOOLEAN" + }, + { + "id": 272, + "origin_id": -10, + "origin_slot": 8, + "target_id": 77, + "target_slot": 0, + "type": "COMBO" + } + ], + "extra": { + "ue_links": [] + } + } + ] + }, + "extra": {} +} \ No newline at end of file diff --git a/blueprints/Video Inpaint(Wan2.1 VACE).json b/blueprints/Video Inpaint(Wan2.1 VACE).json deleted file mode 100644 index a658be5f8..000000000 --- a/blueprints/Video Inpaint(Wan2.1 VACE).json +++ /dev/null @@ -1,2388 +0,0 @@ -{ - "id": "2f429c60-2e03-4117-908b-31e1fab04bba", - "revision": 0, - "last_node_id": 229, - "last_link_id": 366, - "nodes": [ - { - "id": 229, - "type": "53a657f3-c9eb-40f2-9ebd-1ed77d25ed67", - "pos": [ - -230, - 160 - ], - "size": [ - 400, - 480 - ], - "flags": {}, - "order": 0, - "mode": 0, - "inputs": [ - { - "label": "video mask", - "localized_name": "mask", - "name": "mask", - "type": "MASK", - "link": null - }, - { - "localized_name": "video", - "name": "video", - "type": "VIDEO", - "link": null - }, - { - "name": "width", - "type": "INT", - "widget": { - "name": "width" - }, - "link": null - }, - { - "name": 
"height", - "type": "INT", - "widget": { - "name": "height" - }, - "link": null - }, - { - "label": "reference image", - "name": "reference_image_1", - "type": "IMAGE", - "link": null - }, - { - "name": "unet_name", - "type": "COMBO", - "widget": { - "name": "unet_name" - }, - "link": null - }, - { - "name": "lora_name", - "type": "COMBO", - "widget": { - "name": "lora_name" - }, - "link": null - }, - { - "name": "clip_name", - "type": "COMBO", - "widget": { - "name": "clip_name" - }, - "link": null - }, - { - "name": "vae_name", - "type": "COMBO", - "widget": { - "name": "vae_name" - }, - "link": null - } - ], - "outputs": [ - { - "localized_name": "VIDEO", - "name": "VIDEO", - "type": "VIDEO", - "links": [] - } - ], - "properties": { - "proxyWidgets": [ - [ - "6", - "text" - ], - [ - "-1", - "width" - ], - [ - "-1", - "height" - ], - [ - "3", - "seed" - ], - [ - "3", - "control_after_generate" - ], - [ - "-1", - "unet_name" - ], - [ - "-1", - "lora_name" - ], - [ - "-1", - "clip_name" - ], - [ - "-1", - "vae_name" - ] - ], - "cnr_id": "comfy-core", - "ver": "0.13.0" - }, - "widgets_values": [ - null, - 720, - 720, - null, - null, - "wan2.1_vace_14B_fp16.safetensors", - "Wan21_CausVid_14B_T2V_lora_rank32.safetensors", - "umt5_xxl_fp8_e4m3fn_scaled.safetensors", - "wan_2.1_vae.safetensors" - ] - } - ], - "links": [], - "groups": [], - "definitions": { - "subgraphs": [ - { - "id": "53a657f3-c9eb-40f2-9ebd-1ed77d25ed67", - "version": 1, - "state": { - "lastGroupId": 25, - "lastNodeId": 229, - "lastLinkId": 366, - "lastRerouteId": 0 - }, - "revision": 0, - "config": {}, - "name": "Video Inpaint (Wan 2.1 VACE)", - "inputNode": { - "id": -10, - "bounding": [ - -970, - 800, - 132.54296875, - 220 - ] - }, - "outputNode": { - "id": -20, - "bounding": [ - 1480, - 535, - 120, - 60 - ] - }, - "inputs": [ - { - "id": "9fdda38d-6aa7-48ad-b425-f493d8aa585c", - "name": "mask", - "type": "MASK", - "linkIds": [ - 351, - 335, - 345 - ], - "localized_name": "mask", - "label": "video 
mask", - "pos": [ - -857.45703125, - 820 - ] - }, - { - "id": "8b1788cc-46d2-4f40-8b33-70fd56b4cb24", - "name": "video", - "type": "VIDEO", - "linkIds": [ - 336 - ], - "localized_name": "video", - "pos": [ - -857.45703125, - 840 - ] - }, - { - "id": "09393f21-257e-4476-bb02-54899a8252b8", - "name": "width", - "type": "INT", - "linkIds": [ - 355 - ], - "pos": [ - -857.45703125, - 860 - ] - }, - { - "id": "07a030f7-7eac-4b3f-b8f3-f00ee87b191d", - "name": "height", - "type": "INT", - "linkIds": [ - 356 - ], - "pos": [ - -857.45703125, - 880 - ] - }, - { - "id": "255908d3-6cc9-48fc-b76b-ab9fb72695bc", - "name": "reference_image_1", - "type": "IMAGE", - "linkIds": [ - 361 - ], - "label": "reference image", - "pos": [ - -857.45703125, - 900 - ] - }, - { - "id": "18a5d241-523c-433d-ae05-25b6e69d1e29", - "name": "unet_name", - "type": "COMBO", - "linkIds": [ - 363 - ], - "pos": [ - -857.45703125, - 920 - ] - }, - { - "id": "d7576e1b-da5f-402f-81b2-d37f838b1f8f", - "name": "lora_name", - "type": "COMBO", - "linkIds": [ - 364 - ], - "pos": [ - -857.45703125, - 940 - ] - }, - { - "id": "41676a3e-c710-4723-821e-f651ad3784b1", - "name": "clip_name", - "type": "COMBO", - "linkIds": [ - 365 - ], - "pos": [ - -857.45703125, - 960 - ] - }, - { - "id": "41fc878c-9aa6-4c12-bef3-ceda6b094b7c", - "name": "vae_name", - "type": "COMBO", - "linkIds": [ - 366 - ], - "pos": [ - -857.45703125, - 980 - ] - } - ], - "outputs": [ - { - "id": "d4861f39-1011-49dc-80fd-ee318b614a8d", - "name": "VIDEO", - "type": "VIDEO", - "linkIds": [ - 129 - ], - "localized_name": "VIDEO", - "pos": [ - 1500, - 555 - ] - } - ], - "widgets": [], - "nodes": [ - { - "id": 58, - "type": "TrimVideoLatent", - "pos": [ - 760, - 390 - ], - "size": [ - 315, - 60 - ], - "flags": { - "collapsed": false - }, - "order": 13, - "mode": 0, - "inputs": [ - { - "localized_name": "samples", - "name": "samples", - "type": "LATENT", - "link": 116 - }, - { - "localized_name": "trim_amount", - "name": "trim_amount", - "type": "INT", - 
"widget": { - "name": "trim_amount" - }, - "link": 115 - } - ], - "outputs": [ - { - "localized_name": "LATENT", - "name": "LATENT", - "type": "LATENT", - "links": [ - 117 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.34", - "Node name for S&R": "TrimVideoLatent", - "enableTabs": false, - "tabWidth": 65, - "tabXOffset": 10, - "hasSecondTab": false, - "secondTabText": "Send Back", - "secondTabOffset": 80, - "secondTabWidth": 65, - "widget_ue_connectable": { - "trim_amount": true - } - }, - "widgets_values": [ - 0 - ] - }, - { - "id": 8, - "type": "VAEDecode", - "pos": [ - 770, - 500 - ], - "size": [ - 315, - 46 - ], - "flags": { - "collapsed": false - }, - "order": 11, - "mode": 0, - "inputs": [ - { - "localized_name": "samples", - "name": "samples", - "type": "LATENT", - "link": 117 - }, - { - "localized_name": "vae", - "name": "vae", - "type": "VAE", - "link": 76 - } - ], - "outputs": [ - { - "localized_name": "IMAGE", - "name": "IMAGE", - "type": "IMAGE", - "slot_index": 0, - "links": [ - 139 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.34", - "Node name for S&R": "VAEDecode", - "enableTabs": false, - "tabWidth": 65, - "tabXOffset": 10, - "hasSecondTab": false, - "secondTabText": "Send Back", - "secondTabOffset": 80, - "secondTabWidth": 65, - "widget_ue_connectable": {} - }, - "widgets_values": [] - }, - { - "id": 48, - "type": "ModelSamplingSD3", - "pos": [ - 400, - 50 - ], - "size": [ - 315, - 58 - ], - "flags": {}, - "order": 9, - "mode": 0, - "inputs": [ - { - "localized_name": "model", - "name": "model", - "type": "MODEL", - "link": 279 - }, - { - "localized_name": "shift", - "name": "shift", - "type": "FLOAT", - "widget": { - "name": "shift" - }, - "link": null - } - ], - "outputs": [ - { - "localized_name": "MODEL", - "name": "MODEL", - "type": "MODEL", - "slot_index": 0, - "links": [ - 280 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.34", - "Node name for S&R": 
"ModelSamplingSD3", - "enableTabs": false, - "tabWidth": 65, - "tabXOffset": 10, - "hasSecondTab": false, - "secondTabText": "Send Back", - "secondTabOffset": 80, - "secondTabWidth": 65, - "widget_ue_connectable": {} - }, - "widgets_values": [ - 5 - ] - }, - { - "id": 219, - "type": "InvertMask", - "pos": [ - 400, - 990 - ], - "size": [ - 140, - 26 - ], - "flags": {}, - "order": 24, - "mode": 0, - "inputs": [ - { - "localized_name": "mask", - "name": "mask", - "type": "MASK", - "link": 351 - } - ], - "outputs": [ - { - "localized_name": "MASK", - "name": "MASK", - "type": "MASK", - "links": [ - 352 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.40", - "Node name for S&R": "InvertMask" - }, - "widgets_values": [] - }, - { - "id": 216, - "type": "MaskToImage", - "pos": [ - 560, - 990 - ], - "size": [ - 193.2779296875, - 26 - ], - "flags": {}, - "order": 23, - "mode": 0, - "inputs": [ - { - "localized_name": "mask", - "name": "mask", - "type": "MASK", - "link": 352 - } - ], - "outputs": [ - { - "localized_name": "IMAGE", - "name": "IMAGE", - "type": "IMAGE", - "links": [ - 334 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.40", - "Node name for S&R": "MaskToImage" - }, - "widgets_values": [] - }, - { - "id": 213, - "type": "RebatchImages", - "pos": [ - 410, - 690 - ], - "size": [ - 230, - 60 - ], - "flags": {}, - "order": 21, - "mode": 0, - "inputs": [ - { - "localized_name": "images", - "name": "images", - "type": "IMAGE", - "link": 360 - }, - { - "localized_name": "batch_size", - "name": "batch_size", - "type": "INT", - "widget": { - "name": "batch_size" - }, - "link": 340 - } - ], - "outputs": [ - { - "localized_name": "IMAGE", - "name": "IMAGE", - "shape": 6, - "type": "IMAGE", - "links": [ - 333 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.40", - "Node name for S&R": "RebatchImages" - }, - "widgets_values": [ - 1 - ] - }, - { - "id": 68, - "type": "CreateVideo", - "pos": [ - 1150, - 50 - ], 
- "size": [ - 270, - 78 - ], - "flags": { - "collapsed": false - }, - "order": 14, - "mode": 0, - "inputs": [ - { - "localized_name": "images", - "name": "images", - "type": "IMAGE", - "link": 139 - }, - { - "localized_name": "audio", - "name": "audio", - "shape": 7, - "type": "AUDIO", - "link": 362 - }, - { - "localized_name": "fps", - "name": "fps", - "type": "FLOAT", - "widget": { - "name": "fps" - }, - "link": 353 - } - ], - "outputs": [ - { - "localized_name": "VIDEO", - "name": "VIDEO", - "type": "VIDEO", - "links": [ - 129 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.34", - "Node name for S&R": "CreateVideo", - "enableTabs": false, - "tabWidth": 65, - "tabXOffset": 10, - "hasSecondTab": false, - "secondTabText": "Send Back", - "secondTabOffset": 80, - "secondTabWidth": 65, - "widget_ue_connectable": {} - }, - "widgets_values": [ - 16 - ] - }, - { - "id": 208, - "type": "ImageCompositeMasked", - "pos": [ - 410, - 790 - ], - "size": [ - 230, - 146 - ], - "flags": {}, - "order": 18, - "mode": 0, - "inputs": [ - { - "localized_name": "destination", - "name": "destination", - "type": "IMAGE", - "link": 333 - }, - { - "localized_name": "source", - "name": "source", - "type": "IMAGE", - "link": 334 - }, - { - "localized_name": "mask", - "name": "mask", - "shape": 7, - "type": "MASK", - "link": 335 - }, - { - "localized_name": "x", - "name": "x", - "type": "INT", - "widget": { - "name": "x" - }, - "link": null - }, - { - "localized_name": "y", - "name": "y", - "type": "INT", - "widget": { - "name": "y" - }, - "link": null - }, - { - "localized_name": "resize_source", - "name": "resize_source", - "type": "BOOLEAN", - "widget": { - "name": "resize_source" - }, - "link": null - } - ], - "outputs": [ - { - "localized_name": "IMAGE", - "name": "IMAGE", - "type": "IMAGE", - "links": [ - 341, - 344 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.40", - "Node name for S&R": "ImageCompositeMasked" - }, - "widgets_values": [ - 
0, - 0, - true - ] - }, - { - "id": 214, - "type": "PreviewImage", - "pos": [ - 760, - 690 - ], - "size": [ - 300, - 300 - ], - "flags": {}, - "order": 22, - "mode": 0, - "inputs": [ - { - "localized_name": "images", - "name": "images", - "type": "IMAGE", - "link": 341 - } - ], - "outputs": [], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.40", - "Node name for S&R": "PreviewImage" - }, - "widgets_values": [] - }, - { - "id": 111, - "type": "MaskToImage", - "pos": [ - 20, - 1270 - ], - "size": [ - 240, - 26 - ], - "flags": {}, - "order": 15, - "mode": 0, - "inputs": [ - { - "localized_name": "mask", - "name": "mask", - "type": "MASK", - "link": 345 - } - ], - "outputs": [ - { - "localized_name": "IMAGE", - "name": "IMAGE", - "type": "IMAGE", - "links": [ - 201 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.34", - "Node name for S&R": "MaskToImage", - "enableTabs": false, - "tabWidth": 65, - "tabXOffset": 10, - "hasSecondTab": false, - "secondTabText": "Send Back", - "secondTabOffset": 80, - "secondTabWidth": 65, - "widget_ue_connectable": {} - }, - "widgets_values": [] - }, - { - "id": 129, - "type": "RepeatImageBatch", - "pos": [ - 20, - 1160 - ], - "size": [ - 240, - 60 - ], - "flags": {}, - "order": 16, - "mode": 0, - "inputs": [ - { - "localized_name": "image", - "name": "image", - "type": "IMAGE", - "link": 201 - }, - { - "localized_name": "amount", - "name": "amount", - "type": "INT", - "widget": { - "name": "amount" - }, - "link": 346 - } - ], - "outputs": [ - { - "localized_name": "IMAGE", - "name": "IMAGE", - "type": "IMAGE", - "links": [ - 202 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.34", - "Node name for S&R": "RepeatImageBatch", - "enableTabs": false, - "tabWidth": 65, - "tabXOffset": 10, - "hasSecondTab": false, - "secondTabText": "Send Back", - "secondTabOffset": 80, - "secondTabWidth": 65, - "widget_ue_connectable": { - "amount": true - } - }, - "widgets_values": [ - 17 - ] - }, - { - 
"id": 130, - "type": "ImageToMask", - "pos": [ - 20, - 1050 - ], - "size": [ - 240, - 60 - ], - "flags": {}, - "order": 17, - "mode": 0, - "inputs": [ - { - "localized_name": "image", - "name": "image", - "type": "IMAGE", - "link": 202 - }, - { - "localized_name": "channel", - "name": "channel", - "type": "COMBO", - "widget": { - "name": "channel" - }, - "link": null - } - ], - "outputs": [ - { - "localized_name": "MASK", - "name": "MASK", - "type": "MASK", - "links": [ - 349 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.34", - "Node name for S&R": "ImageToMask", - "enableTabs": false, - "tabWidth": 65, - "tabXOffset": 10, - "hasSecondTab": false, - "secondTabText": "Send Back", - "secondTabOffset": 80, - "secondTabWidth": 65, - "widget_ue_connectable": {} - }, - "widgets_values": [ - "red" - ] - }, - { - "id": 3, - "type": "KSampler", - "pos": [ - 770, - 50 - ], - "size": [ - 315, - 262 - ], - "flags": {}, - "order": 10, - "mode": 0, - "inputs": [ - { - "localized_name": "model", - "name": "model", - "type": "MODEL", - "link": 280 - }, - { - "localized_name": "positive", - "name": "positive", - "type": "CONDITIONING", - "link": 98 - }, - { - "localized_name": "negative", - "name": "negative", - "type": "CONDITIONING", - "link": 99 - }, - { - "localized_name": "latent_image", - "name": "latent_image", - "type": "LATENT", - "link": 160 - }, - { - "localized_name": "seed", - "name": "seed", - "type": "INT", - "widget": { - "name": "seed" - }, - "link": null - }, - { - "localized_name": "steps", - "name": "steps", - "type": "INT", - "widget": { - "name": "steps" - }, - "link": null - }, - { - "localized_name": "cfg", - "name": "cfg", - "type": "FLOAT", - "widget": { - "name": "cfg" - }, - "link": null - }, - { - "localized_name": "sampler_name", - "name": "sampler_name", - "type": "COMBO", - "widget": { - "name": "sampler_name" - }, - "link": null - }, - { - "localized_name": "scheduler", - "name": "scheduler", - "type": "COMBO", - "widget": { 
- "name": "scheduler" - }, - "link": null - }, - { - "localized_name": "denoise", - "name": "denoise", - "type": "FLOAT", - "widget": { - "name": "denoise" - }, - "link": null - } - ], - "outputs": [ - { - "localized_name": "LATENT", - "name": "LATENT", - "type": "LATENT", - "slot_index": 0, - "links": [ - 116 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.34", - "Node name for S&R": "KSampler", - "enableTabs": false, - "tabWidth": 65, - "tabXOffset": 10, - "hasSecondTab": false, - "secondTabText": "Send Back", - "secondTabOffset": 80, - "secondTabWidth": 65, - "widget_ue_connectable": {} - }, - "widgets_values": [ - 584027519362099, - "randomize", - 4, - 1, - "uni_pc", - "simple", - 1 - ] - }, - { - "id": 224, - "type": "MarkdownNote", - "pos": [ - 420, - -160 - ], - "size": [ - 310, - 110 - ], - "flags": {}, - "order": 0, - "mode": 0, - "inputs": [], - "outputs": [], - "title": "About Video Size", - "properties": {}, - "widgets_values": [ - "| Model | 480P | 720P |\n| ------------------------------------------------------------ | ---- | ---- |\n| [VACE-1.3B](https://huggingface.co/Wan-AI/Wan2.1-VACE-1.3B) | ✅ | ❌ |\n| [VACE-14B](https://huggingface.co/Wan-AI/Wan2.1-VACE-14B) | ✅ | ✅ |" - ], - "color": "#432", - "bgcolor": "#000" - }, - { - "id": 223, - "type": "MarkdownNote", - "pos": [ - 770, - -210 - ], - "size": [ - 303.90106201171875, - 158.5415802001953 - ], - "flags": {}, - "order": 1, - "mode": 0, - "inputs": [], - "outputs": [], - "title": "KSampler Setting", - "properties": {}, - "widgets_values": [ - "## Default\n\n- steps:20\n- cfg:6.0\n\n## For CausVid LoRA\n\n- steps: 2-4\n- cfg: 1.0\n\n" - ], - "color": "#432", - "bgcolor": "#000" - }, - { - "id": 6, - "type": "CLIPTextEncode", - "pos": [ - -80, - 60 - ], - "size": [ - 420, - 280 - ], - "flags": {}, - "order": 7, - "mode": 0, - "inputs": [ - { - "localized_name": "clip", - "name": "clip", - "type": "CLIP", - "link": 74 - }, - { - "localized_name": "text", - "name": "text", - 
"type": "STRING", - "widget": { - "name": "text" - }, - "link": null - } - ], - "outputs": [ - { - "localized_name": "CONDITIONING", - "name": "CONDITIONING", - "type": "CONDITIONING", - "slot_index": 0, - "links": [ - 96 - ] - } - ], - "title": "CLIP Text Encode (Positive Prompt)", - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.34", - "Node name for S&R": "CLIPTextEncode", - "enableTabs": false, - "tabWidth": 65, - "tabXOffset": 10, - "hasSecondTab": false, - "secondTabText": "Send Back", - "secondTabOffset": 80, - "secondTabWidth": 65, - "widget_ue_connectable": {} - }, - "widgets_values": [ - "" - ], - "color": "#232", - "bgcolor": "#353" - }, - { - "id": 140, - "type": "UNETLoader", - "pos": [ - -505.8336486816406, - 88.22794342041016 - ], - "size": [ - 360, - 82 - ], - "flags": {}, - "order": 2, - "mode": 0, - "inputs": [ - { - "localized_name": "unet_name", - "name": "unet_name", - "type": "COMBO", - "widget": { - "name": "unet_name" - }, - "link": 363 - }, - { - "localized_name": "weight_dtype", - "name": "weight_dtype", - "type": "COMBO", - "widget": { - "name": "weight_dtype" - }, - "link": null - } - ], - "outputs": [ - { - "localized_name": "MODEL", - "name": "MODEL", - "type": "MODEL", - "slot_index": 0, - "links": [ - 248 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.34", - "Node name for S&R": "UNETLoader", - "models": [ - { - "name": "wan2.1_vace_14B_fp16.safetensors", - "url": "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/diffusion_models/wan2.1_vace_14B_fp16.safetensors", - "directory": "diffusion_models" - } - ], - "enableTabs": false, - "tabWidth": 65, - "tabXOffset": 10, - "hasSecondTab": false, - "secondTabText": "Send Back", - "secondTabOffset": 80, - "secondTabWidth": 65, - "widget_ue_connectable": {} - }, - "widgets_values": [ - "wan2.1_vace_14B_fp16.safetensors", - "fp8_e4m3fn_fast" - ] - }, - { - "id": 154, - "type": "LoraLoaderModelOnly", - "pos": [ - 
-505.8336486816406, - 228.2279510498047 - ], - "size": [ - 360, - 85.11004638671875 - ], - "flags": {}, - "order": 6, - "mode": 0, - "inputs": [ - { - "localized_name": "model", - "name": "model", - "type": "MODEL", - "link": 248 - }, - { - "localized_name": "lora_name", - "name": "lora_name", - "type": "COMBO", - "widget": { - "name": "lora_name" - }, - "link": 364 - }, - { - "localized_name": "strength_model", - "name": "strength_model", - "type": "FLOAT", - "widget": { - "name": "strength_model" - }, - "link": null - } - ], - "outputs": [ - { - "localized_name": "MODEL", - "name": "MODEL", - "type": "MODEL", - "links": [ - 279 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.34", - "Node name for S&R": "LoraLoaderModelOnly", - "models": [ - { - "name": "Wan21_CausVid_14B_T2V_lora_rank32.safetensors", - "url": "https://huggingface.co/Kijai/WanVideo_comfy/resolve/main/Wan21_CausVid_14B_T2V_lora_rank32.safetensors", - "directory": "loras" - } - ], - "enableTabs": false, - "tabWidth": 65, - "tabXOffset": 10, - "hasSecondTab": false, - "secondTabText": "Send Back", - "secondTabOffset": 80, - "secondTabWidth": 65, - "widget_ue_connectable": {} - }, - "widgets_values": [ - "Wan21_CausVid_14B_T2V_lora_rank32.safetensors", - 0.30000000000000004 - ] - }, - { - "id": 38, - "type": "CLIPLoader", - "pos": [ - -499.14141845703125, - 368.0911865234375 - ], - "size": [ - 360, - 106 - ], - "flags": {}, - "order": 3, - "mode": 0, - "inputs": [ - { - "localized_name": "clip_name", - "name": "clip_name", - "type": "COMBO", - "widget": { - "name": "clip_name" - }, - "link": 365 - }, - { - "localized_name": "type", - "name": "type", - "type": "COMBO", - "widget": { - "name": "type" - }, - "link": null - }, - { - "localized_name": "device", - "name": "device", - "shape": 7, - "type": "COMBO", - "widget": { - "name": "device" - }, - "link": null - } - ], - "outputs": [ - { - "localized_name": "CLIP", - "name": "CLIP", - "type": "CLIP", - "slot_index": 0, - 
"links": [ - 74, - 75 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.34", - "Node name for S&R": "CLIPLoader", - "models": [ - { - "name": "umt5_xxl_fp8_e4m3fn_scaled.safetensors", - "url": "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/text_encoders/umt5_xxl_fp8_e4m3fn_scaled.safetensors?download=true", - "directory": "text_encoders" - } - ], - "enableTabs": false, - "tabWidth": 65, - "tabXOffset": 10, - "hasSecondTab": false, - "secondTabText": "Send Back", - "secondTabOffset": 80, - "secondTabWidth": 65, - "widget_ue_connectable": {} - }, - "widgets_values": [ - "umt5_xxl_fp8_e4m3fn_scaled.safetensors", - "wan", - "default" - ] - }, - { - "id": 39, - "type": "VAELoader", - "pos": [ - -498.5298156738281, - 517.2576293945312 - ], - "size": [ - 360, - 60 - ], - "flags": {}, - "order": 4, - "mode": 0, - "inputs": [ - { - "localized_name": "vae_name", - "name": "vae_name", - "type": "COMBO", - "widget": { - "name": "vae_name" - }, - "link": 366 - } - ], - "outputs": [ - { - "localized_name": "VAE", - "name": "VAE", - "type": "VAE", - "slot_index": 0, - "links": [ - 76, - 101 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.34", - "Node name for S&R": "VAELoader", - "models": [ - { - "name": "wan_2.1_vae.safetensors", - "url": "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/vae/wan_2.1_vae.safetensors", - "directory": "vae" - } - ], - "enableTabs": false, - "tabWidth": 65, - "tabXOffset": 10, - "hasSecondTab": false, - "secondTabText": "Send Back", - "secondTabOffset": 80, - "secondTabWidth": 65, - "widget_ue_connectable": {} - }, - "widgets_values": [ - "wan_2.1_vae.safetensors" - ] - }, - { - "id": 221, - "type": "MarkdownNote", - "pos": [ - 380, - 1090 - ], - "size": [ - 480, - 170 - ], - "flags": {}, - "order": 5, - "mode": 0, - "inputs": [], - "outputs": [], - "title": "[EN] About video mask", - "properties": { - "widget_ue_connectable": {} - 
}, - "widgets_values": [ - "Currently, it's difficult to perfectly draw dynamic masks for different frames using only core nodes. However, to avoid requiring users to install additional custom nodes, our templates only use core nodes. You can refer to this implementation idea to achieve video inpainting.\n\nYou can use KJNode’s Points Editor and Sam2Segmentation to create some dynamic mask functions.\n\nCustom node links:\n- [ComfyUI-KJNodes](https://github.com/kijai/ComfyUI-KJNodes)\n- [ComfyUI-segment-anything-2](https://github.com/kijai/ComfyUI-segment-anything-2)" - ], - "color": "#432", - "bgcolor": "#000" - }, - { - "id": 7, - "type": "CLIPTextEncode", - "pos": [ - -80, - 390 - ], - "size": [ - 425.27801513671875, - 180.6060791015625 - ], - "flags": {}, - "order": 8, - "mode": 0, - "inputs": [ - { - "localized_name": "clip", - "name": "clip", - "type": "CLIP", - "link": 75 - }, - { - "localized_name": "text", - "name": "text", - "type": "STRING", - "widget": { - "name": "text" - }, - "link": null - } - ], - "outputs": [ - { - "localized_name": "CONDITIONING", - "name": "CONDITIONING", - "type": "CONDITIONING", - "slot_index": 0, - "links": [ - 97 - ] - } - ], - "title": "CLIP Text Encode (Negative Prompt)", - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.34", - "Node name for S&R": "CLIPTextEncode", - "enableTabs": false, - "tabWidth": 65, - "tabXOffset": 10, - "hasSecondTab": false, - "secondTabText": "Send Back", - "secondTabOffset": 80, - "secondTabWidth": 65, - "widget_ue_connectable": {} - }, - "widgets_values": [ - "过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走,过曝," - ], - "color": "#223", - "bgcolor": "#335" - }, - { - "id": 229, - "type": "ImageFromBatch", - "pos": [ - -510, - 800 - ], - "size": [ - 270, - 82 - ], - "flags": {}, - "order": 25, - "mode": 0, - "inputs": [ - { - "localized_name": "image", - "name": "image", - "type": "IMAGE", - "link": 358 - 
}, - { - "localized_name": "batch_index", - "name": "batch_index", - "type": "INT", - "widget": { - "name": "batch_index" - }, - "link": null - }, - { - "localized_name": "length", - "name": "length", - "type": "INT", - "widget": { - "name": "length" - }, - "link": null - } - ], - "outputs": [ - { - "localized_name": "IMAGE", - "name": "IMAGE", - "type": "IMAGE", - "links": [ - 359, - 360 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.13.0", - "Node name for S&R": "ImageFromBatch" - }, - "widgets_values": [ - 0, - 81 - ] - }, - { - "id": 49, - "type": "WanVaceToVideo", - "pos": [ - 400, - 200 - ], - "size": [ - 315, - 254 - ], - "flags": {}, - "order": 12, - "mode": 0, - "inputs": [ - { - "localized_name": "positive", - "name": "positive", - "type": "CONDITIONING", - "link": 96 - }, - { - "localized_name": "negative", - "name": "negative", - "type": "CONDITIONING", - "link": 97 - }, - { - "localized_name": "vae", - "name": "vae", - "type": "VAE", - "link": 101 - }, - { - "localized_name": "control_video", - "name": "control_video", - "shape": 7, - "type": "IMAGE", - "link": 344 - }, - { - "localized_name": "control_masks", - "name": "control_masks", - "shape": 7, - "type": "MASK", - "link": 349 - }, - { - "localized_name": "reference_image", - "name": "reference_image", - "shape": 7, - "type": "IMAGE", - "link": 361 - }, - { - "localized_name": "width", - "name": "width", - "type": "INT", - "widget": { - "name": "width" - }, - "link": 355 - }, - { - "localized_name": "height", - "name": "height", - "type": "INT", - "widget": { - "name": "height" - }, - "link": 356 - }, - { - "localized_name": "length", - "name": "length", - "type": "INT", - "widget": { - "name": "length" - }, - "link": null - }, - { - "localized_name": "batch_size", - "name": "batch_size", - "type": "INT", - "widget": { - "name": "batch_size" - }, - "link": null - }, - { - "localized_name": "strength", - "name": "strength", - "type": "FLOAT", - "widget": { - "name": "strength" 
- }, - "link": null - } - ], - "outputs": [ - { - "localized_name": "positive", - "name": "positive", - "type": "CONDITIONING", - "links": [ - 98 - ] - }, - { - "localized_name": "negative", - "name": "negative", - "type": "CONDITIONING", - "links": [ - 99 - ] - }, - { - "localized_name": "latent", - "name": "latent", - "type": "LATENT", - "links": [ - 160 - ] - }, - { - "localized_name": "trim_latent", - "name": "trim_latent", - "type": "INT", - "links": [ - 115 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.34", - "Node name for S&R": "WanVaceToVideo", - "enableTabs": false, - "tabWidth": 65, - "tabXOffset": 10, - "hasSecondTab": false, - "secondTabText": "Send Back", - "secondTabOffset": 80, - "secondTabWidth": 65, - "widget_ue_connectable": { - "width": true, - "height": true, - "length": true - } - }, - "widgets_values": [ - 720, - 720, - 81, - 1, - 1 - ] - }, - { - "id": 211, - "type": "GetImageSize", - "pos": [ - 70, - 800 - ], - "size": [ - 190, - 66 - ], - "flags": { - "collapsed": false - }, - "order": 20, - "mode": 0, - "inputs": [ - { - "localized_name": "image", - "name": "image", - "type": "IMAGE", - "link": 359 - } - ], - "outputs": [ - { - "localized_name": "width", - "name": "width", - "type": "INT", - "links": null - }, - { - "localized_name": "height", - "name": "height", - "type": "INT", - "links": null - }, - { - "localized_name": "batch_size", - "name": "batch_size", - "type": "INT", - "links": [ - 340, - 346 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.40", - "Node name for S&R": "GetImageSize" - }, - "widgets_values": [] - }, - { - "id": 210, - "type": "GetVideoComponents", - "pos": [ - -510, - 690 - ], - "size": [ - 193.530859375, - 66 - ], - "flags": {}, - "order": 19, - "mode": 0, - "inputs": [ - { - "localized_name": "video", - "name": "video", - "type": "VIDEO", - "link": 336 - } - ], - "outputs": [ - { - "localized_name": "images", - "name": "images", - "type": "IMAGE", - "links": [ - 
358 - ] - }, - { - "localized_name": "audio", - "name": "audio", - "type": "AUDIO", - "links": [ - 362 - ] - }, - { - "localized_name": "fps", - "name": "fps", - "type": "FLOAT", - "links": [ - 353 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.40", - "Node name for S&R": "GetVideoComponents" - }, - "widgets_values": [] - } - ], - "groups": [ - { - "id": 1, - "title": "Step1 - Load models here", - "bounding": [ - -540, - -30, - 430, - 620 - ], - "color": "#3f789e", - "font_size": 24, - "flags": {} - }, - { - "id": 2, - "title": "Prompt", - "bounding": [ - -90, - -30, - 450, - 620 - ], - "color": "#3f789e", - "font_size": 24, - "flags": {} - }, - { - "id": 3, - "title": "Sampling & Decoding", - "bounding": [ - 380, - -30, - 720, - 620 - ], - "color": "#3f789e", - "font_size": 24, - "flags": {} - }, - { - "id": 10, - "title": "Repeat Mask Batch", - "bounding": [ - -90, - 910, - 450, - 460 - ], - "color": "#3f789e", - "font_size": 24, - "flags": {} - }, - { - "id": 21, - "title": "Get video info", - "bounding": [ - -540, - 610, - 900, - 290 - ], - "color": "#3f789e", - "font_size": 24, - "flags": {} - }, - { - "id": 22, - "title": "Composite video & masks", - "bounding": [ - 380, - 610, - 720, - 420 - ], - "color": "#3f789e", - "font_size": 24, - "flags": {} - }, - { - "id": 23, - "title": "Step4 - Set video size & length", - "bounding": [ - 390, - 130, - 360, - 340 - ], - "color": "#A88", - "font_size": 24, - "flags": {} - }, - { - "id": 25, - "title": "14B", - "bounding": [ - -520, - 10, - 380, - 308.7100524902344 - ], - "color": "#3f789e", - "font_size": 24, - "flags": {} - } - ], - "links": [ - { - "id": 116, - "origin_id": 3, - "origin_slot": 0, - "target_id": 58, - "target_slot": 0, - "type": "LATENT" - }, - { - "id": 115, - "origin_id": 49, - "origin_slot": 3, - "target_id": 58, - "target_slot": 1, - "type": "INT" - }, - { - "id": 117, - "origin_id": 58, - "origin_slot": 0, - "target_id": 8, - "target_slot": 0, - "type": "LATENT" - }, - 
{ - "id": 76, - "origin_id": 39, - "origin_slot": 0, - "target_id": 8, - "target_slot": 1, - "type": "VAE" - }, - { - "id": 279, - "origin_id": 154, - "origin_slot": 0, - "target_id": 48, - "target_slot": 0, - "type": "MODEL" - }, - { - "id": 352, - "origin_id": 219, - "origin_slot": 0, - "target_id": 216, - "target_slot": 0, - "type": "MASK" - }, - { - "id": 340, - "origin_id": 211, - "origin_slot": 2, - "target_id": 213, - "target_slot": 1, - "type": "INT" - }, - { - "id": 96, - "origin_id": 6, - "origin_slot": 0, - "target_id": 49, - "target_slot": 0, - "type": "CONDITIONING" - }, - { - "id": 97, - "origin_id": 7, - "origin_slot": 0, - "target_id": 49, - "target_slot": 1, - "type": "CONDITIONING" - }, - { - "id": 101, - "origin_id": 39, - "origin_slot": 0, - "target_id": 49, - "target_slot": 2, - "type": "VAE" - }, - { - "id": 344, - "origin_id": 208, - "origin_slot": 0, - "target_id": 49, - "target_slot": 3, - "type": "IMAGE" - }, - { - "id": 349, - "origin_id": 130, - "origin_slot": 0, - "target_id": 49, - "target_slot": 4, - "type": "MASK" - }, - { - "id": 139, - "origin_id": 8, - "origin_slot": 0, - "target_id": 68, - "target_slot": 0, - "type": "IMAGE" - }, - { - "id": 353, - "origin_id": 210, - "origin_slot": 2, - "target_id": 68, - "target_slot": 2, - "type": "FLOAT" - }, - { - "id": 333, - "origin_id": 213, - "origin_slot": 0, - "target_id": 208, - "target_slot": 0, - "type": "IMAGE" - }, - { - "id": 334, - "origin_id": 216, - "origin_slot": 0, - "target_id": 208, - "target_slot": 1, - "type": "IMAGE" - }, - { - "id": 341, - "origin_id": 208, - "origin_slot": 0, - "target_id": 214, - "target_slot": 0, - "type": "IMAGE" - }, - { - "id": 201, - "origin_id": 111, - "origin_slot": 0, - "target_id": 129, - "target_slot": 0, - "type": "IMAGE" - }, - { - "id": 346, - "origin_id": 211, - "origin_slot": 2, - "target_id": 129, - "target_slot": 1, - "type": "INT" - }, - { - "id": 202, - "origin_id": 129, - "origin_slot": 0, - "target_id": 130, - "target_slot": 0, - 
"type": "IMAGE" - }, - { - "id": 280, - "origin_id": 48, - "origin_slot": 0, - "target_id": 3, - "target_slot": 0, - "type": "MODEL" - }, - { - "id": 98, - "origin_id": 49, - "origin_slot": 0, - "target_id": 3, - "target_slot": 1, - "type": "CONDITIONING" - }, - { - "id": 99, - "origin_id": 49, - "origin_slot": 1, - "target_id": 3, - "target_slot": 2, - "type": "CONDITIONING" - }, - { - "id": 160, - "origin_id": 49, - "origin_slot": 2, - "target_id": 3, - "target_slot": 3, - "type": "LATENT" - }, - { - "id": 74, - "origin_id": 38, - "origin_slot": 0, - "target_id": 6, - "target_slot": 0, - "type": "CLIP" - }, - { - "id": 248, - "origin_id": 140, - "origin_slot": 0, - "target_id": 154, - "target_slot": 0, - "type": "MODEL" - }, - { - "id": 75, - "origin_id": 38, - "origin_slot": 0, - "target_id": 7, - "target_slot": 0, - "type": "CLIP" - }, - { - "id": 351, - "origin_id": -10, - "origin_slot": 0, - "target_id": 219, - "target_slot": 0, - "type": "MASK" - }, - { - "id": 335, - "origin_id": -10, - "origin_slot": 0, - "target_id": 208, - "target_slot": 2, - "type": "MASK" - }, - { - "id": 345, - "origin_id": -10, - "origin_slot": 0, - "target_id": 111, - "target_slot": 0, - "type": "MASK" - }, - { - "id": 336, - "origin_id": -10, - "origin_slot": 1, - "target_id": 210, - "target_slot": 0, - "type": "VIDEO" - }, - { - "id": 129, - "origin_id": 68, - "origin_slot": 0, - "target_id": -20, - "target_slot": 0, - "type": "VIDEO" - }, - { - "id": 355, - "origin_id": -10, - "origin_slot": 2, - "target_id": 49, - "target_slot": 6, - "type": "INT" - }, - { - "id": 356, - "origin_id": -10, - "origin_slot": 3, - "target_id": 49, - "target_slot": 7, - "type": "INT" - }, - { - "id": 358, - "origin_id": 210, - "origin_slot": 0, - "target_id": 229, - "target_slot": 0, - "type": "IMAGE" - }, - { - "id": 359, - "origin_id": 229, - "origin_slot": 0, - "target_id": 211, - "target_slot": 0, - "type": "IMAGE" - }, - { - "id": 360, - "origin_id": 229, - "origin_slot": 0, - "target_id": 213, 
- "target_slot": 0, - "type": "IMAGE" - }, - { - "id": 361, - "origin_id": -10, - "origin_slot": 4, - "target_id": 49, - "target_slot": 5, - "type": "IMAGE" - }, - { - "id": 362, - "origin_id": 210, - "origin_slot": 1, - "target_id": 68, - "target_slot": 1, - "type": "AUDIO" - }, - { - "id": 363, - "origin_id": -10, - "origin_slot": 5, - "target_id": 140, - "target_slot": 0, - "type": "COMBO" - }, - { - "id": 364, - "origin_id": -10, - "origin_slot": 6, - "target_id": 154, - "target_slot": 1, - "type": "COMBO" - }, - { - "id": 365, - "origin_id": -10, - "origin_slot": 7, - "target_id": 38, - "target_slot": 0, - "type": "COMBO" - }, - { - "id": 366, - "origin_id": -10, - "origin_slot": 8, - "target_id": 39, - "target_slot": 0, - "type": "COMBO" - } - ], - "extra": { - "workflowRendererVersion": "LG" - }, - "category": "Video generation and editing/Inpaint video", - "description": "Inpaints masked regions in video frames using Wan 2.1 VACE." - } - ] - }, - "config": {}, - "extra": { - "workflowRendererVersion": "LG", - "ds": { - "scale": 0.8183828377358485, - "offset": [ - 1215.8643989712405, - 178.87024992690183 - ] - } - }, - "version": 0.4 -} diff --git a/blueprints/Video Inpainting (Wan2.1 VACE).json b/blueprints/Video Inpainting (Wan2.1 VACE).json new file mode 100644 index 000000000..7460f3d44 --- /dev/null +++ b/blueprints/Video Inpainting (Wan2.1 VACE).json @@ -0,0 +1,4196 @@ +{ + "revision": 0, + "last_node_id": 306, + "last_link_id": 0, + "nodes": [ + { + "id": 306, + "type": "bd7f73a0-ec67-4f46-8671-17088d8e31b7", + "pos": [ + -2950, + -410 + ], + "size": [ + 440, + 650 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "label": "source_video", + "localized_name": "video", + "name": "video", + "type": "VIDEO", + "link": null + }, + { + "label": "reference_image", + "name": "reference_image_1", + "shape": 7, + "type": "IMAGE", + "link": null + }, + { + "label": "prompt", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + 
}, + "link": null + }, + { + "label": "width", + "name": "value", + "type": "INT", + "widget": { + "name": "value" + }, + "link": null + }, + { + "label": "height", + "name": "value_1", + "type": "INT", + "widget": { + "name": "value_1" + }, + "link": null + }, + { + "label": "frame_counts", + "name": "length", + "type": "INT", + "widget": { + "name": "length" + }, + "link": null + }, + { + "name": "seed", + "type": "INT", + "widget": { + "name": "seed" + }, + "link": null + }, + { + "label": "wan_vace_model", + "name": "unet_name", + "type": "COMBO", + "widget": { + "name": "unet_name" + }, + "link": null + }, + { + "label": "clip_model", + "name": "clip_name", + "type": "COMBO", + "widget": { + "name": "clip_name" + }, + "link": null + }, + { + "label": "vae_model", + "name": "vae_name", + "type": "COMBO", + "widget": { + "name": "vae_name" + }, + "link": null + }, + { + "label": "enable_turbo_mode", + "name": "value_2", + "type": "BOOLEAN", + "widget": { + "name": "value_2" + }, + "link": null + }, + { + "label": "lightning_lora", + "name": "lora_name", + "type": "COMBO", + "widget": { + "name": "lora_name" + }, + "link": null + }, + { + "label": "sam3_mask_object", + "name": "text_1", + "type": "STRING", + "widget": { + "name": "text_1" + }, + "link": null + }, + { + "label": "mask_expand", + "name": "expand", + "type": "INT", + "widget": { + "name": "expand" + }, + "link": null + }, + { + "label": "sam3_model", + "name": "ckpt_name", + "type": "COMBO", + "widget": { + "name": "ckpt_name" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "VIDEO", + "name": "VIDEO", + "type": "VIDEO", + "links": [] + } + ], + "properties": { + "proxyWidgets": [ + [ + "280", + "text" + ], + [ + "297", + "value" + ], + [ + "290", + "value" + ], + [ + "289", + "length" + ], + [ + "288", + "seed" + ], + [ + "299", + "unet_name" + ], + [ + "277", + "clip_name" + ], + [ + "278", + "vae_name" + ], + [ + "300", + "value" + ], + [ + "272", + "lora_name" + ], + [ + 
"268", + "text" + ], + [ + "269", + "expand" + ], + [ + "268", + "ckpt_name" + ], + [ + "312", + "$$canvas-image-preview" + ] + ], + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [], + "title": "Video Inpainting (Wan2.1 VACE)" + } + ], + "links": [], + "version": 0.4, + "definitions": { + "subgraphs": [ + { + "id": "bd7f73a0-ec67-4f46-8671-17088d8e31b7", + "version": 1, + "state": { + "lastGroupId": 31, + "lastNodeId": 315, + "lastLinkId": 499, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Video Inpainting (Wan2.1 VACE)", + "inputNode": { + "id": -10, + "bounding": [ + -3450, + 3170, + 159.744140625, + 348 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + 900, + 2840, + 128, + 68 + ] + }, + "inputs": [ + { + "id": "a636746e-5b9f-4b91-96f0-7f2657415b93", + "name": "video", + "type": "VIDEO", + "linkIds": [ + 473 + ], + "localized_name": "video", + "label": "source_video", + "pos": [ + -3314.255859375, + 3194 + ] + }, + { + "id": "46275350-98b8-4d7c-8ca4-c452dc40a6bd", + "name": "reference_image_1", + "type": "IMAGE", + "linkIds": [ + 478 + ], + "label": "reference_image", + "pos": [ + -3314.255859375, + 3214 + ] + }, + { + "id": "0f5bee71-3485-4e10-81a7-2b9f85851353", + "name": "text", + "type": "STRING", + "linkIds": [ + 479 + ], + "label": "prompt", + "pos": [ + -3314.255859375, + 3234 + ] + }, + { + "id": "16675512-c229-43ed-944e-190a7f61b571", + "name": "value", + "type": "INT", + "linkIds": [ + 480 + ], + "label": "width", + "pos": [ + -3314.255859375, + 3254 + ] + }, + { + "id": "84330129-a0c7-44cd-91fe-c033946749db", + "name": "value_1", + "type": "INT", + "linkIds": [ + 481 + ], + "label": "height", + "pos": [ + -3314.255859375, + 3274 + ] + }, + { + "id": "3bd895e6-cba9-477b-bf6e-8c77dd56bb4a", + "name": "length", + "type": "INT", + 
"linkIds": [ + 494 + ], + "label": "frame_counts", + "pos": [ + -3314.255859375, + 3294 + ] + }, + { + "id": "dbc2e9c5-f86a-48ba-874a-2991c75d1ae7", + "name": "seed", + "type": "INT", + "linkIds": [ + 483 + ], + "pos": [ + -3314.255859375, + 3314 + ] + }, + { + "id": "572db94d-e64d-464f-bf3c-23a23aeb79f1", + "name": "unet_name", + "type": "COMBO", + "linkIds": [ + 485 + ], + "label": "wan_vace_model", + "pos": [ + -3314.255859375, + 3334 + ] + }, + { + "id": "32185180-f627-47c2-971b-6ef3007e9455", + "name": "clip_name", + "type": "COMBO", + "linkIds": [ + 486 + ], + "label": "clip_model", + "pos": [ + -3314.255859375, + 3354 + ] + }, + { + "id": "2af354d3-108a-42a9-acfc-7bad158715aa", + "name": "vae_name", + "type": "COMBO", + "linkIds": [ + 487 + ], + "label": "vae_model", + "pos": [ + -3314.255859375, + 3374 + ] + }, + { + "id": "c9777a8c-267f-4c5e-b4d5-e9727d822e50", + "name": "value_2", + "type": "BOOLEAN", + "linkIds": [ + 489 + ], + "label": "enable_turbo_mode", + "pos": [ + -3314.255859375, + 3394 + ] + }, + { + "id": "84a258a3-4f25-4edb-9f50-6fcd8411394e", + "name": "lora_name", + "type": "COMBO", + "linkIds": [ + 490 + ], + "label": "lightning_lora", + "pos": [ + -3314.255859375, + 3414 + ] + }, + { + "id": "9c5fb6f8-407b-4a13-94d8-cbbba546a082", + "name": "text_1", + "type": "STRING", + "linkIds": [ + 491 + ], + "label": "sam3_mask_object", + "pos": [ + -3314.255859375, + 3434 + ] + }, + { + "id": "598323c9-2256-44bd-9745-492a74628300", + "name": "expand", + "type": "INT", + "linkIds": [ + 496 + ], + "label": "mask_expand", + "pos": [ + -3314.255859375, + 3454 + ] + }, + { + "id": "856c1937-8caa-4d85-9d8a-6a900234d6d6", + "name": "ckpt_name", + "type": "COMBO", + "linkIds": [ + 497 + ], + "label": "sam3_model", + "pos": [ + -3314.255859375, + 3474 + ] + } + ], + "outputs": [ + { + "id": "be46c9d5-ced7-445b-996f-fff59d9b684d", + "name": "VIDEO", + "type": "VIDEO", + "linkIds": [ + 474 + ], + "localized_name": "VIDEO", + "pos": [ + 924, + 2864 + ] + } + ], 
+ "widgets": [], + "nodes": [ + { + "id": 266, + "type": "ModelSamplingSD3", + "pos": [ + -560, + 1940 + ], + "size": [ + 320, + 110 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 422 + }, + { + "localized_name": "shift", + "name": "shift", + "type": "FLOAT", + "widget": { + "name": "shift" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": "MODEL", + "slot_index": 0, + "links": [ + 454 + ] + } + ], + "properties": { + "Node name for S&R": "ModelSamplingSD3", + "cnr_id": "comfy-core", + "ver": "0.3.34", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "widget_ue_connectable": {} + }, + "widgets_values": [ + 5 + ] + }, + { + "id": 267, + "type": "CreateVideo", + "pos": [ + 530, + 2590 + ], + "size": [ + 310, + 130 + ], + "flags": { + "collapsed": false + }, + "order": 6, + "mode": 0, + "inputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "link": 423 + }, + { + "localized_name": "audio", + "name": "audio", + "shape": 7, + "type": "AUDIO", + "link": 424 + }, + { + "localized_name": "fps", + "name": "fps", + "type": "FLOAT", + "widget": { + "name": "fps" + }, + "link": 425 + } + ], + "outputs": [ + { + "localized_name": "VIDEO", + "name": "VIDEO", + "type": "VIDEO", + "links": [ + 474 + ] + } + ], + "properties": { + "Node name for S&R": "CreateVideo", + "cnr_id": "comfy-core", + "ver": "0.3.34", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "widget_ue_connectable": {} + }, + "widgets_values": [ + 16 + ] + }, + { + "id": 268, + "type": "17df2eeb-d89e-46ee-9480-a4ca2494b207", + "pos": [ + -1960, + 3220 + ], + "size": [ + 290, + 370 + ], + "flags": {}, 
+ "order": 7, + "mode": 0, + "inputs": [ + { + "label": "image", + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 426 + }, + { + "label": "object", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + }, + "link": 491 + }, + { + "name": "bboxes", + "shape": 7, + "type": "BOUNDING_BOX", + "link": null + }, + { + "name": "positive_coords", + "shape": 7, + "type": "STRING", + "link": null + }, + { + "name": "negative_coords", + "shape": 7, + "type": "STRING", + "link": null + }, + { + "name": "threshold", + "type": "FLOAT", + "widget": { + "name": "threshold" + }, + "link": null + }, + { + "name": "refine_iterations", + "type": "INT", + "widget": { + "name": "refine_iterations" + }, + "link": null + }, + { + "name": "individual_masks", + "type": "BOOLEAN", + "widget": { + "name": "individual_masks" + }, + "link": null + }, + { + "name": "ckpt_name", + "type": "COMBO", + "widget": { + "name": "ckpt_name" + }, + "link": 497 + } + ], + "outputs": [ + { + "localized_name": "masks", + "name": "masks", + "type": "MASK", + "links": [ + 427 + ] + }, + { + "localized_name": "bboxes", + "name": "bboxes", + "type": "BOUNDING_BOX", + "links": [] + } + ], + "properties": { + "proxyWidgets": [ + [ + "237", + "text" + ], + [ + "75", + "threshold" + ], + [ + "75", + "refine_iterations" + ], + [ + "75", + "individual_masks" + ], + [ + "236", + "ckpt_name" + ] + ], + "cnr_id": "comfy-core", + "ver": "0.19.3", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "ue_properties": { + "widget_ue_connectable": { + "text": true + }, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [] + }, + { + "id": 269, + "type": "GrowMask", + "pos": [ + -1530, + 3220 + ], + "size": [ + 270, + 140 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "localized_name": "mask", + "name": "mask", + "type": 
"MASK", + "link": 427 + }, + { + "localized_name": "expand", + "name": "expand", + "type": "INT", + "widget": { + "name": "expand" + }, + "link": 496 + }, + { + "localized_name": "tapered_corners", + "name": "tapered_corners", + "type": "BOOLEAN", + "widget": { + "name": "tapered_corners" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "MASK", + "name": "MASK", + "type": "MASK", + "links": [ + 441, + 445, + 449, + 498 + ] + } + ], + "properties": { + "Node name for S&R": "GrowMask", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 20, + true + ] + }, + { + "id": 270, + "type": "PrimitiveInt", + "pos": [ + -1350, + 1980 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "INT", + "widget": { + "name": "value" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 466 + ] + } + ], + "title": "Int (Steps)", + "properties": { + "Node name for S&R": "PrimitiveInt", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 20, + "fixed" + ] + }, + { + "id": 271, + "type": "PrimitiveFloat", + "pos": [ + -1340, + 2140 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "FLOAT", + "widget": { + "name": "value" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "links": [ + 432 + ] + } + ], + "title": "Float (CFG)", + "properties": { + "Node name for S&R": 
"PrimitiveFloat", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 6 + ] + }, + { + "id": 272, + "type": "LoraLoaderModelOnly", + "pos": [ + -1380, + 2390 + ], + "size": [ + 350, + 140 + ], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 428 + }, + { + "localized_name": "lora_name", + "name": "lora_name", + "type": "COMBO", + "widget": { + "name": "lora_name" + }, + "link": 490 + }, + { + "localized_name": "strength_model", + "name": "strength_model", + "type": "FLOAT", + "widget": { + "name": "strength_model" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": "MODEL", + "links": [ + 430 + ] + } + ], + "properties": { + "Node name for S&R": "LoraLoaderModelOnly", + "cnr_id": "comfy-core", + "ver": "0.3.34", + "models": [ + { + "name": "Wan21_CausVid_14B_T2V_lora_rank32.safetensors", + "url": "https://huggingface.co/Kijai/WanVideo_comfy/resolve/main/Wan21_CausVid_14B_T2V_lora_rank32.safetensors", + "directory": "loras" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "widget_ue_connectable": {} + }, + "widgets_values": [ + "Wan21_CausVid_14B_T2V_lora_rank32.safetensors", + 0.30000000000000004 + ] + }, + { + "id": 273, + "type": "PrimitiveInt", + "pos": [ + -1340, + 2600 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "INT", + "widget": { + "name": "value" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 467 + ] + } + ], + "title": 
"Int (Steps)", + "properties": { + "Node name for S&R": "PrimitiveInt", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 6, + "fixed" + ] + }, + { + "id": 274, + "type": "PrimitiveFloat", + "pos": [ + -1340, + 2760 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "FLOAT", + "widget": { + "name": "value" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "links": [ + 433 + ] + } + ], + "title": "Float (CFG)", + "properties": { + "Node name for S&R": "PrimitiveFloat", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 1 + ] + }, + { + "id": 275, + "type": "ComfySwitchNode", + "pos": [ + -960, + 2530 + ], + "size": [ + 270, + 130 + ], + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "localized_name": "on_false", + "name": "on_false", + "type": "*", + "link": 429 + }, + { + "localized_name": "on_true", + "name": "on_true", + "type": "*", + "link": 430 + }, + { + "localized_name": "switch", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": 431 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "*", + "links": [ + 422 + ] + } + ], + "properties": { + "Node name for S&R": "ComfySwitchNode", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + false + ] + }, + { 
+ "id": 276, + "type": "ComfySwitchNode", + "pos": [ + -960, + 2340 + ], + "size": [ + 270, + 130 + ], + "flags": {}, + "order": 11, + "mode": 0, + "inputs": [ + { + "localized_name": "on_false", + "name": "on_false", + "type": "*", + "link": 432 + }, + { + "localized_name": "on_true", + "name": "on_true", + "type": "*", + "link": 433 + }, + { + "localized_name": "switch", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": 434 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "*", + "links": [ + 459 + ] + } + ], + "properties": { + "Node name for S&R": "ComfySwitchNode", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + false + ] + }, + { + "id": 277, + "type": "CLIPLoader", + "pos": [ + -2710, + 2210 + ], + "size": [ + 360, + 170 + ], + "flags": {}, + "order": 12, + "mode": 0, + "inputs": [ + { + "localized_name": "clip_name", + "name": "clip_name", + "type": "COMBO", + "widget": { + "name": "clip_name" + }, + "link": 486 + }, + { + "localized_name": "type", + "name": "type", + "type": "COMBO", + "widget": { + "name": "type" + }, + "link": null + }, + { + "localized_name": "device", + "name": "device", + "shape": 7, + "type": "COMBO", + "widget": { + "name": "device" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "slot_index": 0, + "links": [ + 435, + 436 + ] + } + ], + "properties": { + "Node name for S&R": "CLIPLoader", + "cnr_id": "comfy-core", + "ver": "0.3.34", + "models": [ + { + "name": "umt5_xxl_fp8_e4m3fn_scaled.safetensors", + "url": "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/text_encoders/umt5_xxl_fp8_e4m3fn_scaled.safetensors?download=true", + "directory": "text_encoders" + } + ], + 
"enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "widget_ue_connectable": {} + }, + "widgets_values": [ + "umt5_xxl_fp8_e4m3fn_scaled.safetensors", + "wan", + "default" + ] + }, + { + "id": 278, + "type": "VAELoader", + "pos": [ + -2700, + 2500 + ], + "size": [ + 360, + 110 + ], + "flags": {}, + "order": 13, + "mode": 0, + "inputs": [ + { + "localized_name": "vae_name", + "name": "vae_name", + "type": "COMBO", + "widget": { + "name": "vae_name" + }, + "link": 487 + } + ], + "outputs": [ + { + "localized_name": "VAE", + "name": "VAE", + "type": "VAE", + "slot_index": 0, + "links": [ + 439, + 471 + ] + } + ], + "properties": { + "Node name for S&R": "VAELoader", + "cnr_id": "comfy-core", + "ver": "0.3.34", + "models": [ + { + "name": "wan_2.1_vae.safetensors", + "url": "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/vae/wan_2.1_vae.safetensors", + "directory": "vae" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "widget_ue_connectable": {} + }, + "widgets_values": [ + "wan_2.1_vae.safetensors" + ] + }, + { + "id": 279, + "type": "CLIPTextEncode", + "pos": [ + -2280, + 2410 + ], + "size": [ + 430, + 190 + ], + "flags": {}, + "order": 14, + "mode": 0, + "inputs": [ + { + "localized_name": "clip", + "name": "clip", + "type": "CLIP", + "link": 435 + }, + { + "localized_name": "text", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "CONDITIONING", + "name": "CONDITIONING", + "type": "CONDITIONING", + "slot_index": 0, + "links": [ + 438 + ] + } + ], + "title": "CLIP Text Encode (Negative Prompt)", + "properties": { + "Node name for S&R": "CLIPTextEncode", + "cnr_id": "comfy-core", + "ver": "0.3.34", + 
"enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "widget_ue_connectable": {} + }, + "widgets_values": [ + "过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走,过曝," + ], + "color": "#223", + "bgcolor": "#335" + }, + { + "id": 280, + "type": "CLIPTextEncode", + "pos": [ + -2270, + 1940 + ], + "size": [ + 420, + 420 + ], + "flags": {}, + "order": 15, + "mode": 0, + "inputs": [ + { + "localized_name": "clip", + "name": "clip", + "type": "CLIP", + "link": 436 + }, + { + "localized_name": "text", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + }, + "link": 479 + } + ], + "outputs": [ + { + "localized_name": "CONDITIONING", + "name": "CONDITIONING", + "type": "CONDITIONING", + "slot_index": 0, + "links": [ + 437 + ] + } + ], + "title": "CLIP Text Encode (Positive Prompt)", + "properties": { + "Node name for S&R": "CLIPTextEncode", + "cnr_id": "comfy-core", + "ver": "0.3.34", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "widget_ue_connectable": {} + }, + "widgets_values": [ + "" + ], + "color": "#232", + "bgcolor": "#353" + }, + { + "id": 281, + "type": "WanVaceToVideo", + "pos": [ + -1780, + 1940 + ], + "size": [ + 320, + 360 + ], + "flags": {}, + "order": 16, + "mode": 0, + "inputs": [ + { + "localized_name": "positive", + "name": "positive", + "type": "CONDITIONING", + "link": 437 + }, + { + "localized_name": "negative", + "name": "negative", + "type": "CONDITIONING", + "link": 438 + }, + { + "localized_name": "vae", + "name": "vae", + "type": "VAE", + "link": 439 + }, + { + "localized_name": "control_video", + "name": "control_video", + "shape": 7, + "type": "IMAGE", + "link": 440 + }, + { + "localized_name": "control_masks", + 
"name": "control_masks", + "shape": 7, + "type": "MASK", + "link": 441 + }, + { + "localized_name": "reference_image", + "name": "reference_image", + "shape": 7, + "type": "IMAGE", + "link": 478 + }, + { + "localized_name": "width", + "name": "width", + "type": "INT", + "widget": { + "name": "width" + }, + "link": 442 + }, + { + "localized_name": "height", + "name": "height", + "type": "INT", + "widget": { + "name": "height" + }, + "link": 443 + }, + { + "localized_name": "length", + "name": "length", + "type": "INT", + "widget": { + "name": "length" + }, + "link": 444 + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "widget": { + "name": "batch_size" + }, + "link": null + }, + { + "localized_name": "strength", + "name": "strength", + "type": "FLOAT", + "widget": { + "name": "strength" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "positive", + "name": "positive", + "type": "CONDITIONING", + "links": [ + 455 + ] + }, + { + "localized_name": "negative", + "name": "negative", + "type": "CONDITIONING", + "links": [ + 456 + ] + }, + { + "localized_name": "latent", + "name": "latent", + "type": "LATENT", + "links": [ + 457 + ] + }, + { + "localized_name": "trim_latent", + "name": "trim_latent", + "type": "INT", + "links": [ + 453 + ] + } + ], + "properties": { + "Node name for S&R": "WanVaceToVideo", + "cnr_id": "comfy-core", + "ver": "0.3.34", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "widget_ue_connectable": { + "width": true, + "height": true, + "length": true + } + }, + "widgets_values": [ + 720, + 720, + 81, + 1, + 1 + ] + }, + { + "id": 282, + "type": "InvertMask", + "pos": [ + -1510, + 3410 + ], + "size": [ + 230, + 80 + ], + "flags": {}, + "order": 17, + "mode": 0, + "inputs": [ + { + "localized_name": "mask", + "name": "mask", + "type": "MASK", + "link": 445 + } + ], + 
"outputs": [ + { + "localized_name": "MASK", + "name": "MASK", + "type": "MASK", + "links": [ + 446 + ] + } + ], + "properties": { + "Node name for S&R": "InvertMask", + "cnr_id": "comfy-core", + "ver": "0.3.40", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 283, + "type": "MaskToImage", + "pos": [ + -1510, + 3550 + ], + "size": [ + 230, + 80 + ], + "flags": {}, + "order": 18, + "mode": 0, + "inputs": [ + { + "localized_name": "mask", + "name": "mask", + "type": "MASK", + "link": 446 + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 448 + ] + } + ], + "properties": { + "Node name for S&R": "MaskToImage", + "cnr_id": "comfy-core", + "ver": "0.3.40", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 284, + "type": "ImageCompositeMasked", + "pos": [ + -1210, + 3210 + ], + "size": [ + 230, + 220 + ], + "flags": {}, + "order": 19, + "mode": 0, + "inputs": [ + { + "localized_name": "destination", + "name": "destination", + "type": "IMAGE", + "link": 447 + }, + { + "localized_name": "source", + "name": "source", + "type": "IMAGE", + "link": 448 + }, + { + "localized_name": "mask", + "name": "mask", + "shape": 7, + "type": "MASK", + "link": 449 + }, + { + "localized_name": "x", + "name": "x", + "type": "INT", + "widget": { + "name": "x" + }, + "link": null + }, + { + "localized_name": "y", + "name": "y", + "type": "INT", + "widget": { + "name": "y" + }, + "link": null + }, + { + "localized_name": "resize_source", + "name": "resize_source", + "type": "BOOLEAN", + "widget": { + "name": "resize_source" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 440, + 499 + ] 
+ } + ], + "properties": { + "Node name for S&R": "ImageCompositeMasked", + "cnr_id": "comfy-core", + "ver": "0.3.40", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 0, + 0, + true + ] + }, + { + "id": 287, + "type": "TrimVideoLatent", + "pos": [ + -220, + 1950 + ], + "size": [ + 320, + 110 + ], + "flags": { + "collapsed": false + }, + "order": 20, + "mode": 0, + "inputs": [ + { + "localized_name": "samples", + "name": "samples", + "type": "LATENT", + "link": 452 + }, + { + "localized_name": "trim_amount", + "name": "trim_amount", + "type": "INT", + "widget": { + "name": "trim_amount" + }, + "link": 453 + } + ], + "outputs": [ + { + "localized_name": "LATENT", + "name": "LATENT", + "type": "LATENT", + "links": [ + 470 + ] + } + ], + "properties": { + "Node name for S&R": "TrimVideoLatent", + "cnr_id": "comfy-core", + "ver": "0.3.34", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "widget_ue_connectable": { + "trim_amount": true + } + }, + "widgets_values": [ + 0 + ] + }, + { + "id": 288, + "type": "KSampler", + "pos": [ + -560, + 2120 + ], + "size": [ + 320, + 350 + ], + "flags": {}, + "order": 21, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 454 + }, + { + "localized_name": "positive", + "name": "positive", + "type": "CONDITIONING", + "link": 455 + }, + { + "localized_name": "negative", + "name": "negative", + "type": "CONDITIONING", + "link": 456 + }, + { + "localized_name": "latent_image", + "name": "latent_image", + "type": "LATENT", + "link": 457 + }, + { + "localized_name": "seed", + "name": "seed", + "type": "INT", + "widget": { + "name": "seed" + }, + "link": 483 + }, + { + "localized_name": "steps", + "name": "steps", + "type": 
"INT", + "widget": { + "name": "steps" + }, + "link": 458 + }, + { + "localized_name": "cfg", + "name": "cfg", + "type": "FLOAT", + "widget": { + "name": "cfg" + }, + "link": 459 + }, + { + "localized_name": "sampler_name", + "name": "sampler_name", + "type": "COMBO", + "widget": { + "name": "sampler_name" + }, + "link": null + }, + { + "localized_name": "scheduler", + "name": "scheduler", + "type": "COMBO", + "widget": { + "name": "scheduler" + }, + "link": null + }, + { + "localized_name": "denoise", + "name": "denoise", + "type": "FLOAT", + "widget": { + "name": "denoise" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "LATENT", + "name": "LATENT", + "type": "LATENT", + "slot_index": 0, + "links": [ + 452 + ] + } + ], + "properties": { + "Node name for S&R": "KSampler", + "cnr_id": "comfy-core", + "ver": "0.3.34", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "widget_ue_connectable": {} + }, + "widgets_values": [ + 832378512055965, + "fixed", + 4, + 1, + "uni_pc", + "simple", + 1 + ] + }, + { + "id": 289, + "type": "ImageFromBatch", + "pos": [ + -2360, + 3410 + ], + "size": [ + 270, + 140 + ], + "flags": {}, + "order": 22, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 460 + }, + { + "localized_name": "batch_index", + "name": "batch_index", + "type": "INT", + "widget": { + "name": "batch_index" + }, + "link": null + }, + { + "localized_name": "length", + "name": "length", + "type": "INT", + "widget": { + "name": "length" + }, + "link": 494 + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 463 + ] + } + ], + "properties": { + "Node name for S&R": "ImageFromBatch", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + 
"secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 0, + 81 + ] + }, + { + "id": 290, + "type": "PrimitiveInt", + "pos": [ + -2690, + 3540 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 23, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "INT", + "widget": { + "name": "value" + }, + "link": 481 + } + ], + "outputs": [ + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 461 + ] + } + ], + "title": "Int (Height)", + "properties": { + "Node name for S&R": "PrimitiveInt", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 720, + "fixed" + ] + }, + { + "id": 291, + "type": "ComfyMathExpression", + "pos": [ + -2650, + 3700 + ], + "size": [ + 230, + 80 + ], + "flags": { + "collapsed": true + }, + "order": 24, + "mode": 0, + "inputs": [ + { + "label": "a", + "localized_name": "values.a", + "name": "values.a", + "type": "FLOAT,INT,BOOLEAN", + "link": 461 + }, + { + "label": "b", + "localized_name": "values.b", + "name": "values.b", + "shape": 7, + "type": "FLOAT,INT,BOOLEAN", + "link": null + }, + { + "localized_name": "expression", + "name": "expression", + "type": "STRING", + "widget": { + "name": "expression" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "links": [] + }, + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 465 + ] + }, + { + "localized_name": "BOOL", + "name": "BOOL", + "type": "BOOLEAN", + "links": [] + } + ], + "properties": { + "Node name for S&R": "ComfyMathExpression", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + 
"secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "floor(a/16)*16" + ] + }, + { + "id": 292, + "type": "ComfyMathExpression", + "pos": [ + -2650, + 3500 + ], + "size": [ + 230, + 80 + ], + "flags": { + "collapsed": true + }, + "order": 25, + "mode": 0, + "inputs": [ + { + "label": "a", + "localized_name": "values.a", + "name": "values.a", + "type": "FLOAT,INT,BOOLEAN", + "link": 462 + }, + { + "label": "b", + "localized_name": "values.b", + "name": "values.b", + "shape": 7, + "type": "FLOAT,INT,BOOLEAN", + "link": null + }, + { + "localized_name": "expression", + "name": "expression", + "type": "STRING", + "widget": { + "name": "expression" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "links": [] + }, + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 464 + ] + }, + { + "localized_name": "BOOL", + "name": "BOOL", + "type": "BOOLEAN", + "links": [] + } + ], + "properties": { + "Node name for S&R": "ComfyMathExpression", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "floor(a/16)*16" + ] + }, + { + "id": 293, + "type": "ResizeImageMaskNode", + "pos": [ + -2360, + 3590 + ], + "size": [ + 280, + 160 + ], + "flags": {}, + "order": 26, + "mode": 0, + "inputs": [ + { + "localized_name": "input", + "name": "input", + "type": "IMAGE,MASK", + "link": 463 + }, + { + "localized_name": "resize_type", + "name": "resize_type", + "type": "COMFY_DYNAMICCOMBO_V3", + "widget": { + "name": "resize_type" + }, + "link": null + }, + { + "localized_name": "width", + "name": "resize_type.width", + "type": "INT", + "widget": { + "name": "resize_type.width" + }, + "link": 464 + }, + { + "localized_name": "height", + "name": "resize_type.height", + "type": "INT", + "widget": { + 
"name": "resize_type.height" + }, + "link": 465 + }, + { + "localized_name": "crop", + "name": "resize_type.crop", + "type": "COMBO", + "widget": { + "name": "resize_type.crop" + }, + "link": null + }, + { + "localized_name": "scale_method", + "name": "scale_method", + "type": "COMBO", + "widget": { + "name": "scale_method" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "resized", + "name": "resized", + "type": "*", + "links": [ + 426, + 447, + 469 + ] + } + ], + "properties": { + "Node name for S&R": "ResizeImageMaskNode", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "scale dimensions", + 512, + 512, + "center", + "area" + ] + }, + { + "id": 294, + "type": "ComfySwitchNode", + "pos": [ + -960, + 2150 + ], + "size": [ + 270, + 130 + ], + "flags": {}, + "order": 27, + "mode": 0, + "inputs": [ + { + "localized_name": "on_false", + "name": "on_false", + "type": "*", + "link": 466 + }, + { + "localized_name": "on_true", + "name": "on_true", + "type": "*", + "link": 467 + }, + { + "localized_name": "switch", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": 468 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "*", + "links": [ + 458 + ] + } + ], + "properties": { + "Node name for S&R": "ComfySwitchNode", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + false + ] + }, + { + "id": 295, + "type": "GetImageSize", + "pos": [ + -2010, + 2920 + ], + "size": [ + 230, + 120 + ], + "flags": {}, + "order": 28, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": 
"IMAGE", + "link": 469 + } + ], + "outputs": [ + { + "localized_name": "width", + "name": "width", + "type": "INT", + "links": [ + 442 + ] + }, + { + "localized_name": "height", + "name": "height", + "type": "INT", + "links": [ + 443 + ] + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "links": [ + 444 + ] + } + ], + "properties": { + "Node name for S&R": "GetImageSize", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 296, + "type": "VAEDecode", + "pos": [ + 520, + 2450 + ], + "size": [ + 320, + 100 + ], + "flags": { + "collapsed": false + }, + "order": 29, + "mode": 0, + "inputs": [ + { + "localized_name": "samples", + "name": "samples", + "type": "LATENT", + "link": 470 + }, + { + "localized_name": "vae", + "name": "vae", + "type": "VAE", + "link": 471 + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "slot_index": 0, + "links": [ + 423 + ] + } + ], + "properties": { + "Node name for S&R": "VAEDecode", + "cnr_id": "comfy-core", + "ver": "0.3.34", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "widget_ue_connectable": {} + } + }, + { + "id": 297, + "type": "PrimitiveInt", + "pos": [ + -2690, + 3350 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 30, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "INT", + "widget": { + "name": "value" + }, + "link": 480 + } + ], + "outputs": [ + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 462 + ] + } + ], + "title": "Int (Width)", + "properties": { + "Node name for S&R": "PrimitiveInt", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + 
"tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 720, + "fixed" + ] + }, + { + "id": 298, + "type": "GetVideoComponents", + "pos": [ + -2330, + 3210 + ], + "size": [ + 230, + 120 + ], + "flags": { + "collapsed": false + }, + "order": 31, + "mode": 0, + "inputs": [ + { + "localized_name": "video", + "name": "video", + "type": "VIDEO", + "link": 473 + } + ], + "outputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "links": [ + 460 + ] + }, + { + "localized_name": "audio", + "name": "audio", + "type": "AUDIO", + "links": [ + 424 + ] + }, + { + "localized_name": "fps", + "name": "fps", + "type": "FLOAT", + "links": [ + 425 + ] + } + ], + "properties": { + "Node name for S&R": "GetVideoComponents", + "cnr_id": "comfy-core", + "ver": "0.3.40", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 299, + "type": "UNETLoader", + "pos": [ + -2720, + 1980 + ], + "size": [ + 370, + 140 + ], + "flags": {}, + "order": 32, + "mode": 0, + "inputs": [ + { + "localized_name": "unet_name", + "name": "unet_name", + "type": "COMBO", + "widget": { + "name": "unet_name" + }, + "link": 485 + }, + { + "localized_name": "weight_dtype", + "name": "weight_dtype", + "type": "COMBO", + "widget": { + "name": "weight_dtype" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": "MODEL", + "slot_index": 0, + "links": [ + 428, + 429 + ] + } + ], + "properties": { + "Node name for S&R": "UNETLoader", + "cnr_id": "comfy-core", + "ver": "0.3.34", + "models": [ + { + "name": "wan2.1_vace_14B_fp16.safetensors", + "url": "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/diffusion_models/wan2.1_vace_14B_fp16.safetensors", + "directory": 
"diffusion_models" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "widget_ue_connectable": {} + }, + "widgets_values": [ + "wan2.1_vace_14B_fp16.safetensors", + "fp8_e4m3fn_fast" + ] + }, + { + "id": 300, + "type": "PrimitiveBoolean", + "pos": [ + -1390, + 2980 + ], + "size": [ + 270, + 100 + ], + "flags": {}, + "order": 33, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "BOOLEAN", + "widget": { + "name": "value" + }, + "link": 489 + } + ], + "outputs": [ + { + "localized_name": "BOOLEAN", + "name": "BOOLEAN", + "type": "BOOLEAN", + "links": [ + 431, + 434, + 468 + ] + } + ], + "title": "Boolean (Enable Lightning LoRA)", + "properties": { + "Node name for S&R": "PrimitiveBoolean", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + true + ] + }, + { + "id": 308, + "type": "ImageFromBatch", + "pos": [ + -2360, + 3410 + ], + "size": [ + 270, + 140 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": null + }, + { + "localized_name": "batch_index", + "name": "batch_index", + "type": "INT", + "widget": { + "name": "batch_index" + }, + "link": null + }, + { + "localized_name": "length", + "name": "length", + "type": "INT", + "widget": { + "name": "length" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": null + } + ], + "properties": { + "Node name for S&R": "ImageFromBatch", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + 
"secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 0, + 1 + ] + }, + { + "id": 310, + "type": "MaskPreview", + "pos": [ + -900, + 3230 + ], + "size": [ + 230, + 80 + ], + "flags": {}, + "order": 34, + "mode": 4, + "inputs": [ + { + "localized_name": "mask", + "name": "mask", + "type": "MASK", + "link": 498 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "MaskPreview", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 312, + "type": "PreviewImage", + "pos": [ + -520, + 3230 + ], + "size": [ + 230, + 80 + ], + "flags": {}, + "order": 35, + "mode": 4, + "inputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "link": 499 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + } + ], + "groups": [ + { + "id": 1, + "title": "Models", + "bounding": [ + -2750, + 1860, + 430, + 770 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 2, + "title": "Prompt", + "bounding": [ + -2290, + 1860, + 460, + 770 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 3, + "title": "Sampling", + "bounding": [ + -590, + 1860, + 700, + 620 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 20, + "title": "Create Video Mask", + "bounding": [ + -2030, + 3110, + 440, + 550 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 23, + "title": "Conditioning", + "bounding": [ + -1800, + 1860, + 370, + 450 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 26, + "title": "Apply Mask to Video", + "bounding": [ + -1560, + 3110, + 1320, + 550 + ], + "color": "#3f789e", + "flags": {} 
+ }, + { + "id": 29, + "title": "Swtich Logic", + "bounding": [ + -1400, + 1860, + 780, + 1060 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 27, + "title": "Lightning LoRA", + "bounding": [ + -1390, + 2290, + 370, + 620 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 28, + "title": "Original", + "bounding": [ + -1390, + 1900, + 370, + 370 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 31, + "title": "Video Size Preprocessing", + "bounding": [ + -2740, + 3110, + 680, + 770 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 30, + "title": "Size", + "bounding": [ + -2710, + 3270, + 330, + 470 + ], + "color": "#3f789e", + "flags": {} + } + ], + "links": [ + { + "id": 422, + "origin_id": 275, + "origin_slot": 0, + "target_id": 266, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 423, + "origin_id": 296, + "origin_slot": 0, + "target_id": 267, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 424, + "origin_id": 298, + "origin_slot": 1, + "target_id": 267, + "target_slot": 1, + "type": "AUDIO" + }, + { + "id": 425, + "origin_id": 298, + "origin_slot": 2, + "target_id": 267, + "target_slot": 2, + "type": "FLOAT" + }, + { + "id": 426, + "origin_id": 293, + "origin_slot": 0, + "target_id": 268, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 427, + "origin_id": 268, + "origin_slot": 0, + "target_id": 269, + "target_slot": 0, + "type": "MASK" + }, + { + "id": 428, + "origin_id": 299, + "origin_slot": 0, + "target_id": 272, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 429, + "origin_id": 299, + "origin_slot": 0, + "target_id": 275, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 430, + "origin_id": 272, + "origin_slot": 0, + "target_id": 275, + "target_slot": 1, + "type": "MODEL" + }, + { + "id": 431, + "origin_id": 300, + "origin_slot": 0, + "target_id": 275, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 432, + "origin_id": 271, + "origin_slot": 0, + "target_id": 276, + "target_slot": 0, + 
"type": "FLOAT" + }, + { + "id": 433, + "origin_id": 274, + "origin_slot": 0, + "target_id": 276, + "target_slot": 1, + "type": "FLOAT" + }, + { + "id": 434, + "origin_id": 300, + "origin_slot": 0, + "target_id": 276, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 435, + "origin_id": 277, + "origin_slot": 0, + "target_id": 279, + "target_slot": 0, + "type": "CLIP" + }, + { + "id": 436, + "origin_id": 277, + "origin_slot": 0, + "target_id": 280, + "target_slot": 0, + "type": "CLIP" + }, + { + "id": 437, + "origin_id": 280, + "origin_slot": 0, + "target_id": 281, + "target_slot": 0, + "type": "CONDITIONING" + }, + { + "id": 438, + "origin_id": 279, + "origin_slot": 0, + "target_id": 281, + "target_slot": 1, + "type": "CONDITIONING" + }, + { + "id": 439, + "origin_id": 278, + "origin_slot": 0, + "target_id": 281, + "target_slot": 2, + "type": "VAE" + }, + { + "id": 440, + "origin_id": 284, + "origin_slot": 0, + "target_id": 281, + "target_slot": 3, + "type": "IMAGE" + }, + { + "id": 441, + "origin_id": 269, + "origin_slot": 0, + "target_id": 281, + "target_slot": 4, + "type": "MASK" + }, + { + "id": 442, + "origin_id": 295, + "origin_slot": 0, + "target_id": 281, + "target_slot": 6, + "type": "INT" + }, + { + "id": 443, + "origin_id": 295, + "origin_slot": 1, + "target_id": 281, + "target_slot": 7, + "type": "INT" + }, + { + "id": 444, + "origin_id": 295, + "origin_slot": 2, + "target_id": 281, + "target_slot": 8, + "type": "INT" + }, + { + "id": 445, + "origin_id": 269, + "origin_slot": 0, + "target_id": 282, + "target_slot": 0, + "type": "MASK" + }, + { + "id": 446, + "origin_id": 282, + "origin_slot": 0, + "target_id": 283, + "target_slot": 0, + "type": "MASK" + }, + { + "id": 447, + "origin_id": 293, + "origin_slot": 0, + "target_id": 284, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 448, + "origin_id": 283, + "origin_slot": 0, + "target_id": 284, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 449, + "origin_id": 269, + "origin_slot": 
0, + "target_id": 284, + "target_slot": 2, + "type": "MASK" + }, + { + "id": 452, + "origin_id": 288, + "origin_slot": 0, + "target_id": 287, + "target_slot": 0, + "type": "LATENT" + }, + { + "id": 453, + "origin_id": 281, + "origin_slot": 3, + "target_id": 287, + "target_slot": 1, + "type": "INT" + }, + { + "id": 454, + "origin_id": 266, + "origin_slot": 0, + "target_id": 288, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 455, + "origin_id": 281, + "origin_slot": 0, + "target_id": 288, + "target_slot": 1, + "type": "CONDITIONING" + }, + { + "id": 456, + "origin_id": 281, + "origin_slot": 1, + "target_id": 288, + "target_slot": 2, + "type": "CONDITIONING" + }, + { + "id": 457, + "origin_id": 281, + "origin_slot": 2, + "target_id": 288, + "target_slot": 3, + "type": "LATENT" + }, + { + "id": 458, + "origin_id": 294, + "origin_slot": 0, + "target_id": 288, + "target_slot": 5, + "type": "INT" + }, + { + "id": 459, + "origin_id": 276, + "origin_slot": 0, + "target_id": 288, + "target_slot": 6, + "type": "FLOAT" + }, + { + "id": 460, + "origin_id": 298, + "origin_slot": 0, + "target_id": 289, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 461, + "origin_id": 290, + "origin_slot": 0, + "target_id": 291, + "target_slot": 0, + "type": "INT" + }, + { + "id": 462, + "origin_id": 297, + "origin_slot": 0, + "target_id": 292, + "target_slot": 0, + "type": "INT" + }, + { + "id": 463, + "origin_id": 289, + "origin_slot": 0, + "target_id": 293, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 464, + "origin_id": 292, + "origin_slot": 1, + "target_id": 293, + "target_slot": 2, + "type": "INT" + }, + { + "id": 465, + "origin_id": 291, + "origin_slot": 1, + "target_id": 293, + "target_slot": 3, + "type": "INT" + }, + { + "id": 466, + "origin_id": 270, + "origin_slot": 0, + "target_id": 294, + "target_slot": 0, + "type": "INT" + }, + { + "id": 467, + "origin_id": 273, + "origin_slot": 0, + "target_id": 294, + "target_slot": 1, + "type": "INT" + }, + { + "id": 
468, + "origin_id": 300, + "origin_slot": 0, + "target_id": 294, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 469, + "origin_id": 293, + "origin_slot": 0, + "target_id": 295, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 470, + "origin_id": 287, + "origin_slot": 0, + "target_id": 296, + "target_slot": 0, + "type": "LATENT" + }, + { + "id": 471, + "origin_id": 278, + "origin_slot": 0, + "target_id": 296, + "target_slot": 1, + "type": "VAE" + }, + { + "id": 473, + "origin_id": -10, + "origin_slot": 0, + "target_id": 298, + "target_slot": 0, + "type": "VIDEO" + }, + { + "id": 474, + "origin_id": 267, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "VIDEO" + }, + { + "id": 478, + "origin_id": -10, + "origin_slot": 1, + "target_id": 281, + "target_slot": 5, + "type": "IMAGE" + }, + { + "id": 479, + "origin_id": -10, + "origin_slot": 2, + "target_id": 280, + "target_slot": 1, + "type": "STRING" + }, + { + "id": 480, + "origin_id": -10, + "origin_slot": 3, + "target_id": 297, + "target_slot": 0, + "type": "INT" + }, + { + "id": 481, + "origin_id": -10, + "origin_slot": 4, + "target_id": 290, + "target_slot": 0, + "type": "INT" + }, + { + "id": 494, + "origin_id": -10, + "origin_slot": 5, + "target_id": 289, + "target_slot": 2, + "type": "INT" + }, + { + "id": 483, + "origin_id": -10, + "origin_slot": 6, + "target_id": 288, + "target_slot": 4, + "type": "INT" + }, + { + "id": 485, + "origin_id": -10, + "origin_slot": 7, + "target_id": 299, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 486, + "origin_id": -10, + "origin_slot": 8, + "target_id": 277, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 487, + "origin_id": -10, + "origin_slot": 9, + "target_id": 278, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 489, + "origin_id": -10, + "origin_slot": 10, + "target_id": 300, + "target_slot": 0, + "type": "BOOLEAN" + }, + { + "id": 490, + "origin_id": -10, + "origin_slot": 11, + "target_id": 272, + "target_slot": 
1, + "type": "COMBO" + }, + { + "id": 491, + "origin_id": -10, + "origin_slot": 12, + "target_id": 268, + "target_slot": 1, + "type": "STRING" + }, + { + "id": 496, + "origin_id": -10, + "origin_slot": 13, + "target_id": 269, + "target_slot": 1, + "type": "INT" + }, + { + "id": 497, + "origin_id": -10, + "origin_slot": 14, + "target_id": 268, + "target_slot": 8, + "type": "COMBO" + }, + { + "id": 498, + "origin_id": 269, + "origin_slot": 0, + "target_id": 310, + "target_slot": 0, + "type": "MASK" + }, + { + "id": 499, + "origin_id": 284, + "origin_slot": 0, + "target_id": 312, + "target_slot": 0, + "type": "IMAGE" + } + ], + "extra": {}, + "category": "Video generation and editing/Inpaint video", + "description": "Removes objects from video by inpainting masked regions using Wan 2.1 VACE, with SAM3 text-guided segmentation and optional Lightning LoRA turbo mode." + }, + { + "id": "17df2eeb-d89e-46ee-9480-a4ca2494b207", + "version": 1, + "state": { + "lastGroupId": 31, + "lastNodeId": 315, + "lastLinkId": 499, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Image Segmentation (SAM3)", + "description": "Segments images into masks using Meta SAM3 from text prompts, points, or boxes.", + "inputNode": { + "id": -10, + "bounding": [ + -2260, + -3450, + 136.369140625, + 220 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + -1130, + -3305, + 120, + 80 + ] + }, + "inputs": [ + { + "id": "a6e75fa2-162a-4af0-a2fd-1e9c899a5ab6", + "name": "image", + "type": "IMAGE", + "linkIds": [ + 264 + ], + "localized_name": "image", + "label": "image", + "pos": [ + -2143.630859375, + -3430 + ] + }, + { + "id": "3cefd304-7631-4ff6-a5a0-5a0ffb120745", + "name": "text", + "type": "STRING", + "linkIds": [ + 265 + ], + "label": "object", + "pos": [ + -2143.630859375, + -3410 + ] + }, + { + "id": "1aec91c5-d8d2-441c-928c-49c14e7e80ed", + "name": "bboxes", + "type": "BOUNDING_BOX", + "linkIds": [ + 266 + ], + "pos": [ + -2143.630859375, + -3390 + ] + }, + { + "id": 
"1ec7ce1a-8257-4719-8a81-60ebc8a98899", + "name": "positive_coords", + "type": "STRING", + "linkIds": [ + 267 + ], + "pos": [ + -2143.630859375, + -3370 + ] + }, + { + "id": "c65f8b87-9bd7-48be-9fc2-823431e95019", + "name": "negative_coords", + "type": "STRING", + "linkIds": [ + 268 + ], + "pos": [ + -2143.630859375, + -3350 + ] + }, + { + "id": "bb4ba35a-ccfe-4c37-98e5-d9b0d69585fb", + "name": "threshold", + "type": "FLOAT", + "linkIds": [ + 269 + ], + "pos": [ + -2143.630859375, + -3330 + ] + }, + { + "id": "b1439668-b050-490b-a5dc-fc4052c55666", + "name": "refine_iterations", + "type": "INT", + "linkIds": [ + 270 + ], + "pos": [ + -2143.630859375, + -3310 + ] + }, + { + "id": "86e239e5-c098-4302-b54d-d42a38bc0f89", + "name": "individual_masks", + "type": "BOOLEAN", + "linkIds": [ + 271 + ], + "pos": [ + -2143.630859375, + -3290 + ] + }, + { + "id": "f9e0b9d4-b2f1-4907-a4a5-305656576706", + "name": "ckpt_name", + "type": "COMBO", + "linkIds": [ + 272 + ], + "pos": [ + -2143.630859375, + -3270 + ] + } + ], + "outputs": [ + { + "id": "ff50da09-1e59-4a58-9b7f-be1a00aa5913", + "name": "masks", + "type": "MASK", + "linkIds": [ + 231 + ], + "localized_name": "masks", + "pos": [ + -1110, + -3285 + ] + }, + { + "id": "8f622e40-8528-4078-b7d3-147e9f872194", + "name": "bboxes", + "type": "BOUNDING_BOX", + "linkIds": [ + 232 + ], + "localized_name": "bboxes", + "pos": [ + -1110, + -3265 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 75, + "type": "SAM3_Detect", + "pos": [ + -1470, + -3460 + ], + "size": [ + 270, + 260 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "label": "model", + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 237 + }, + { + "label": "image", + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 264 + }, + { + "label": "conditioning", + "localized_name": "conditioning", + "name": "conditioning", + "shape": 7, + "type": "CONDITIONING", + "link": 200 + }, + { + "label": 
"bboxes", + "localized_name": "bboxes", + "name": "bboxes", + "shape": 7, + "type": "BOUNDING_BOX", + "link": 266 + }, + { + "label": "positive_coords", + "localized_name": "positive_coords", + "name": "positive_coords", + "shape": 7, + "type": "STRING", + "link": 267 + }, + { + "label": "negative_coords", + "localized_name": "negative_coords", + "name": "negative_coords", + "shape": 7, + "type": "STRING", + "link": 268 + }, + { + "localized_name": "threshold", + "name": "threshold", + "type": "FLOAT", + "widget": { + "name": "threshold" + }, + "link": 269 + }, + { + "localized_name": "refine_iterations", + "name": "refine_iterations", + "type": "INT", + "widget": { + "name": "refine_iterations" + }, + "link": 270 + }, + { + "localized_name": "individual_masks", + "name": "individual_masks", + "type": "BOOLEAN", + "widget": { + "name": "individual_masks" + }, + "link": 271 + } + ], + "outputs": [ + { + "localized_name": "masks", + "name": "masks", + "type": "MASK", + "links": [ + 231 + ] + }, + { + "localized_name": "bboxes", + "name": "bboxes", + "type": "BOUNDING_BOX", + "links": [ + 232 + ] + } + ], + "properties": { + "Node name for S&R": "SAM3_Detect", + "cnr_id": "comfy-core", + "ver": "0.19.3", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + 0.5, + 2, + false + ] + }, + { + "id": 236, + "type": "CheckpointLoaderSimple", + "pos": [ + -1970, + -3200 + ], + "size": [ + 330, + 140 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "ckpt_name", + "name": "ckpt_name", + "type": "COMBO", + "widget": { + "name": "ckpt_name" + }, + "link": 272 + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": "MODEL", + "links": [ + 237 + ] + }, + { + 
"localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "links": [ + 240 + ] + }, + { + "localized_name": "VAE", + "name": "VAE", + "type": "VAE", + "links": null + } + ], + "properties": { + "Node name for S&R": "CheckpointLoaderSimple", + "cnr_id": "comfy-core", + "ver": "0.19.3", + "models": [ + { + "name": "sam3.1_multiplex_fp16.safetensors", + "url": "https://huggingface.co/Comfy-Org/sam3.1/resolve/main/checkpoints/sam3.1_multiplex_fp16.safetensors", + "directory": "checkpoints" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + "sam3.1_multiplex_fp16.safetensors" + ] + }, + { + "id": 237, + "type": "CLIPTextEncode", + "pos": [ + -2000, + -3000 + ], + "size": [ + 400, + 200 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "clip", + "name": "clip", + "type": "CLIP", + "link": 240 + }, + { + "localized_name": "text", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + }, + "link": 265 + } + ], + "outputs": [ + { + "localized_name": "CONDITIONING", + "name": "CONDITIONING", + "type": "CONDITIONING", + "links": [ + 200 + ] + } + ], + "properties": { + "Node name for S&R": "CLIPTextEncode", + "cnr_id": "comfy-core", + "ver": "0.19.3", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + "" + ] + } + ], + "groups": [], + "links": [ + { + "id": 237, + "origin_id": 236, + "origin_slot": 0, + "target_id": 75, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 200, + "origin_id": 237, + "origin_slot": 
0, + "target_id": 75, + "target_slot": 2, + "type": "CONDITIONING" + }, + { + "id": 240, + "origin_id": 236, + "origin_slot": 1, + "target_id": 237, + "target_slot": 0, + "type": "CLIP" + }, + { + "id": 231, + "origin_id": 75, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "MASK" + }, + { + "id": 232, + "origin_id": 75, + "origin_slot": 1, + "target_id": -20, + "target_slot": 1, + "type": "BOUNDING_BOX" + }, + { + "id": 264, + "origin_id": -10, + "origin_slot": 0, + "target_id": 75, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 265, + "origin_id": -10, + "origin_slot": 1, + "target_id": 237, + "target_slot": 1, + "type": "STRING" + }, + { + "id": 266, + "origin_id": -10, + "origin_slot": 2, + "target_id": 75, + "target_slot": 3, + "type": "BOUNDING_BOX" + }, + { + "id": 267, + "origin_id": -10, + "origin_slot": 3, + "target_id": 75, + "target_slot": 4, + "type": "STRING" + }, + { + "id": 268, + "origin_id": -10, + "origin_slot": 4, + "target_id": 75, + "target_slot": 5, + "type": "STRING" + }, + { + "id": 269, + "origin_id": -10, + "origin_slot": 5, + "target_id": 75, + "target_slot": 6, + "type": "FLOAT" + }, + { + "id": 270, + "origin_id": -10, + "origin_slot": 6, + "target_id": 75, + "target_slot": 7, + "type": "INT" + }, + { + "id": 271, + "origin_id": -10, + "origin_slot": 7, + "target_id": 75, + "target_slot": 8, + "type": "BOOLEAN" + }, + { + "id": 272, + "origin_id": -10, + "origin_slot": 8, + "target_id": 236, + "target_slot": 0, + "type": "COMBO" + } + ], + "extra": { + "ue_links": [] + } + } + ] + }, + "extra": {} +} \ No newline at end of file diff --git a/blueprints/Video Segmentation (SAM3).json b/blueprints/Video Segmentation (SAM3).json index 4d9a13412..4c7253869 100644 --- a/blueprints/Video Segmentation (SAM3).json +++ b/blueprints/Video Segmentation (SAM3).json @@ -818,7 +818,7 @@ } ], "extra": {}, - "category": "Video Tools", + "category": "Conditioning & Preprocessors/Segmentation & Mask", "description": 
"Segments video into temporally consistent masks using Meta SAM3 from text or interactive prompts." } ] diff --git a/blueprints/Video Upscale(GAN x4).json b/blueprints/Video Upscale(GAN x4).json index 73476e36b..fc291ac41 100644 --- a/blueprints/Video Upscale(GAN x4).json +++ b/blueprints/Video Upscale(GAN x4).json @@ -412,7 +412,7 @@ "extra": { "workflowRendererVersion": "LG" }, - "category": "Video generation and editing/Enhance video", + "category": "Video generation and editing/Upscale", "description": "Upscales video to 4× resolution using a GAN-based upscaling model." } ] diff --git a/blueprints/Video to Pose Map (SDPose Multi-Person).json b/blueprints/Video to Pose Map (SDPose Multi-Person).json new file mode 100644 index 000000000..64ef6e524 --- /dev/null +++ b/blueprints/Video to Pose Map (SDPose Multi-Person).json @@ -0,0 +1,1323 @@ +{ + "revision": 0, + "last_node_id": 675, + "last_link_id": 0, + "nodes": [ + { + "id": 675, + "type": "01b6a731-fb78-4070-9a38-c87146da9604", + "pos": [ + -2480, + 3400 + ], + "size": [ + 370, + 638.625 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "label": "resize_target_longer_size", + "name": "resize_type.longer_size", + "type": "INT", + "widget": { + "name": "resize_type.longer_size" + }, + "link": null + }, + { + "name": "scale_method", + "type": "COMBO", + "widget": { + "name": "scale_method" + }, + "link": null + }, + { + "name": "draw_body", + "type": "BOOLEAN", + "widget": { + "name": "draw_body" + }, + "link": null + }, + { + "name": "draw_hands", + "type": "BOOLEAN", + "widget": { + "name": "draw_hands" + }, + "link": null + }, + { + "name": "draw_face", + "type": "BOOLEAN", + "widget": { + "name": "draw_face" + }, + "link": null + }, + { + "name": "draw_feet", + "type": "BOOLEAN", + "widget": { + "name": "draw_feet" + }, + "link": null + }, + { + "name": "stick_width", + "type": "INT", + "widget": { + "name": "stick_width" + }, + "link": null + }, + { + "name": "face_point_size", + "type": 
"INT", + "widget": { + "name": "face_point_size" + }, + "link": null + }, + { + "name": "score_threshold", + "type": "FLOAT", + "widget": { + "name": "score_threshold" + }, + "link": null + }, + { + "label": "detect_threshold", + "name": "threshold", + "type": "FLOAT", + "widget": { + "name": "threshold" + }, + "link": null + }, + { + "label": "detect_class", + "name": "class_name", + "type": "COMBO", + "widget": { + "name": "class_name" + }, + "link": null + }, + { + "name": "max_detections", + "type": "INT", + "widget": { + "name": "max_detections" + }, + "link": null + }, + { + "name": "ckpt_name", + "type": "COMBO", + "widget": { + "name": "ckpt_name" + }, + "link": null + }, + { + "name": "unet_name", + "type": "COMBO", + "widget": { + "name": "unet_name" + }, + "link": null + }, + { + "name": "video", + "type": "VIDEO", + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [] + }, + { + "name": "keypoints", + "type": "POSE_KEYPOINT", + "links": null + }, + { + "name": "bboxes", + "type": "BOUNDING_BOX", + "links": [] + }, + { + "name": "audio", + "type": "AUDIO", + "links": [] + }, + { + "name": "fps", + "type": "FLOAT", + "links": [] + } + ], + "properties": { + "proxyWidgets": [ + [ + "674", + "resize_type.longer_size" + ], + [ + "674", + "scale_method" + ], + [ + "672", + "draw_body" + ], + [ + "672", + "draw_hands" + ], + [ + "672", + "draw_face" + ], + [ + "672", + "draw_feet" + ], + [ + "672", + "stick_width" + ], + [ + "672", + "face_point_size" + ], + [ + "672", + "score_threshold" + ], + [ + "678", + "threshold" + ], + [ + "678", + "class_name" + ], + [ + "678", + "max_detections" + ], + [ + "673", + "ckpt_name" + ], + [ + "677", + "unet_name" + ] + ], + "cnr_id": "comfy-core", + "ver": "0.15.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": 
[], + "title": "Video to Pose Map (SDPose Multi-Person)" + } + ], + "links": [], + "version": 0.4, + "definitions": { + "subgraphs": [ + { + "id": "01b6a731-fb78-4070-9a38-c87146da9604", + "version": 1, + "state": { + "lastGroupId": 2, + "lastNodeId": 699, + "lastLinkId": 1754, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Video to Pose Map (SDPose Multi-Person)", + "inputNode": { + "id": -10, + "bounding": [ + -3570, + 3300, + 182.8984375, + 340 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + -1890, + 3730, + 120, + 140 + ] + }, + "inputs": [ + { + "id": "088eefc1-cd8a-4573-993f-9e4da008a12d", + "name": "resize_type.longer_size", + "type": "INT", + "linkIds": [ + 1704 + ], + "label": "resize_target_longer_size", + "pos": [ + -3407.1015625, + 3320 + ] + }, + { + "id": "b6449bd3-73d4-41c8-b81f-cf8d33f76a2e", + "name": "scale_method", + "type": "COMBO", + "linkIds": [ + 1705 + ], + "pos": [ + -3407.1015625, + 3340 + ] + }, + { + "id": "4cff52ad-ed07-4c97-8803-fcbd89554fd0", + "name": "draw_body", + "type": "BOOLEAN", + "linkIds": [ + 1706 + ], + "pos": [ + -3407.1015625, + 3360 + ] + }, + { + "id": "7af63dce-f7df-4d7e-8215-d7c7f60bf81c", + "name": "draw_hands", + "type": "BOOLEAN", + "linkIds": [ + 1707 + ], + "pos": [ + -3407.1015625, + 3380 + ] + }, + { + "id": "af3a9bce-61f9-4aca-b530-9f65e028b35e", + "name": "draw_face", + "type": "BOOLEAN", + "linkIds": [ + 1708 + ], + "pos": [ + -3407.1015625, + 3400 + ] + }, + { + "id": "4620f6a3-2c85-4b79-ad8f-35d0326b568f", + "name": "draw_feet", + "type": "BOOLEAN", + "linkIds": [ + 1709 + ], + "pos": [ + -3407.1015625, + 3420 + ] + }, + { + "id": "fee5d0c9-8d4b-4934-81d8-ba2206dc56cb", + "name": "stick_width", + "type": "INT", + "linkIds": [ + 1710 + ], + "pos": [ + -3407.1015625, + 3440 + ] + }, + { + "id": "aafdd060-ba81-4324-a9cc-b656e1ebc133", + "name": "face_point_size", + "type": "INT", + "linkIds": [ + 1711 + ], + "pos": [ + -3407.1015625, + 3460 + ] + }, + { + "id": 
"514c5503-f9e6-4d23-b1ae-1d3291acb2a3", + "name": "score_threshold", + "type": "FLOAT", + "linkIds": [ + 1712 + ], + "pos": [ + -3407.1015625, + 3480 + ] + }, + { + "id": "4eb3e4ea-7a36-4511-8483-0d12aadd32f7", + "name": "threshold", + "type": "FLOAT", + "linkIds": [ + 1718 + ], + "label": "detect_threshold", + "pos": [ + -3407.1015625, + 3500 + ] + }, + { + "id": "c76a7a05-81e6-4b17-a9e0-85f47a5844f2", + "name": "class_name", + "type": "COMBO", + "linkIds": [ + 1719 + ], + "label": "detect_class", + "pos": [ + -3407.1015625, + 3520 + ] + }, + { + "id": "4417e988-6e80-4236-be31-4c179037f5a2", + "name": "max_detections", + "type": "INT", + "linkIds": [ + 1720 + ], + "pos": [ + -3407.1015625, + 3540 + ] + }, + { + "id": "7d7c4a0b-0d1b-4c98-942b-f90548d2a492", + "name": "ckpt_name", + "type": "COMBO", + "linkIds": [ + 1721 + ], + "pos": [ + -3407.1015625, + 3560 + ] + }, + { + "id": "4d75122c-2c14-452a-98fe-d1545d3e012a", + "name": "unet_name", + "type": "COMBO", + "linkIds": [ + 1722 + ], + "pos": [ + -3407.1015625, + 3580 + ] + }, + { + "id": "6c46c988-4dd1-41a2-957e-03caf60d7657", + "name": "video", + "type": "VIDEO", + "linkIds": [ + 1741 + ], + "pos": [ + -3407.1015625, + 3600 + ] + } + ], + "outputs": [ + { + "id": "f05ed8cc-9403-4f14-8085-4364b06f8a48", + "name": "IMAGE", + "type": "IMAGE", + "linkIds": [ + 1701 + ], + "localized_name": "IMAGE", + "pos": [ + -1870, + 3750 + ] + }, + { + "id": "4b64118e-3cef-4eeb-9dad-4cd09cfd63a2", + "name": "keypoints", + "type": "POSE_KEYPOINT", + "linkIds": [ + 1725 + ], + "pos": [ + -1870, + 3770 + ] + }, + { + "id": "a27f7e34-dcbc-4fb0-a4e1-2c5fc423ca5f", + "name": "bboxes", + "type": "BOUNDING_BOX", + "linkIds": [ + 1726 + ], + "pos": [ + -1870, + 3790 + ] + }, + { + "id": "b7fe351d-2b38-41ea-9f4d-3be1a0aad275", + "name": "audio", + "type": "AUDIO", + "linkIds": [ + 1743 + ], + "pos": [ + -1870, + 3810 + ] + }, + { + "id": "ae187b6f-c9ca-4487-b5c1-3ad775fe945e", + "name": "fps", + "type": "FLOAT", + "linkIds": [ + 1744 + 
], + "pos": [ + -1870, + 3830 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 671, + "type": "SDPoseKeypointExtractor", + "pos": [ + -2550, + 3080 + ], + "size": [ + 270, + 180 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 1696 + }, + { + "localized_name": "vae", + "name": "vae", + "type": "VAE", + "link": 1697 + }, + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 1698 + }, + { + "localized_name": "bboxes", + "name": "bboxes", + "shape": 7, + "type": "BOUNDING_BOX", + "link": 1717 + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "widget": { + "name": "batch_size" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "keypoints", + "name": "keypoints", + "type": "POSE_KEYPOINT", + "links": [ + 1699, + 1725 + ] + } + ], + "properties": { + "Node name for S&R": "SDPoseKeypointExtractor", + "cnr_id": "comfy-core", + "ver": "0.15.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 16 + ] + }, + { + "id": 674, + "type": "ResizeImageMaskNode", + "pos": [ + -3010, + 3880 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "input", + "name": "input", + "type": "IMAGE,MASK", + "link": 1742 + }, + { + "localized_name": "resize_type", + "name": "resize_type", + "type": "COMFY_DYNAMICCOMBO_V3", + "widget": { + "name": "resize_type" + }, + "link": null + }, + { + "localized_name": "resize_type.longer_size", + "name": "resize_type.longer_size", + "type": "INT", + "widget": { + "name": "resize_type.longer_size" + }, + "link": 1704 + }, + { + "localized_name": "scale_method", + "name": "scale_method", + "type": "COMBO", + "widget": { + "name": "scale_method" + }, + "link": 1705 + } + ], + 
"outputs": [ + { + "localized_name": "resized", + "name": "resized", + "type": "*", + "links": [ + 1698, + 1716 + ] + } + ], + "properties": { + "Node name for S&R": "ResizeImageMaskNode", + "cnr_id": "comfy-core", + "ver": "0.15.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "scale longer dimension", + 1024, + "lanczos" + ] + }, + { + "id": 672, + "type": "SDPoseDrawKeypoints", + "pos": [ + -2540, + 3590 + ], + "size": [ + 270, + 280 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "keypoints", + "name": "keypoints", + "type": "POSE_KEYPOINT", + "link": 1699 + }, + { + "localized_name": "draw_body", + "name": "draw_body", + "type": "BOOLEAN", + "widget": { + "name": "draw_body" + }, + "link": 1706 + }, + { + "localized_name": "draw_hands", + "name": "draw_hands", + "type": "BOOLEAN", + "widget": { + "name": "draw_hands" + }, + "link": 1707 + }, + { + "localized_name": "draw_face", + "name": "draw_face", + "type": "BOOLEAN", + "widget": { + "name": "draw_face" + }, + "link": 1708 + }, + { + "localized_name": "draw_feet", + "name": "draw_feet", + "type": "BOOLEAN", + "widget": { + "name": "draw_feet" + }, + "link": 1709 + }, + { + "localized_name": "stick_width", + "name": "stick_width", + "type": "INT", + "widget": { + "name": "stick_width" + }, + "link": 1710 + }, + { + "localized_name": "face_point_size", + "name": "face_point_size", + "type": "INT", + "widget": { + "name": "face_point_size" + }, + "link": 1711 + }, + { + "localized_name": "score_threshold", + "name": "score_threshold", + "type": "FLOAT", + "widget": { + "name": "score_threshold" + }, + "link": 1712 + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 1701 + ] + } + ], + "properties": { + "Node name for S&R": "SDPoseDrawKeypoints", + "cnr_id": 
"comfy-core", + "ver": "0.15.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + true, + true, + true, + true, + 4, + 2, + 0.5 + ] + }, + { + "id": 673, + "type": "CheckpointLoaderSimple", + "pos": [ + -3040, + 3080 + ], + "size": [ + 390, + 160 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "ckpt_name", + "name": "ckpt_name", + "type": "COMBO", + "widget": { + "name": "ckpt_name" + }, + "link": 1721 + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": "MODEL", + "links": [ + 1696 + ] + }, + { + "localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "links": [] + }, + { + "localized_name": "VAE", + "name": "VAE", + "type": "VAE", + "links": [ + 1697 + ] + } + ], + "properties": { + "Node name for S&R": "CheckpointLoaderSimple", + "cnr_id": "comfy-core", + "ver": "0.15.0", + "models": [ + { + "name": "sdpose_wholebody_fp16.safetensors", + "url": "https://huggingface.co/Comfy-Org/SDPose/resolve/main/checkpoints/sdpose_wholebody_fp16.safetensors", + "directory": "checkpoints" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "sdpose_wholebody_fp16.safetensors" + ] + }, + { + "id": 677, + "type": "UNETLoader", + "pos": [ + -3030, + 3300 + ], + "size": [ + 370, + 110 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "localized_name": "unet_name", + "name": "unet_name", + "type": "COMBO", + "widget": { + "name": "unet_name" + }, + "link": 1722 + }, + { + "localized_name": "weight_dtype", + "name": "weight_dtype", + "type": "COMBO", + "widget": { + "name": "weight_dtype" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": 
"MODEL", + "links": [ + 1715 + ] + } + ], + "properties": { + "Node name for S&R": "UNETLoader", + "cnr_id": "comfy-core", + "ver": "0.14.1", + "models": [ + { + "name": "rt_detr_v4-x-hgnet_fp16.safetensors", + "url": "https://huggingface.co/Comfy-Org/SDPose/resolve/main/diffusion_models/rt_detr_v4-x-hgnet_fp16.safetensors", + "directory": "diffusion_models" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "rt_detr_v4-x-hgnet_fp16.safetensors", + "default" + ] + }, + { + "id": 678, + "type": "RTDETR_detect", + "pos": [ + -2540, + 3320 + ], + "size": [ + 270, + 200 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "label": "model", + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 1715 + }, + { + "label": "image", + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 1716 + }, + { + "localized_name": "threshold", + "name": "threshold", + "type": "FLOAT", + "widget": { + "name": "threshold" + }, + "link": 1718 + }, + { + "localized_name": "class_name", + "name": "class_name", + "type": "COMBO", + "widget": { + "name": "class_name" + }, + "link": 1719 + }, + { + "localized_name": "max_detections", + "name": "max_detections", + "type": "INT", + "widget": { + "name": "max_detections" + }, + "link": 1720 + } + ], + "outputs": [ + { + "localized_name": "bboxes", + "name": "bboxes", + "type": "BOUNDING_BOX", + "links": [ + 1717, + 1726 + ] + } + ], + "properties": { + "Node name for S&R": "RTDETR_detect", + "cnr_id": "comfy-core", + "ver": "0.15.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 0.5, + "person", + 2 + ] + }, + { + "id": 692, + "type": "GetVideoComponents", + "pos": [ + -3010, + 4100 + ], + 
"size": [ + 230, + 120 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "localized_name": "video", + "name": "video", + "type": "VIDEO", + "link": 1741 + } + ], + "outputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "links": [ + 1742 + ] + }, + { + "localized_name": "audio", + "name": "audio", + "type": "AUDIO", + "links": [ + 1743 + ] + }, + { + "localized_name": "fps", + "name": "fps", + "type": "FLOAT", + "links": [ + 1744 + ] + } + ], + "properties": { + "Node name for S&R": "GetVideoComponents", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + } + ], + "groups": [], + "links": [ + { + "id": 1696, + "origin_id": 673, + "origin_slot": 0, + "target_id": 671, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 1697, + "origin_id": 673, + "origin_slot": 2, + "target_id": 671, + "target_slot": 1, + "type": "VAE" + }, + { + "id": 1698, + "origin_id": 674, + "origin_slot": 0, + "target_id": 671, + "target_slot": 2, + "type": "IMAGE" + }, + { + "id": 1699, + "origin_id": 671, + "origin_slot": 0, + "target_id": 672, + "target_slot": 0, + "type": "POSE_KEYPOINT" + }, + { + "id": 1701, + "origin_id": 672, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 1704, + "origin_id": -10, + "origin_slot": 0, + "target_id": 674, + "target_slot": 2, + "type": "INT" + }, + { + "id": 1705, + "origin_id": -10, + "origin_slot": 1, + "target_id": 674, + "target_slot": 3, + "type": "COMBO" + }, + { + "id": 1706, + "origin_id": -10, + "origin_slot": 2, + "target_id": 672, + "target_slot": 1, + "type": "BOOLEAN" + }, + { + "id": 1707, + "origin_id": -10, + "origin_slot": 3, + "target_id": 672, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 1708, + "origin_id": -10, + "origin_slot": 4, + "target_id": 672, + 
"target_slot": 3, + "type": "BOOLEAN" + }, + { + "id": 1709, + "origin_id": -10, + "origin_slot": 5, + "target_id": 672, + "target_slot": 4, + "type": "BOOLEAN" + }, + { + "id": 1710, + "origin_id": -10, + "origin_slot": 6, + "target_id": 672, + "target_slot": 5, + "type": "INT" + }, + { + "id": 1711, + "origin_id": -10, + "origin_slot": 7, + "target_id": 672, + "target_slot": 6, + "type": "INT" + }, + { + "id": 1712, + "origin_id": -10, + "origin_slot": 8, + "target_id": 672, + "target_slot": 7, + "type": "FLOAT" + }, + { + "id": 1715, + "origin_id": 677, + "origin_slot": 0, + "target_id": 678, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 1716, + "origin_id": 674, + "origin_slot": 0, + "target_id": 678, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 1717, + "origin_id": 678, + "origin_slot": 0, + "target_id": 671, + "target_slot": 3, + "type": "BOUNDING_BOX" + }, + { + "id": 1718, + "origin_id": -10, + "origin_slot": 9, + "target_id": 678, + "target_slot": 2, + "type": "FLOAT" + }, + { + "id": 1719, + "origin_id": -10, + "origin_slot": 10, + "target_id": 678, + "target_slot": 3, + "type": "COMBO" + }, + { + "id": 1720, + "origin_id": -10, + "origin_slot": 11, + "target_id": 678, + "target_slot": 4, + "type": "INT" + }, + { + "id": 1721, + "origin_id": -10, + "origin_slot": 12, + "target_id": 673, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 1722, + "origin_id": -10, + "origin_slot": 13, + "target_id": 677, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 1725, + "origin_id": 671, + "origin_slot": 0, + "target_id": -20, + "target_slot": 1, + "type": "POSE_KEYPOINT" + }, + { + "id": 1726, + "origin_id": 678, + "origin_slot": 0, + "target_id": -20, + "target_slot": 2, + "type": "BOUNDING_BOX" + }, + { + "id": 1741, + "origin_id": -10, + "origin_slot": 14, + "target_id": 692, + "target_slot": 0, + "type": "VIDEO" + }, + { + "id": 1742, + "origin_id": 692, + "origin_slot": 0, + "target_id": 674, + "target_slot": 0, + "type": "IMAGE" + 
}, + { + "id": 1743, + "origin_id": 692, + "origin_slot": 1, + "target_id": -20, + "target_slot": 3, + "type": "AUDIO" + }, + { + "id": 1744, + "origin_id": 692, + "origin_slot": 2, + "target_id": -20, + "target_slot": 4, + "type": "FLOAT" + } + ], + "extra": { + "workflowRendererVersion": "LG" + }, + "category": "Conditioning & Preprocessors/Pose", + "description": "Extracts multi-person pose keypoints and skeleton frame sequences from video using SDPose with built-in person detection." + } + ] + }, + "extra": {} +} \ No newline at end of file diff --git a/comfy/cli_args.py b/comfy/cli_args.py index 76faed3ad..9bda414d1 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -49,7 +49,7 @@ parser.add_argument("--temp-directory", type=str, default=None, help="Set the Co parser.add_argument("--input-directory", type=str, default=None, help="Set the ComfyUI input directory. Overrides --base-directory.") parser.add_argument("--auto-launch", action="store_true", help="Automatically launch ComfyUI in the default browser.") parser.add_argument("--disable-auto-launch", action="store_true", help="Disable auto launching the browser.") -parser.add_argument("--cuda-device", type=int, default=None, metavar="DEVICE_ID", help="Set the id of the cuda device this instance will use. All other devices will not be visible.") +parser.add_argument("--cuda-device", type=str, default=None, metavar="DEVICE_ID", help="Set the ids of cuda devices this instance will use, as a comma-separated list (e.g. '0' or '0,1'). 
All other devices will not be visible.") parser.add_argument("--default-device", type=int, default=None, metavar="DEFAULT_DEVICE_ID", help="Set the id of the default device, all other devices will stay visible.") cm_group = parser.add_mutually_exclusive_group() cm_group.add_argument("--cuda-malloc", action="store_true", help="Enable cudaMallocAsync (enabled by default for torch 2.0 and up).") @@ -110,13 +110,11 @@ parser.add_argument("--preview-method", type=LatentPreviewMethod, default=Latent parser.add_argument("--preview-size", type=int, default=512, help="Sets the maximum preview size for sampler nodes.") -CACHE_RAM_AUTO_GB = -1.0 - cache_group = parser.add_mutually_exclusive_group() +cache_group.add_argument("--cache-ram", nargs='*', type=float, default=[], metavar="GB", help="Use RAM pressure caching with the specified headroom thresholds. This is the default caching mode. The first value sets the active-cache threshold; the optional second value sets the inactive-cache/pin threshold. Defaults when no values are provided: active 10%% of system RAM (min 2GB, max 10GB), inactive 100%% of system RAM (max 96GB).") cache_group.add_argument("--cache-classic", action="store_true", help="Use the old style (aggressive) caching.") cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.") cache_group.add_argument("--cache-none", action="store_true", help="Reduced RAM/VRAM usage at the expense of executing every node for each run.") -cache_group.add_argument("--cache-ram", nargs='?', const=CACHE_RAM_AUTO_GB, type=float, default=0, help="Use RAM pressure caching with the specified headroom threshold. If available RAM drops below the threshold the cache removes large items to free RAM. 
Default (when no value is provided): 25%% of system RAM (min 4GB, max 32GB).") attn_group = parser.add_mutually_exclusive_group() attn_group.add_argument("--use-split-cross-attention", action="store_true", help="Use the split cross attention optimization. Ignored when xformers is used.") @@ -245,6 +243,9 @@ if comfy.options.args_parsing: else: args = parser.parse_args([]) +if args.cache_ram is not None and len(args.cache_ram) > 2: + parser.error("--cache-ram accepts at most two values: active GB and inactive GB") + if args.windows_standalone_build: args.auto_launch = True diff --git a/comfy/comfy_types/node_typing.py b/comfy/comfy_types/node_typing.py index 57126fa4a..bb21eb1d1 100644 --- a/comfy/comfy_types/node_typing.py +++ b/comfy/comfy_types/node_typing.py @@ -1,6 +1,5 @@ """Comfy-specific type hinting""" -from __future__ import annotations from typing import Literal, TypedDict, Optional from typing_extensions import NotRequired from abc import ABC, abstractmethod diff --git a/comfy/controlnet.py b/comfy/controlnet.py index ba670b16d..6dbbaa959 100644 --- a/comfy/controlnet.py +++ b/comfy/controlnet.py @@ -15,13 +15,14 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . 
""" - +from __future__ import annotations import torch from enum import Enum import math import os import logging +import copy import comfy.utils import comfy.model_management import comfy.model_detection @@ -38,7 +39,7 @@ import comfy.ldm.hydit.controlnet import comfy.ldm.flux.controlnet import comfy.ldm.qwen_image.controlnet import comfy.cldm.dit_embedder -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Union if TYPE_CHECKING: from comfy.hooks import HookGroup @@ -64,6 +65,18 @@ class StrengthType(Enum): CONSTANT = 1 LINEAR_UP = 2 +class ControlIsolation: + '''Temporarily set a ControlBase object's previous_controlnet to None to prevent cascading calls.''' + def __init__(self, control: ControlBase): + self.control = control + self.orig_previous_controlnet = control.previous_controlnet + + def __enter__(self): + self.control.previous_controlnet = None + + def __exit__(self, *args): + self.control.previous_controlnet = self.orig_previous_controlnet + class ControlBase: def __init__(self): self.cond_hint_original = None @@ -77,7 +90,7 @@ class ControlBase: self.compression_ratio = 8 self.upscale_algorithm = 'nearest-exact' self.extra_args = {} - self.previous_controlnet = None + self.previous_controlnet: Union[ControlBase, None] = None self.extra_conds = [] self.strength_type = StrengthType.CONSTANT self.concat_mask = False @@ -85,6 +98,7 @@ class ControlBase: self.extra_concat = None self.extra_hooks: HookGroup = None self.preprocess_image = lambda a: a + self.multigpu_clones: dict[torch.device, ControlBase] = {} def set_cond_hint(self, cond_hint, strength=1.0, timestep_percent_range=(0.0, 1.0), vae=None, extra_concat=[]): self.cond_hint_original = cond_hint @@ -111,17 +125,38 @@ class ControlBase: def cleanup(self): if self.previous_controlnet is not None: self.previous_controlnet.cleanup() - + for device_cnet in self.multigpu_clones.values(): + with ControlIsolation(device_cnet): + device_cnet.cleanup() self.cond_hint = None self.extra_concat 
= None self.timestep_range = None def get_models(self): out = [] + for device_cnet in self.multigpu_clones.values(): + out += device_cnet.get_models_only_self() if self.previous_controlnet is not None: out += self.previous_controlnet.get_models() return out + def get_models_only_self(self): + 'Calls get_models, but temporarily sets previous_controlnet to None.' + with ControlIsolation(self): + return self.get_models() + + def get_instance_for_device(self, device): + 'Returns instance of this Control object intended for selected device.' + return self.multigpu_clones.get(device, self) + + def deepclone_multigpu(self, load_device, autoregister=False): + ''' + Create deep clone of Control object where model(s) is set to other devices. + + When autoregister is set to True, the deep clone is also added to multigpu_clones dict. + ''' + raise NotImplementedError("Classes inheriting from ControlBase should define their own deepclone_multigpu funtion.") + def get_extra_hooks(self): out = [] if self.extra_hooks is not None: @@ -130,7 +165,7 @@ class ControlBase: out += self.previous_controlnet.get_extra_hooks() return out - def copy_to(self, c): + def copy_to(self, c: ControlBase): c.cond_hint_original = self.cond_hint_original c.strength = self.strength c.timestep_percent_range = self.timestep_percent_range @@ -284,6 +319,14 @@ class ControlNet(ControlBase): self.copy_to(c) return c + def deepclone_multigpu(self, load_device, autoregister=False): + c = self.copy() + c.control_model = copy.deepcopy(c.control_model) + c.control_model_wrapped = comfy.model_patcher.ModelPatcher(c.control_model, load_device=load_device, offload_device=comfy.model_management.unet_offload_device()) + if autoregister: + self.multigpu_clones[load_device] = c + return c + def get_models(self): out = super().get_models() out.append(self.control_model_wrapped) @@ -314,6 +357,10 @@ class QwenFunControlNet(ControlNet): super().pre_run(model, percent_to_timestep_function) self.set_extra_arg("base_model", 
model.diffusion_model) + def cleanup(self): + self.extra_args.pop("base_model", None) + super().cleanup() + def copy(self): c = QwenFunControlNet(None, global_average_pooling=self.global_average_pooling, load_device=self.load_device, manual_cast_dtype=self.manual_cast_dtype) c.control_model = self.control_model @@ -906,6 +953,14 @@ class T2IAdapter(ControlBase): self.copy_to(c) return c + def deepclone_multigpu(self, load_device, autoregister=False): + c = self.copy() + c.t2i_model = copy.deepcopy(c.t2i_model) + c.device = load_device + if autoregister: + self.multigpu_clones[load_device] = c + return c + def load_t2i_adapter(t2i_data, model_options={}): #TODO: model_options compression_ratio = 8 upscale_algorithm = 'nearest-exact' diff --git a/comfy/float.py b/comfy/float.py index 184b3d6d0..3c82d6359 100644 --- a/comfy/float.py +++ b/comfy/float.py @@ -1,5 +1,20 @@ +import logging + import torch +_CK_STOCHASTIC_ROUNDING_AVAILABLE = False +try: + import comfy_kitchen as ck + _ck_stochastic_rounding_fp8 = ck.stochastic_rounding_fp8 + _CK_STOCHASTIC_ROUNDING_AVAILABLE = True +except (AttributeError, ImportError): + logging.warning("comfy_kitchen does not support stochastic FP8 rounding, please update comfy_kitchen.") + +if not _CK_STOCHASTIC_ROUNDING_AVAILABLE: + def _ck_stochastic_rounding_fp8(value, rng, dtype): + raise NotImplementedError("comfy_kitchen does not support stochastic FP8 rounding") + + def calc_mantissa(abs_x, exponent, normal_mask, MANTISSA_BITS, EXPONENT_BIAS, generator=None): mantissa_scaled = torch.where( normal_mask, @@ -57,6 +72,10 @@ def stochastic_rounding(value, dtype, seed=0): if dtype == torch.float8_e4m3fn or dtype == torch.float8_e5m2: generator = torch.Generator(device=value.device) generator.manual_seed(seed) + if _CK_STOCHASTIC_ROUNDING_AVAILABLE: + rng = torch.randint(0, 256, value.size(), dtype=torch.uint8, layout=value.layout, device=value.device, generator=generator) + return _ck_stochastic_rounding_fp8(value, rng, dtype) + 
output = torch.empty_like(value, dtype=dtype) num_slices = max(1, (value.numel() / (4096 * 4096))) slice_size = max(1, round(value.shape[0] / num_slices)) diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py index 6e37080bb..12a934d71 100644 --- a/comfy/latent_formats.py +++ b/comfy/latent_formats.py @@ -152,6 +152,11 @@ class StableAudio1(LatentFormat): latent_dimensions = 1 temporal_downscale_ratio = 2048 +class StableAudio3(LatentFormat): + latent_channels = 256 + latent_dimensions = 1 + temporal_downscale_ratio = 4096 + class Flux(SD3): latent_channels = 16 def __init__(self): @@ -794,13 +799,15 @@ class ZImagePixelSpace(ChromaRadiance): """ pass - class HiDreamO1Pixel(ChromaRadiance): """Pixel-space latent format for HiDream-O1. No VAE — model patches/unpatches raw RGB internally with patch_size=32. """ pass +class PixelDiTPixel(ChromaRadiance): + pass + class CogVideoX(LatentFormat): """Latent format for CogVideoX-2b (THUDM/CogVideoX-2b). diff --git a/comfy/ldm/audio/dit.py b/comfy/ldm/audio/dit.py index ca865189e..c28be5b49 100644 --- a/comfy/ldm/audio/dit.py +++ b/comfy/ldm/audio/dit.py @@ -10,6 +10,17 @@ from torch import nn from torch.nn import functional as F import math import comfy.ops +from .embedders import ExpoFourierFeatures + + +def _left_pad_to_match(emb, target_len): + emb_len = emb.shape[-2] + if emb_len < target_len: + return F.pad(emb, (0, 0, target_len - emb_len, 0), value=0.) 
+ elif emb_len > target_len: + return emb[:, -target_len:, :] + return emb + class FourierFeatures(nn.Module): def __init__(self, in_features, out_features, std=1., dtype=None, device=None): @@ -22,6 +33,7 @@ class FourierFeatures(nn.Module): f = 2 * math.pi * input @ comfy.ops.cast_to_input(self.weight.T, input) return torch.cat([f.cos(), f.sin()], dim=-1) + # norms class LayerNorm(nn.Module): def __init__(self, dim, bias=False, fix_scale=False, dtype=None, device=None): @@ -43,6 +55,16 @@ class LayerNorm(nn.Module): beta = comfy.ops.cast_to_input(beta, x) return F.layer_norm(x, x.shape[-1:], weight=comfy.ops.cast_to_input(self.gamma, x), bias=beta) + +class RMSNorm(nn.Module): + def __init__(self, dim, dtype=None, device=None): + super().__init__() + self.gamma = nn.Parameter(torch.empty(dim, dtype=dtype, device=device)) + + def forward(self, x): + return F.rms_norm(x, x.shape[-1:], weight=comfy.ops.cast_to_input(self.gamma, x)) + + class GLU(nn.Module): def __init__( self, @@ -236,13 +258,6 @@ class FeedForward(nn.Module): linear_out = operations.Linear(inner_dim, dim_out, bias = not no_bias, dtype=dtype, device=device) if not use_conv else operations.Conv1d(inner_dim, dim_out, conv_kernel_size, padding = (conv_kernel_size // 2), bias = not no_bias, dtype=dtype, device=device) - # # init last linear layer to 0 - # if zero_init_output: - # nn.init.zeros_(linear_out.weight) - # if not no_bias: - # nn.init.zeros_(linear_out.bias) - - self.ff = nn.Sequential( linear_in, rearrange('b d n -> b n d') if use_conv else nn.Identity(), @@ -261,8 +276,10 @@ class Attention(nn.Module): dim_context = None, causal = False, zero_init_output=True, - qk_norm = False, + qk_norm = "none", + differential = False, natten_kernel_size = None, + feat_scale = False, dtype=None, device=None, operations=None, @@ -271,6 +288,7 @@ class Attention(nn.Module): self.dim = dim self.dim_heads = dim_heads self.causal = causal + self.differential = differential dim_kv = dim_context if dim_context 
is not None else dim @@ -278,18 +296,37 @@ class Attention(nn.Module): self.kv_heads = dim_kv // dim_heads if dim_context is not None: - self.to_q = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device) - self.to_kv = operations.Linear(dim_kv, dim_kv * 2, bias=False, dtype=dtype, device=device) + if differential: + self.to_q = operations.Linear(dim, dim * 2, bias=False, dtype=dtype, device=device) + self.to_kv = operations.Linear(dim_kv, dim_kv * 3, bias=False, dtype=dtype, device=device) + else: + self.to_q = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device) + self.to_kv = operations.Linear(dim_kv, dim_kv * 2, bias=False, dtype=dtype, device=device) else: - self.to_qkv = operations.Linear(dim, dim * 3, bias=False, dtype=dtype, device=device) + if differential: + self.to_qkv = operations.Linear(dim, dim * 5, bias=False, dtype=dtype, device=device) + else: + self.to_qkv = operations.Linear(dim, dim * 3, bias=False, dtype=dtype, device=device) self.to_out = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device) - # if zero_init_output: - # nn.init.zeros_(self.to_out.weight) - + # Accept bool for backward compat + if isinstance(qk_norm, bool): + qk_norm = "l2" if qk_norm else "none" self.qk_norm = qk_norm + if self.qk_norm == "ln": + self.q_norm = operations.LayerNorm(dim_heads, elementwise_affine=True, eps=1.0e-6, dtype=dtype, device=device) + self.k_norm = operations.LayerNorm(dim_heads, elementwise_affine=True, eps=1.0e-6, dtype=dtype, device=device) + elif self.qk_norm == "rms": + self.q_norm = RMSNorm(dim_heads, dtype=dtype, device=device) + self.k_norm = RMSNorm(dim_heads, dtype=dtype, device=device) + + self.feat_scale = feat_scale + + if self.feat_scale: + self.lambda_dc = nn.Parameter(torch.empty(dim, dtype=dtype, device=device)) + self.lambda_hf = nn.Parameter(torch.empty(dim, dtype=dtype, device=device)) def forward( self, @@ -306,22 +343,51 @@ class Attention(nn.Module): kv_input = context if has_context else x 
if hasattr(self, 'to_q'): - # Use separate linear projections for q and k/v - q = self.to_q(x) - q = rearrange(q, 'b n (h d) -> b h n d', h = h) + if self.differential: + # cross-attention differential: to_q → (q, q_diff), to_kv → (k, k_diff, v) + q, q_diff = self.to_q(x).chunk(2, dim=-1) + q = rearrange(q, 'b n (h d) -> b h n d', h=h) + q_diff = rearrange(q_diff, 'b n (h d) -> b h n d', h=h) + q = torch.stack([q, q_diff], dim=1) # (B, 2, H, N, D) + k, k_diff, v = self.to_kv(kv_input).chunk(3, dim=-1) + k = rearrange(k, 'b n (h d) -> b h n d', h=kv_h) + k_diff = rearrange(k_diff, 'b n (h d) -> b h n d', h=kv_h) + v = rearrange(v, 'b n (h d) -> b h n d', h=kv_h) + k = torch.stack([k, k_diff], dim=1) # (B, 2, H, M, D) + else: + # Use separate linear projections for q and k/v + q = self.to_q(x) + q = rearrange(q, 'b n (h d) -> b h n d', h = h) - k, v = self.to_kv(kv_input).chunk(2, dim=-1) + k, v = self.to_kv(kv_input).chunk(2, dim=-1) - k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = kv_h), (k, v)) + k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = kv_h), (k, v)) else: - # Use fused linear projection - q, k, v = self.to_qkv(x).chunk(3, dim=-1) - q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v)) + if self.differential: + # self-attention differential: to_qkv → (q, k, v, q_diff, k_diff) + q, k, v, q_diff, k_diff = self.to_qkv(x).chunk(5, dim=-1) + q, k, v, q_diff, k_diff = map( + lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h), + (q, k, v, q_diff, k_diff) + ) + q = torch.stack([q, q_diff], dim=1) # (B, 2, H, N, D) + k = torch.stack([k, k_diff], dim=1) + else: + # Use fused linear projection + q, k, v = self.to_qkv(x).chunk(3, dim=-1) + q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v)) # Normalize q and k for cosine sim attention - if self.qk_norm: + if self.qk_norm == "l2": q = F.normalize(q, dim=-1) k = F.normalize(k, dim=-1) + elif self.qk_norm == "rms": + q_type, k_type = 
q.dtype, k.dtype + q = self.q_norm(q).to(q_type) + k = self.k_norm(k).to(k_type) + elif self.qk_norm != 'none': + q = self.q_norm(q) + k = self.k_norm(k) if rotary_pos_emb is not None and not has_context: freqs, _ = rotary_pos_emb @@ -364,9 +430,24 @@ class Attention(nn.Module): heads_per_kv_head = h // kv_h k, v = map(lambda t: t.repeat_interleave(heads_per_kv_head, dim = 1), (k, v)) - out = optimized_attention(q, k, v, h, skip_reshape=True, transformer_options=transformer_options) + if self.differential: + q, q_diff = q.unbind(dim=1) + k, k_diff = k.unbind(dim=1) + out = optimized_attention(q, k, v, h, skip_reshape=True, low_precision_attention=False, transformer_options=transformer_options) + out_diff = optimized_attention(q_diff, k_diff, v, h, skip_reshape=True, low_precision_attention=False, transformer_options=transformer_options) + out = out - out_diff + else: + out = optimized_attention(q, k, v, h, skip_reshape=True, low_precision_attention=False, transformer_options=transformer_options) + out = self.to_out(out) + if self.feat_scale: + out_dc = out.mean(dim=-2, keepdim=True) + out_hf = out - out_dc + + # Selectively modulate DC and high frequency components + out = out + comfy.ops.cast_to_input(self.lambda_dc, out) * out_dc + comfy.ops.cast_to_input(self.lambda_hf, out) * out_hf + if mask is not None: mask = rearrange(mask, 'b n -> b n 1') out = out.masked_fill(~mask, 0.) 
@@ -417,11 +498,14 @@ class TransformerBlock(nn.Module): cross_attend = False, dim_context = None, global_cond_dim = None, + global_cond_shared_embed = False, + local_add_cond_dim = None, causal = False, zero_init_branch_outputs = True, conformer = False, layer_ix = -1, remove_norms = False, + norm_type = "layer_norm", attn_kwargs = {}, ff_kwargs = {}, norm_kwargs = {}, @@ -436,8 +520,20 @@ class TransformerBlock(nn.Module): self.cross_attend = cross_attend self.dim_context = dim_context self.causal = causal + self.global_cond_shared_embed = global_cond_shared_embed - self.pre_norm = LayerNorm(dim, dtype=dtype, device=device, **norm_kwargs) if not remove_norms else nn.Identity() + norm_layer_map = { + "layer_norm": LayerNorm, + "rms_norm": RMSNorm, + } + norm_cls = norm_layer_map.get(norm_type, LayerNorm) + + def make_norm(): + if remove_norms: + return nn.Identity() + return norm_cls(dim, dtype=dtype, device=device, **norm_kwargs) + + self.pre_norm = make_norm() self.self_attn = Attention( dim, @@ -451,7 +547,7 @@ class TransformerBlock(nn.Module): ) if cross_attend: - self.cross_attend_norm = LayerNorm(dim, dtype=dtype, device=device, **norm_kwargs) if not remove_norms else nn.Identity() + self.cross_attend_norm = make_norm() self.cross_attn = Attention( dim, dim_heads = dim_heads, @@ -464,37 +560,56 @@ class TransformerBlock(nn.Module): **attn_kwargs ) - self.ff_norm = LayerNorm(dim, dtype=dtype, device=device, **norm_kwargs) if not remove_norms else nn.Identity() - self.ff = FeedForward(dim, zero_init_output=zero_init_branch_outputs, dtype=dtype, device=device, operations=operations,**ff_kwargs) + self.ff_norm = make_norm() + self.ff = FeedForward(dim, zero_init_output=zero_init_branch_outputs, dtype=dtype, device=device, operations=operations, **ff_kwargs) self.layer_ix = layer_ix self.conformer = ConformerModule(dim, norm_kwargs=norm_kwargs) if conformer else None - self.global_cond_dim = global_cond_dim + # Global conditioning + self.has_global_cond = 
(global_cond_dim is not None) or global_cond_shared_embed - if global_cond_dim is not None: + if global_cond_shared_embed: + # SA3 style: learnable per-block additive bias; global_cond is pre-projected to (B, dim*6) + self.to_scale_shift_gate = nn.Parameter(torch.empty(dim * 6, device=device, dtype=dtype)) + elif global_cond_dim is not None: + # SA1 style: per-block MLP projects global_cond → (B, dim*6) self.to_scale_shift_gate = nn.Sequential( nn.SiLU(), - nn.Linear(global_cond_dim, dim * 6, bias=False) + operations.Linear(global_cond_dim, dim * 6, bias=False, device=device, dtype=dtype) ) - nn.init.zeros_(self.to_scale_shift_gate[1].weight) - #nn.init.zeros_(self.to_scale_shift_gate_self[1].bias) + # Local additive conditioning (e.g. inpaint mask + masked latent) + self.local_add_cond_dim = local_add_cond_dim + if local_add_cond_dim is not None: + self.to_local_embed = nn.Sequential( + operations.Linear(local_add_cond_dim, dim, bias=True, dtype=dtype, device=device), + nn.SiLU(), + operations.Linear(dim, dim, bias=True, dtype=dtype, device=device), + ) + else: + self.to_local_embed = None def forward( self, x, context = None, global_cond=None, + local_add_cond=None, mask = None, context_mask = None, rotary_pos_emb = None, transformer_options={} ): - if self.global_cond_dim is not None and self.global_cond_dim > 0 and global_cond is not None: + if self.has_global_cond and global_cond is not None: + if self.global_cond_shared_embed: + # global_cond already has shape (B, dim*6) + ssg = (comfy.ops.cast_to_input(self.to_scale_shift_gate, global_cond) + global_cond).unsqueeze(1) + else: + ssg = self.to_scale_shift_gate(global_cond).unsqueeze(1) - scale_self, shift_self, gate_self, scale_ff, shift_ff, gate_ff = self.to_scale_shift_gate(global_cond).unsqueeze(1).chunk(6, dim = -1) + scale_self, shift_self, gate_self, scale_ff, shift_ff, gate_ff = ssg.chunk(6, dim = -1) # self-attention with adaLN residual = x @@ -510,6 +625,9 @@ class TransformerBlock(nn.Module): if 
self.conformer is not None: x = x + self.conformer(x) + if local_add_cond is not None and self.to_local_embed is not None: + x = x + _left_pad_to_match(self.to_local_embed(local_add_cond), x.shape[-2]) + # feedforward with adaLN residual = x x = self.ff_norm(x) @@ -527,6 +645,9 @@ class TransformerBlock(nn.Module): if self.conformer is not None: x = x + self.conformer(x) + if local_add_cond is not None and self.to_local_embed is not None: + x = x + _left_pad_to_match(self.to_local_embed(local_add_cond), x.shape[-2]) + x = x + self.ff(self.ff_norm(x)) return x @@ -543,6 +664,8 @@ class ContinuousTransformer(nn.Module): cross_attend=False, cond_token_dim=None, global_cond_dim=None, + global_cond_shared_embed=False, + local_add_cond_dim=None, causal=False, rotary_pos_emb=True, zero_init_branch_outputs=True, @@ -550,6 +673,7 @@ class ContinuousTransformer(nn.Module): use_sinusoidal_emb=False, use_abs_pos_emb=False, abs_pos_emb_max_length=10000, + num_memory_tokens=0, dtype=None, device=None, operations=None, @@ -562,6 +686,8 @@ class ContinuousTransformer(nn.Module): self.depth = depth self.causal = causal self.layers = nn.ModuleList([]) + self.num_memory_tokens = num_memory_tokens + self.global_cond_shared_embed = global_cond_shared_embed self.project_in = operations.Linear(dim_in, dim, bias=False, dtype=dtype, device=device) if dim_in is not None else nn.Identity() self.project_out = operations.Linear(dim, dim_out, bias=False, dtype=dtype, device=device) if dim_out is not None else nn.Identity() @@ -577,7 +703,22 @@ class ContinuousTransformer(nn.Module): self.use_abs_pos_emb = use_abs_pos_emb if use_abs_pos_emb: - self.pos_emb = AbsolutePositionalEmbedding(dim, abs_pos_emb_max_length) + self.pos_emb = AbsolutePositionalEmbedding(dim, abs_pos_emb_max_length + num_memory_tokens) + + if num_memory_tokens > 0: + self.memory_tokens = nn.Parameter(torch.empty(num_memory_tokens, dim, device=device, dtype=dtype)) + + # Shared global-cond embedder (SA3 style): projects (B, 
global_cond_dim) → (B, dim*6) + self.global_cond_embedder = None + if global_cond_shared_embed and global_cond_dim is not None: + self.global_cond_embedder = nn.Sequential( + operations.Linear(global_cond_dim, dim, bias=True, dtype=dtype, device=device), + nn.SiLU(), + operations.Linear(dim, dim * 6, bias=True, dtype=dtype, device=device), + ) + + # When using shared embed, TransformerBlocks use per-block Parameter (not per-block MLP) + block_global_cond_dim = None if global_cond_shared_embed else global_cond_dim for i in range(depth): self.layers.append( @@ -586,7 +727,9 @@ class ContinuousTransformer(nn.Module): dim_heads = dim_heads, cross_attend = cross_attend, dim_context = cond_token_dim, - global_cond_dim = global_cond_dim, + global_cond_dim = block_global_cond_dim, + global_cond_shared_embed = global_cond_shared_embed, + local_add_cond_dim = local_add_cond_dim, causal = causal, zero_init_branch_outputs = zero_init_branch_outputs, conformer=conformer, @@ -605,6 +748,7 @@ class ContinuousTransformer(nn.Module): prepend_embeds = None, prepend_mask = None, global_cond = None, + local_add_cond = None, return_info = False, **kwargs ): @@ -632,7 +776,9 @@ class ContinuousTransformer(nn.Module): mask = torch.cat((prepend_mask, mask), dim = -1) - # Attention layers + if self.num_memory_tokens > 0: + memory_tokens = comfy.ops.cast_to_input(self.memory_tokens, x).expand(batch, -1, -1) + x = torch.cat((memory_tokens, x), dim=1) if self.rotary_pos_emb is not None: rotary_pos_emb = self.rotary_pos_emb.forward_from_seq_len(x.shape[1], dtype=torch.float, device=x.device) @@ -642,6 +788,10 @@ class ContinuousTransformer(nn.Module): if self.use_sinusoidal_emb or self.use_abs_pos_emb: x = x + self.pos_emb(x) + # Project global_cond once (SA3 shared-embed path) + if global_cond is not None and self.global_cond_embedder is not None: + global_cond = self.global_cond_embedder(global_cond) + blocks_replace = patches_replace.get("dit", {}) # Iterate over the transformer layers for 
i, layer in enumerate(self.layers): @@ -654,12 +804,17 @@ class ContinuousTransformer(nn.Module): out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": global_cond, "pe": rotary_pos_emb, "transformer_options": transformer_options}, {"original_block": block_wrap}) x = out["img"] else: - x = layer(x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, context=context, transformer_options=transformer_options) - # x = checkpoint(layer, x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, **kwargs) + x = layer(x, rotary_pos_emb=rotary_pos_emb, global_cond=global_cond, + local_add_cond=local_add_cond, context=context, + transformer_options=transformer_options) if return_info: info["hidden_states"].append(x) + # Strip memory tokens before projecting out + if self.num_memory_tokens > 0: + x = x[:, self.num_memory_tokens:, :] + x = self.project_out(x) if return_info: @@ -682,6 +837,7 @@ class AudioDiffusionTransformer(nn.Module): num_heads=24, transformer_type: tp.Literal["continuous_transformer"] = "continuous_transformer", global_cond_type: tp.Literal["prepend", "adaLN"] = "prepend", + timestep_features_type: str = "learned", audio_model="", dtype=None, device=None, @@ -696,7 +852,10 @@ class AudioDiffusionTransformer(nn.Module): # Timestep embeddings timestep_features_dim = 256 - self.timestep_features = FourierFeatures(1, timestep_features_dim, dtype=dtype, device=device) + if timestep_features_type == "expo": + self.timestep_features = ExpoFourierFeatures(timestep_features_dim, 0.5, 10000.0) + else: + self.timestep_features = FourierFeatures(1, timestep_features_dim, dtype=dtype, device=device) self.to_timestep_embed = nn.Sequential( operations.Linear(timestep_features_dim, embed_dim, bias=True, dtype=dtype, device=device), @@ -781,6 +940,7 @@ class AudioDiffusionTransformer(nn.Module): cross_attn_cond=None, cross_attn_cond_mask=None, input_concat_cond=None, + local_add_cond=None, global_embed=None, prepend_cond=None, 
prepend_cond_mask=None, @@ -802,9 +962,13 @@ class AudioDiffusionTransformer(nn.Module): prepend_cond = self.to_prepend_embed(prepend_cond) prepend_inputs = prepend_cond + prepend_length = prepend_cond.shape[1] if prepend_cond_mask is not None: prepend_mask = prepend_cond_mask + if local_add_cond is not None and local_add_cond.dim() == 3: + local_add_cond = local_add_cond.permute(0, 2, 1) + if input_concat_cond is not None: # Interpolate input_concat_cond to the same length as x @@ -850,7 +1014,7 @@ class AudioDiffusionTransformer(nn.Module): if self.transformer_type == "x-transformers": output = self.transformer(x, prepend_embeds=prepend_inputs, context=cross_attn_cond, context_mask=cross_attn_cond_mask, mask=mask, prepend_mask=prepend_mask, **extra_args, **kwargs) elif self.transformer_type == "continuous_transformer": - output = self.transformer(x, prepend_embeds=prepend_inputs, context=cross_attn_cond, context_mask=cross_attn_cond_mask, mask=mask, prepend_mask=prepend_mask, return_info=return_info, **extra_args, **kwargs) + output = self.transformer(x, prepend_embeds=prepend_inputs, context=cross_attn_cond, context_mask=cross_attn_cond_mask, mask=mask, prepend_mask=prepend_mask, return_info=return_info, local_add_cond=local_add_cond, **extra_args, **kwargs) if return_info: output, info = output @@ -876,6 +1040,7 @@ class AudioDiffusionTransformer(nn.Module): context=None, context_mask=None, input_concat_cond=None, + local_add_cond=None, global_embed=None, negative_global_embed=None, prepend_cond=None, @@ -890,6 +1055,7 @@ class AudioDiffusionTransformer(nn.Module): cross_attn_cond=context, cross_attn_cond_mask=context_mask, input_concat_cond=input_concat_cond, + local_add_cond=local_add_cond, global_embed=global_embed, prepend_cond=prepend_cond, prepend_cond_mask=prepend_cond_mask, diff --git a/comfy/ldm/audio/embedders.py b/comfy/ldm/audio/embedders.py index 20edb365a..ba9a62837 100644 --- a/comfy/ldm/audio/embedders.py +++ b/comfy/ldm/audio/embedders.py @@ 
class ExpoFourierFeatures(nn.Module):
    """Fourier features on an exponentially spaced frequency grid.

    The module registers no parameters or buffers; the frequency grid is
    rebuilt on every call from ``min_freq`` / ``max_freq``.

    Args:
        dim: Requested feature width. ``dim // 2`` frequencies are used, so the
            output width is ``2 * (dim // 2)``.
        min_freq: Lowest frequency of the grid.
        max_freq: Highest frequency of the grid.
    """

    def __init__(self, dim, min_freq=0.5, max_freq=10000.0):
        super().__init__()
        self.dim, self.min_freq, self.max_freq = dim, min_freq, max_freq

    def forward(self, t):
        """Return ``cat(cos, sin)`` features of ``t``, cast back to ``t``'s dtype."""
        source_dtype = t.dtype
        # All math runs in float32 regardless of the incoming dtype.
        timesteps = t.float()
        if timesteps.dim() == 1:
            # Add a trailing axis so the values broadcast against the frequency grid.
            timesteps = timesteps.unsqueeze(-1)

        log_low = math.log(self.min_freq)
        log_high = math.log(self.max_freq)
        ramp = torch.linspace(0, 1, self.dim // 2, device=timesteps.device, dtype=torch.float32)
        # Linear interpolation in log space == geometric spacing of frequencies.
        freqs = torch.exp(ramp * (log_high - log_low) + log_low)

        phase = timesteps * freqs * 2 * math.pi
        features = torch.cat([phase.cos(), phase.sin()], dim=-1)
        return features.to(source_dtype)
b/comfy/ldm/audio/vae_sa3.py @@ -0,0 +1,533 @@ +import torch +import torch.nn as nn + +import comfy.ops +import comfy.model_management +from comfy.ldm.modules.attention import optimized_attention +from comfy.ldm.audio.autoencoder import WNConv1d + +ops = comfy.ops.disable_weight_init + +class Transpose(nn.Module): + def forward(self, x, **kwargs): + return x.transpose(-2, -1) + + +def _zero_pad_modulo_sequence(x, size, dim=-2): + input_len = x.shape[dim] + pad_len = (size - input_len % size) % size + if pad_len > 0: + pad_shape = list(x.shape) + pad_shape[dim] = pad_len + x = torch.cat([x, torch.zeros(pad_shape, device=x.device, dtype=x.dtype)], dim=dim) + return x + + +def _sliding_window_mask(seq_len, window, device, dtype): + """Additive attention mask enforcing a ±window local window (matches flash_attn window_size).""" + i = torch.arange(seq_len, device=device).unsqueeze(1) + j = torch.arange(seq_len, device=device).unsqueeze(0) + out_of_window = (j - i).abs() > window + return torch.where( + out_of_window, + torch.full((1,), torch.finfo(dtype).min / 4, device=device, dtype=dtype), + torch.zeros(1, device=device, dtype=dtype), + ) + + +class DynamicTanh(nn.Module): + def __init__(self, dim, init_alpha=4.0, dtype=None, device=None, **kwargs): + super().__init__() + self.alpha = nn.Parameter(torch.empty(1, dtype=dtype, device=device)) + self.gamma = nn.Parameter(torch.empty(dim, dtype=dtype, device=device)) + self.beta = nn.Parameter(torch.empty(dim, dtype=dtype, device=device)) + + def forward(self, x): + alpha = comfy.ops.cast_to_input(self.alpha, x) + gamma = comfy.ops.cast_to_input(self.gamma, x) + beta = comfy.ops.cast_to_input(self.beta, x) + return gamma * torch.tanh(alpha * x) + beta + + +class RotaryEmbedding(nn.Module): + def __init__(self, dim, base=10000, base_rescale_factor=1., dtype=None, device=None): + super().__init__() + base = base * base_rescale_factor ** (dim / (dim - 2)) + self.register_buffer("inv_freq", torch.empty(dim // 2, dtype=dtype, 
device=device)) + + def forward_from_seq_len(self, seq_len, device, dtype=None): + t = torch.arange(seq_len, device=device, dtype=torch.float32) + return self.forward(t) + + def forward(self, t): + freqs = torch.outer(t.float(), comfy.model_management.cast_to(self.inv_freq, dtype=torch.float32, device=t.device)) + freqs = torch.cat((freqs, freqs), dim=-1) + return freqs, 1. + + +def _rotate_half(x): + d = x.shape[-1] // 2 + return torch.cat((-x[..., d:], x[..., :d]), dim=-1) + + +def _apply_rotary_pos_emb(t, freqs): + out_dtype = t.dtype + rot_dim = freqs.shape[-1] + seq_len = t.shape[-2] + freqs = freqs[-seq_len:] + t_rot, t_pass = t[..., :rot_dim], t[..., rot_dim:] + t_rot = t_rot * freqs.cos() + _rotate_half(t_rot) * freqs.sin() + return torch.cat((t_rot.to(out_dtype), t_pass.to(out_dtype)), dim=-1) + + +class Attention(nn.Module): + def __init__(self, dim, dim_heads=64, qk_norm="none", qk_norm_eps=1e-6, + differential=False, zero_init_output=True, + dtype=None, device=None, operations=None, **kwargs): + super().__init__() + self.num_heads = dim // dim_heads + self.differential = differential + self.qk_norm = qk_norm + + self.to_qkv = operations.Linear( + dim, dim * (5 if differential else 3), bias=False, dtype=dtype, device=device) + self.to_out = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device) + + if qk_norm == "dyt": + self.q_norm = DynamicTanh(dim_heads, dtype=dtype, device=device) + self.k_norm = DynamicTanh(dim_heads, dtype=dtype, device=device) + elif qk_norm == "rms": + self.q_norm = operations.RMSNorm(dim_heads, eps=qk_norm_eps, dtype=dtype, device=device) + self.k_norm = operations.RMSNorm(dim_heads, eps=qk_norm_eps, dtype=dtype, device=device) + + def forward(self, x, rotary_pos_emb=None, mask=None, **kwargs): + B, N, _ = x.shape + h = self.num_heads + + qkv = self.to_qkv(x) + if self.differential: + q, k, v, q_diff, k_diff = qkv.chunk(5, dim=-1) + del qkv + q = q.view(B, N, h, -1).transpose(1, 2) + k = k.view(B, N, h, 
-1).transpose(1, 2) + v = v.view(B, N, h, -1).transpose(1, 2) + q_diff = q_diff.view(B, N, h, -1).transpose(1, 2) + k_diff = k_diff.view(B, N, h, -1).transpose(1, 2) + else: + q, k, v = qkv.chunk(3, dim=-1) + del qkv + q = q.view(B, N, h, -1).transpose(1, 2) + k = k.view(B, N, h, -1).transpose(1, 2) + v = v.view(B, N, h, -1).transpose(1, 2) + + if self.qk_norm != "none": + q_dtype, k_dtype = q.dtype, k.dtype + q = self.q_norm(q).to(q_dtype) + k = self.k_norm(k).to(k_dtype) + if self.differential: + q_diff = self.q_norm(q_diff).to(q_dtype) + k_diff = self.k_norm(k_diff).to(k_dtype) + + if rotary_pos_emb is not None: + freqs, _ = rotary_pos_emb + q_dtype, k_dtype = q.dtype, k.dtype + q = _apply_rotary_pos_emb(q.float(), freqs).to(q_dtype) + k = _apply_rotary_pos_emb(k.float(), freqs).to(k_dtype) + if self.differential: + q_diff = _apply_rotary_pos_emb(q_diff.float(), freqs).to(q_dtype) + k_diff = _apply_rotary_pos_emb(k_diff.float(), freqs).to(k_dtype) + + if self.differential: + out = (optimized_attention(q, k, v, h, mask=mask, skip_reshape=True, low_precision_attention=False) + - optimized_attention(q_diff, k_diff, v, h, mask=mask, skip_reshape=True, low_precision_attention=False)) + del q, k, v, q_diff, k_diff + else: + out = optimized_attention(q, k, v, h, mask=mask, skip_reshape=True, low_precision_attention=False) + del q, k, v + + return self.to_out(out) + + +class _Sin(nn.Module): + def forward(self, x): + return torch.sin(3.14159265359 * x) + + +class _GLU(nn.Module): + def __init__(self, dim_in, dim_out, activation, dtype=None, device=None, operations=None): + super().__init__() + self.act = activation + self.proj = operations.Linear(dim_in, dim_out * 2, dtype=dtype, device=device) + + def forward(self, x): + x = self.proj(x) + x, gate = x.chunk(2, dim=-1) + return x * self.act(gate) + + +class FeedForward(nn.Module): + def __init__(self, dim, mult=4, no_bias=False, zero_init_output=True, + sinusoidal=False, dtype=None, device=None, operations=None, 
class TransformerBlock(nn.Module):
    """Pre-norm residual block: self-attention followed by a feed-forward layer.

    Args:
        dim: Model width.
        dim_heads: Per-head width; clamped to at most ``dim``.
        causal: Accepted for signature compatibility; not read in this class.
        zero_init_branch_outputs: Forwarded as ``zero_init_output`` to both branches.
        norm_type: ``"dyt"`` selects ``DynamicTanh``; any other value selects
            ``operations.RMSNorm``.
        add_rope: When true, a ``RotaryEmbedding`` of width ``dim_heads // 2`` is
            created and applied on every forward pass.
        attn_kwargs, ff_kwargs, norm_kwargs: Extra keyword arguments for the
            attention, feed-forward and norm constructors (``None`` means none).
        dtype, device, operations: Forwarded to the sub-modules.
        **kwargs: Accepted and ignored.
    """

    def __init__(self, dim, dim_heads=64, causal=False, zero_init_branch_outputs=True,
                 norm_type="dyt", add_rope=False, attn_kwargs=None, ff_kwargs=None,
                 norm_kwargs=None, dtype=None, device=None, operations=None, **kwargs):
        super().__init__()
        attn_kwargs = {} if attn_kwargs is None else attn_kwargs
        ff_kwargs = {} if ff_kwargs is None else ff_kwargs
        norm_kwargs = {} if norm_kwargs is None else norm_kwargs

        head_width = min(dim_heads, dim)

        if norm_type == "dyt":
            norm_cls = DynamicTanh
        else:
            norm_cls = operations.RMSNorm
        # dtype/device are applied last, so they override same-named norm_kwargs entries.
        norm_args = dict(norm_kwargs)
        norm_args.update(dtype=dtype, device=device)

        # Attribute names and creation order are kept as-is: they define the
        # state-dict keys (pre_norm, self_attn, ff_norm, ff, rope).
        self.pre_norm = norm_cls(dim, **norm_args)
        self.self_attn = Attention(
            dim,
            dim_heads=head_width,
            zero_init_output=zero_init_branch_outputs,
            dtype=dtype,
            device=device,
            operations=operations,
            **attn_kwargs,
        )
        self.ff_norm = norm_cls(dim, **norm_args)
        self.ff = FeedForward(
            dim,
            zero_init_output=zero_init_branch_outputs,
            dtype=dtype,
            device=device,
            operations=operations,
            **ff_kwargs,
        )
        if add_rope:
            self.rope = RotaryEmbedding(head_width // 2, dtype=dtype, device=device)
        else:
            self.rope = None

    def forward(self, x, mask=None, **kwargs):
        """Apply the attention and feed-forward residual branches to ``x``."""
        rotary = None
        if self.rope is not None:
            # Rotary table is sized from the second-to-last axis of x.
            rotary = self.rope.forward_from_seq_len(x.shape[-2], device=x.device)

        attended = self.self_attn(self.pre_norm(x), rotary_pos_emb=rotary, mask=mask)
        x = x + attended
        return x + self.ff(self.ff_norm(x))
ff_mult=3, mapping_bias=True, variable_stride=False, + sinusoidal_blocks=0, conv_mapping=False, dtype=None, device=None, operations=None, **kwargs): + super().__init__() + if type not in ("encoder", "decoder"): + raise ValueError(f"type must be 'encoder' or 'decoder', got {type!r}") + + self.type = type + self.stride = stride + self.chunk_size = chunk_size + self.chunk_midpoint_shift = chunk_midpoint_shift + self.variable_stride = variable_stride + self.transformer_depth = transformer_depth + + transformer_dim = out_channels if type == "encoder" else in_channels + + self.mapping = (WNConv1d(in_channels, out_channels, 3 if conv_mapping else 1, padding="same", bias=mapping_bias) + if in_channels != out_channels else nn.Identity()) + + self.sliding_window_latents = sliding_window + self.sliding_window_seq = self._get_sliding_window_size(sliding_window, stride) + self.input_seg_size, self.output_seg_size, self.sub_chunk_size = self._get_seg_sizes(stride) + + token_seq = 1 if variable_stride else self.output_seg_size + self.new_tokens = nn.Parameter(torch.empty(1, token_seq, transformer_dim, dtype=dtype, device=device)) + + norm_type = "dyt" if dyt else "rms_norm" + attn_kwargs = {"qk_norm": "dyt" if dyt else "rms", "qk_norm_eps": 1e-3, + "differential": differential} + norm_kwargs = {"eps": 1e-3} + transformers = [] + for i in range(transformer_depth): + sinusoidal = (transformer_depth - i) < sinusoidal_blocks + transformers.append(TransformerBlock( + transformer_dim, + dim_heads=dim_heads, + causal=False, + zero_init_branch_outputs=True, + norm_type=norm_type, + add_rope=True, + attn_kwargs=attn_kwargs, + ff_kwargs={"mult": ff_mult, "no_bias": False, "sinusoidal": sinusoidal}, + norm_kwargs=norm_kwargs, + dtype=dtype, device=device, operations=operations, + )) + self.transformers = nn.ModuleList(transformers) + + def _get_sliding_window_size(self, window, stride, prepend_cond_length=0): + if window is None: + return None + return [w * (stride + 1 + 
prepend_cond_length) for w in window] + + def _get_seg_sizes(self, stride, prepend_cond_length=0): + sub_chunk_size = stride + 1 + prepend_cond_length + input_seg_size = stride if self.type == "encoder" else 1 + output_seg_size = 1 if self.type == "encoder" else stride + return input_seg_size, output_seg_size, sub_chunk_size + + def forward(self, x, stride=None, **kwargs): + B = x.shape[0] + + if stride is None: + input_seg = self.input_seg_size + output_seg = self.output_seg_size + sub_chunk = self.sub_chunk_size + sliding_window = self.sliding_window_seq + else: + input_seg, output_seg, sub_chunk = self._get_seg_sizes(stride) + sliding_window = self._get_sliding_window_size(self.sliding_window_latents, stride) + + if self.type == "encoder": + if self.transformer_depth > 0: + pad_mod = self.chunk_size if sliding_window is None else input_seg + x = _zero_pad_modulo_sequence(x, pad_mod, dim=-1) + x = self.mapping(x) + + if self.transformer_depth > 0: + x = x.permute(0, 2, 1) + + if self.type != "encoder": + pad_mod = 1 if sliding_window is not None else ( + self.chunk_size // (stride if stride is not None else self.stride)) + x = _zero_pad_modulo_sequence(x, pad_mod) + + C = x.shape[2] + x = x.reshape(-1, input_seg, C) + + new_tokens = self.new_tokens.expand(x.shape[0], output_seg, -1) + x = torch.cat([x, comfy.ops.cast_to_input(new_tokens, x)], dim=-2) + del new_tokens + + x = x.reshape(B, -1, C) + + if sliding_window is None: + eff_chunk = self.chunk_size + self.chunk_size // (stride if stride is not None else self.stride) + + if sliding_window is None and self.chunk_midpoint_shift: + split = self.transformer_depth // 2 + shift = eff_chunk // 2 + + x = x.reshape(-1, eff_chunk, C) + for layer in self.transformers[:split]: + x = layer(x) + x = x.reshape(B, -1, C) + + shifted = torch.cat([x[:, :shift, :], x, x[:, -shift:, :]], dim=1) + del x + x = shifted.reshape(-1, eff_chunk, C) + del shifted + for layer in self.transformers[split:]: + x = layer(x) + x = 
class SAMEEncoder(nn.Module):
    """Encoder stack: resampling transformer stages followed by a latent projection.

    One ``TransformerResamplingBlock`` (``type="encoder"``) is built per entry
    of ``c_mults``; stage ``i`` maps ``channel_dims[i] -> channel_dims[i + 1]``
    where ``channel_dims = [in_channels] + [channels * c for c in c_mults]``.
    The stages are followed by ``Transpose -> Linear -> Transpose``, which
    projects the final width to ``latent_dim``.

    Args:
        in_channels: Channel count of the input.
        channels: Base width multiplied by each entry of ``c_mults``.
        latent_dim: Output width of the final linear projection.
        c_mults: Per-stage width multipliers.
        strides: Per-stage stride, indexed like ``c_mults``.
        transformer_depths: Per-stage transformer depth, indexed like ``c_mults``.
        dtype, device, operations: Forwarded to the sub-modules.
        **kwargs: Forwarded to every ``TransformerResamplingBlock``.
    """

    def __init__(self, in_channels=2, channels=128, latent_dim=32,
                 c_mults=(1, 2, 4, 8), strides=(2, 4, 8, 8),
                 transformer_depths=(3, 3, 3, 3),
                 dtype=None, device=None, operations=None, **kwargs):
        super().__init__()
        widths = [in_channels, *(channels * mult for mult in c_mults)]

        stages = [
            TransformerResamplingBlock(
                in_channels=width_in,
                out_channels=width_out,
                stride=strides[idx],
                type="encoder",
                transformer_depth=transformer_depths[idx],
                dtype=dtype, device=device, operations=operations, **kwargs)
            for idx, (width_in, width_out) in enumerate(zip(widths, widths[1:]))
        ]
        # The Linear acts on the last axis, so the tensor is transposed around it.
        projection = [
            Transpose(),
            operations.Linear(widths[-1], latent_dim, dtype=dtype, device=device),
            Transpose(),
        ]
        self.layers = nn.ModuleList(stages + projection)

    def forward(self, x, **kwargs):
        """Run ``x`` through every layer in order; extra kwargs are ignored."""
        hidden = x
        for stage in self.layers:
            hidden = stage(hidden)
        return hidden
layers.append(TransformerResamplingBlock( + in_channels=channel_dims[i + 1], out_channels=channel_dims[i], + stride=strides[i], type="decoder", + transformer_depth=transformer_depths[i], + sinusoidal_blocks=sinusoidal_blocks[i], + dtype=dtype, device=device, operations=operations, **kwargs)) + self.layers = nn.ModuleList(layers) + + def forward(self, x, **kwargs): + for layer in self.layers: + x = layer(x) + return x + + +class SoftNormBottleneck(nn.Module): + def __init__(self, dim=32, noise_augment_dim=0, noise_regularize=False, + auto_scale=False, freeze=False, dtype=None, device=None, **kwargs): + super().__init__() + self.noise_augment_dim = noise_augment_dim + self.noise_regularize = noise_regularize + self.scaling_factor = nn.Parameter(torch.empty(1, dim, 1, dtype=dtype, device=device)) + self.bias = nn.Parameter(torch.empty(1, dim, 1, dtype=dtype, device=device)) + self.noise_scaling_factor = nn.Parameter(torch.empty(1, noise_augment_dim, 1, dtype=dtype, device=device)) + if auto_scale: + self.register_parameter("running_std", nn.Parameter( + torch.empty(1, dtype=dtype, device=device), requires_grad=False)) + if freeze: + for p in self.parameters(): + p.requires_grad = False + + def encode(self, x, return_info=False, **kwargs): + x = x * comfy.ops.cast_to_input(self.scaling_factor, x) \ + + comfy.ops.cast_to_input(self.bias, x) + if hasattr(self, "running_std"): + x = x / comfy.ops.cast_to_input(self.running_std, x) + if return_info: + return x, {} + return x + + def decode(self, x, **kwargs): + if hasattr(self, "running_std"): + x = x * comfy.ops.cast_to_input(self.running_std, x) + if self.noise_regularize: + scaling = self.running_std if hasattr(self, "running_std") \ + else x.std(dim=-1, keepdim=True) + noise = torch.randn_like(x) * comfy.ops.cast_to_input(scaling, x) * 1e-3 + x = x + noise + if self.noise_augment_dim > 0: + noise = comfy.ops.cast_to_input(self.noise_scaling_factor, x) * torch.randn( + x.shape[0], self.noise_augment_dim, x.shape[-1], 
class PatchedPretransform(nn.Module):
    """Parameter-free patching that folds fixed-size time patches into channels.

    ``encode`` maps ``(B, C, T)`` to ``(B, C * patch_size, L)`` with
    ``L = ceil(T / patch_size)``; ``decode`` applies the inverse reshape and
    returns ``(B, C, L * patch_size)`` (zero padding added by ``encode`` is not
    trimmed here).

    Args:
        channels: Stored on the instance; not read inside this class.
        patch_size: Number of consecutive time steps folded into one patch.
        **kwargs: Accepted and ignored.
    """

    def __init__(self, channels, patch_size, **kwargs):
        super().__init__()
        self.channels = channels
        self.patch_size = patch_size
        self.enable_grad = False

    def _pad(self, x):
        """Right-pad the last axis of ``x`` with zeros to a multiple of ``patch_size``."""
        pad_len = (self.patch_size - x.shape[-1] % self.patch_size) % self.patch_size
        if pad_len > 0:
            # Allocate the padding by shape. Slicing the input to get a template
            # (x[..., :pad_len]) is capped at the input length, which under-pads
            # whenever the input is shorter than the padding it needs.
            padding = x.new_zeros((*x.shape[:-1], pad_len))
            x = torch.cat([x, padding], dim=-1)
        return x

    def encode(self, x):
        """Pad ``x`` of shape ``(B, C, T)`` and fold patches into the channel axis.

        Output channel ``c * patch_size + k`` at position ``l`` holds input
        channel ``c`` at time ``l * patch_size + k``.
        """
        x = self._pad(x)
        B, C, T = x.shape
        h = self.patch_size
        L = T // h
        # b c (l h) -> b (c h) l
        return x.reshape(B, C, L, h).permute(0, 1, 3, 2).reshape(B, C * h, L)

    def decode(self, x):
        """Unfold ``(B, C * patch_size, L)`` back to ``(B, C, L * patch_size)``."""
        B, Ch, L = x.shape
        h = self.patch_size
        C = Ch // h
        # b (c h) l -> b c (l h)
        return x.reshape(B, C, h, L).permute(0, 1, 3, 2).reshape(B, C, L * h)
conv_mapping=decoder_conv_mapping, **common_kwargs, + ) + self.bottleneck = SoftNormBottleneck( + dim=256, noise_augment_dim=0, noise_regularize=True, + auto_scale=True, freeze=True, + dtype=dtype, device=device, + ) + + @torch.no_grad() + def _pretransform_encode(self, x): + return self.pretransform.encode(x) + + @torch.no_grad() + def _pretransform_decode(self, x): + return self.pretransform.decode(x) + + def encode(self, x): + x = self._pretransform_encode(x) + x = self.encoder(x) + x = self.bottleneck.encode(x) + return x + + def decode(self, x): + x = self.bottleneck.decode(x) + x = self.decoder(x) + x = self._pretransform_decode(x) + return x diff --git a/comfy/ldm/hunyuan3dv2_1/hunyuandit.py b/comfy/ldm/hunyuan3dv2_1/hunyuandit.py index f67ba84e9..4e4819fe3 100644 --- a/comfy/ldm/hunyuan3dv2_1/hunyuandit.py +++ b/comfy/ldm/hunyuan3dv2_1/hunyuandit.py @@ -328,7 +328,7 @@ class CrossAttention(nn.Module): kv = torch.cat((k, v), dim=-1) split_size = kv.shape[-1] // self.num_heads // 2 - kv = kv.view(1, -1, self.num_heads, split_size * 2) + kv = kv.view(b, -1, self.num_heads, split_size * 2) k, v = torch.split(kv, split_size, dim=-1) q = q.view(b, s1, self.num_heads, self.head_dim) @@ -398,7 +398,7 @@ class Attention(nn.Module): qkv_combined = torch.cat((query, key, value), dim=-1) split_size = qkv_combined.shape[-1] // self.num_heads // 3 - qkv = qkv_combined.view(1, -1, self.num_heads, split_size * 3) + qkv = qkv_combined.view(B, -1, self.num_heads, split_size * 3) query, key, value = torch.split(qkv, split_size, dim=-1) query = query.reshape(B, N, self.num_heads, self.head_dim) @@ -607,9 +607,13 @@ class HunYuanDiTPlain(nn.Module): def forward(self, x, t, context, transformer_options = {}, **kwargs): x = x.movedim(-1, -2) - uncond_emb, cond_emb = context.chunk(2, dim = 0) - context = torch.cat([cond_emb, uncond_emb], dim = 0) + swap_cfg_halves = context.shape[0] >= 2 + + if swap_cfg_halves: + first_half, second_half = context.chunk(2, dim = 0) + context = 
torch.cat([second_half, first_half], dim = 0) + main_condition = context t = 1.0 - t @@ -657,5 +661,8 @@ class HunYuanDiTPlain(nn.Module): output = self.final_layer(combined) output = output.movedim(-2, -1) * (-1.0) - cond_emb, uncond_emb = output.chunk(2, dim = 0) - return torch.cat([uncond_emb, cond_emb]) + if swap_cfg_halves: + first_half, second_half = output.chunk(2, dim = 0) + output = torch.cat([second_half, first_half], dim = 0) + + return output diff --git a/comfy/ldm/lens/model.py b/comfy/ldm/lens/model.py new file mode 100644 index 000000000..cd5015ddc --- /dev/null +++ b/comfy/ldm/lens/model.py @@ -0,0 +1,510 @@ +"""Lens denoising transformer (DiT)""" + +from __future__ import annotations + +from typing import Any, Dict, Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + +import comfy.ldm.flux.layers +import comfy.patcher_extension +from comfy.ldm.flux.layers import EmbedND +from comfy.ldm.flux.math import apply_rope +from comfy.ldm.modules.attention import optimized_attention + + +def _lens_time_proj(t: torch.Tensor, dim: int = 256) -> torch.Tensor: + return comfy.ldm.flux.layers.timestep_embedding(t, dim) + + +def _lens_position_ids( + frame: int, height: int, width: int, text_seq_len: int, + scale_rope: bool = True, device=None, +) -> torch.Tensor: + """Lens axial (frame, h, w) position ids for joint image + text sequence. + + With ``scale_rope=True`` h/w are centered around 0 (negative + positive + halves) and text starts at ``max(h//2, w//2)``. Result shape ``[seq, 3]``; + caller adds a batch dim for ``EmbedND``. 
+ """ + if scale_rope: + h_pos = torch.cat([torch.arange(-(height - height // 2), 0, device=device), + torch.arange(0, height // 2, device=device)]) + w_pos = torch.cat([torch.arange(-(width - width // 2), 0, device=device), + torch.arange(0, width // 2, device=device)]) + text_start = max(height // 2, width // 2) + else: + h_pos = torch.arange(height, device=device) + w_pos = torch.arange(width, device=device) + text_start = max(height, width) + + f_pos = torch.arange(frame, device=device) + img_ids = torch.zeros(frame, height, width, 3, device=device) + img_ids[..., 0] = f_pos[:, None, None] + img_ids[..., 1] = h_pos[None, :, None] + img_ids[..., 2] = w_pos[None, None, :] + img_ids = img_ids.reshape(-1, 3) + + # Text positions replicate across all 3 axes (matches original packing). + txt_pos = torch.arange(text_start, text_start + text_seq_len, device=device).float() + txt_ids = txt_pos[:, None].expand(text_seq_len, 3) + + return torch.cat([img_ids, txt_ids], dim=0) + + +class _TimestepEmbedder(nn.Module): + def __init__(self, in_channels: int, time_embed_dim: int, dtype=None, device=None, operations=None) -> None: + super().__init__() + self.linear_1 = operations.Linear(in_channels, time_embed_dim, dtype=dtype, device=device) + self.linear_2 = operations.Linear(time_embed_dim, time_embed_dim, dtype=dtype, device=device) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.linear_1(x) + x = F.silu(x) + return self.linear_2(x) + + +class LensTimestepProjEmbeddings(nn.Module): + def __init__(self, embedding_dim: int, dtype=None, device=None, operations=None) -> None: + super().__init__() + self.timestep_embedder = _TimestepEmbedder(256, embedding_dim, dtype=dtype, device=device, operations=operations) + + def forward(self, timestep: torch.Tensor, hidden_states: torch.Tensor) -> torch.Tensor: + proj = _lens_time_proj(timestep, 256) + return self.timestep_embedder(proj.to(dtype=hidden_states.dtype)) + + +class GateMLP(nn.Module): + """SwiGLU MLP.""" + + 
class LensJointAttention(nn.Module):
    """Joint attention over concatenated image and text tokens.

    Each stream has its own fused QKV projection, its own per-head RMSNorm on
    Q and K, and its own output projection. Attention runs once over the
    ``[image, text]`` sequence.
    """

    def __init__(
        self,
        query_dim: int,
        added_kv_proj_dim: int,
        dim_head: int = 64,
        heads: int = 8,
        out_dim: Optional[int] = None,
        eps: float = 1e-5,
        dtype=None,
        device=None,
        operations=None,
    ) -> None:
        super().__init__()
        if out_dim is None:
            self.inner_dim = dim_head * heads
            self.out_dim = query_dim
        else:
            self.inner_dim = out_dim
            self.out_dim = out_dim
        self.heads = self.inner_dim // dim_head
        self.dim_head = dim_head

        factory = {"dtype": dtype, "device": device}

        # Per-head norms: image stream first, then the "added" (text) stream.
        self.norm_q = operations.RMSNorm(dim_head, eps=eps, **factory)
        self.norm_k = operations.RMSNorm(dim_head, eps=eps, **factory)
        self.norm_added_q = operations.RMSNorm(dim_head, eps=eps, **factory)
        self.norm_added_k = operations.RMSNorm(dim_head, eps=eps, **factory)

        fused_width = 3 * self.inner_dim
        self.img_qkv = operations.Linear(query_dim, fused_width, bias=True, **factory)
        self.txt_qkv = operations.Linear(added_kv_proj_dim, fused_width, bias=True, **factory)

        # Kept as ModuleList([Linear, Identity]) so state-dict keys stay
        # ``to_out.0.*``.
        self.to_out = nn.ModuleList([
            operations.Linear(self.inner_dim, self.out_dim, bias=True, **factory),
            nn.Identity(),
        ])
        self.to_add_out = operations.Linear(self.inner_dim, query_dim, bias=True, **factory)

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        freqs_cis: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        transformer_options: Optional[Dict[str, Any]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Attend jointly and return ``(image_out, text_out)``.

        ``attention_mask``, when given, must have shape
        ``(batch, 1, 1, seq_img + seq_txt)``; a ``ValueError`` is raised
        otherwise. It is cast to the query dtype before use.
        """
        bsz, seq_img, _ = hidden_states.shape
        seq_txt = encoder_hidden_states.shape[1]
        split_heads = (3, self.heads, self.dim_head)

        # Image stream: fused projection -> per-head q / k / v, norm on q and k.
        img_q, img_k, img_v = (
            self.img_qkv(hidden_states).view(bsz, seq_img, *split_heads).unbind(dim=2)
        )
        img_q = self.norm_q(img_q)
        img_k = self.norm_k(img_k)

        # Text stream, same recipe with its own weights.
        txt_q, txt_k, txt_v = (
            self.txt_qkv(encoder_hidden_states).view(bsz, seq_txt, *split_heads).unbind(dim=2)
        )
        txt_q = self.norm_added_q(txt_q)
        txt_k = self.norm_added_k(txt_k)

        # Concatenate along the sequence axis, then move heads forward:
        # [B, S, H, D] -> [B, H, S, D]. Inputs are dropped as soon as they are
        # merged to keep peak VRAM down.
        q = torch.cat([img_q, txt_q], dim=1).transpose(1, 2)
        del img_q, txt_q
        k = torch.cat([img_k, txt_k], dim=1).transpose(1, 2)
        del img_k, txt_k
        v = torch.cat([img_v, txt_v], dim=1).transpose(1, 2)
        del img_v, txt_v

        q, k = apply_rope(q, k, freqs_cis)

        if attention_mask is not None:
            expected = (bsz, 1, 1, seq_img + seq_txt)
            if attention_mask.shape != expected:
                raise ValueError(
                    f"attention_mask must be {expected}, got {tuple(attention_mask.shape)}"
                )
            attention_mask = attention_mask.to(q.dtype)

        out = optimized_attention(
            q, k, v, self.heads, mask=attention_mask, skip_reshape=True,
            transformer_options=transformer_options,
        )

        img_out = out[:, :seq_img, :]
        for layer in self.to_out:
            img_out = layer(img_out)
        txt_out = self.to_add_out(out[:, seq_img:, :])
        return img_out, txt_out
self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + eps: float = 1e-6, + rms_norm: bool = True, + dtype=None, + device=None, + operations=None, + ) -> None: + super().__init__() + + self.attn = LensJointAttention( + query_dim=dim, + added_kv_proj_dim=dim, + dim_head=attention_head_dim, + heads=num_attention_heads, + out_dim=dim, + eps=1e-5, + dtype=dtype, + device=device, + operations=operations, + ) + + if rms_norm: + NormCls = operations.RMSNorm + norm_kwargs = {} + else: + NormCls = operations.LayerNorm + norm_kwargs = {"elementwise_affine": False} + + mlp_hidden = int(dim / 3 * 8) + + # Sequential(SiLU, Linear) so state-dict lands at img_mod.1.{weight,bias}. + self.img_mod = nn.Sequential( + nn.SiLU(), + operations.Linear(dim, 6 * dim, bias=True, dtype=dtype, device=device), + ) + self.img_norm1 = NormCls(dim, eps=eps, dtype=dtype, device=device, **norm_kwargs) + self.img_norm2 = NormCls(dim, eps=eps, dtype=dtype, device=device, **norm_kwargs) + self.img_mlp = GateMLP(dim, mlp_hidden, dtype=dtype, device=device, operations=operations) + + self.txt_mod = nn.Sequential( + nn.SiLU(), + operations.Linear(dim, 6 * dim, bias=True, dtype=dtype, device=device), + ) + self.txt_norm1 = NormCls(dim, eps=eps, dtype=dtype, device=device, **norm_kwargs) + self.txt_norm2 = NormCls(dim, eps=eps, dtype=dtype, device=device, **norm_kwargs) + self.txt_mlp = GateMLP(dim, mlp_hidden, dtype=dtype, device=device, operations=operations) + + @staticmethod + def _modulate(x: torch.Tensor, mod_params: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + shift, scale, gate = mod_params.chunk(3, dim=-1) + return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1), gate.unsqueeze(1) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + temb: torch.Tensor, + freqs_cis: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + transformer_options: Optional[Dict[str, Any]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: 
+ img_mod1, img_mod2 = self.img_mod(temb).chunk(2, dim=-1) + txt_mod1, txt_mod2 = self.txt_mod(temb).chunk(2, dim=-1) + + img_modulated, img_gate1 = self._modulate(self.img_norm1(hidden_states), img_mod1) + txt_modulated, txt_gate1 = self._modulate(self.txt_norm1(encoder_hidden_states), txt_mod1) + + img_attn, txt_attn = self.attn( + hidden_states=img_modulated, + encoder_hidden_states=txt_modulated, + freqs_cis=freqs_cis, + attention_mask=attention_mask, + transformer_options=transformer_options, + ) + + hidden_states = hidden_states + img_gate1 * img_attn + encoder_hidden_states = encoder_hidden_states + txt_gate1 * txt_attn + + img_modulated2, img_gate2 = self._modulate(self.img_norm2(hidden_states), img_mod2) + hidden_states = hidden_states + img_gate2 * self.img_mlp(img_modulated2) + + txt_modulated2, txt_gate2 = self._modulate(self.txt_norm2(encoder_hidden_states), txt_mod2) + encoder_hidden_states = encoder_hidden_states + txt_gate2 * self.txt_mlp(txt_modulated2) + + return encoder_hidden_states, hidden_states + + +class _AdaLayerNormContinuousNoAffine(nn.Module): + """AdaLayerNormContinuous(elementwise_affine=False). + + The reference uses ``scale, shift = chunk(2)`` (scale first) — opposite + to Flux's ``LastLayer``. 
+ """ + + def __init__(self, embedding_dim: int, conditioning_embedding_dim: int, eps: float = 1e-6, + dtype=None, device=None, operations=None) -> None: + super().__init__() + self.linear = operations.Linear( + conditioning_embedding_dim, embedding_dim * 2, bias=True, dtype=dtype, device=device + ) + self.eps = eps + self.embedding_dim = embedding_dim + + def forward(self, x: torch.Tensor, conditioning: torch.Tensor) -> torch.Tensor: + emb = self.linear(F.silu(conditioning)) + scale, shift = torch.chunk(emb, 2, dim=-1) + x = F.layer_norm(x, (self.embedding_dim,), None, None, self.eps) + return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1) + + +class LensTransformer2DModel(nn.Module): + """Lens dual-stream MMDiT (48 blocks, inner_dim=1536, multi-layer text).""" + + def __init__( + self, + patch_size: int = 2, + in_channels: int = 128, + out_channels: Optional[int] = 32, + num_layers: int = 48, + attention_head_dim: int = 64, + num_attention_heads: int = 24, + enc_hidden_dim: int = 2880, + axes_dims_rope: Tuple[int, int, int] = (8, 28, 28), + rms_norm: bool = True, + multi_layer_encoder_feature: bool = True, + selected_layer_index: Tuple[int, ...] = (5, 11, 17, 23), + image_model=None, # unused; accepted for detection-side configs. 
+ dtype=None, + device=None, + operations=None, + ) -> None: + super().__init__() + self.patch_size = patch_size + self.in_channels = in_channels + self.out_channels = out_channels if out_channels is not None else in_channels + self.inner_dim = num_attention_heads * attention_head_dim + self.multi_layer_encoder_feature = multi_layer_encoder_feature + self.selected_layer_index = list(selected_layer_index) + self.dtype = dtype + + self.pos_embed = EmbedND(dim=attention_head_dim, theta=10000, axes_dim=list(axes_dims_rope)) + self.time_text_embed = LensTimestepProjEmbeddings( + embedding_dim=self.inner_dim, dtype=dtype, device=device, operations=operations + ) + + if self.multi_layer_encoder_feature: + self.txt_norm = nn.ModuleList( + [operations.RMSNorm(enc_hidden_dim, eps=1e-5, dtype=dtype, device=device) + for _ in self.selected_layer_index] + ) + self.txt_in = operations.Linear( + enc_hidden_dim * len(self.selected_layer_index), + self.inner_dim, bias=True, dtype=dtype, device=device, + ) + else: + self.txt_norm = operations.RMSNorm(enc_hidden_dim, eps=1e-5, dtype=dtype, device=device) + self.txt_in = operations.Linear(enc_hidden_dim, self.inner_dim, bias=True, dtype=dtype, device=device) + + self.img_in = operations.Linear(in_channels, self.inner_dim, bias=True, dtype=dtype, device=device) + + self.transformer_blocks = nn.ModuleList([ + LensTransformerBlock( + dim=self.inner_dim, + num_attention_heads=num_attention_heads, + attention_head_dim=attention_head_dim, + eps=1e-6, + rms_norm=rms_norm, + dtype=dtype, device=device, operations=operations, + ) + for _ in range(num_layers) + ]) + + self.norm_out = _AdaLayerNormContinuousNoAffine( + self.inner_dim, self.inner_dim, eps=1e-6, + dtype=dtype, device=device, operations=operations, + ) + self.proj_out = operations.Linear( + self.inner_dim, patch_size * patch_size * self.out_channels, bias=True, + dtype=dtype, device=device, + ) + + def forward(self, x: torch.Tensor, timestep: torch.Tensor, context: torch.Tensor, 
attention_mask: Optional[torch.Tensor] = None, + transformer_options: Optional[Dict[str, Any]] = None, **kwargs) -> torch.Tensor: + if transformer_options is None: + transformer_options = {} + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options), + ).execute(x, timestep, context, attention_mask, transformer_options, **kwargs) + + def _forward( + self, + x: torch.Tensor, + timestep: torch.Tensor, + context: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + transformer_options: Optional[Dict[str, Any]] = None, + control: Optional[Dict[str, Any]] = None, + **kwargs, + ) -> torch.Tensor: + """ComfyUI bridge: ``(x[B,128,h,w], t[B], context[B,S,L*H], mask[B,S])``.""" + if transformer_options is None: + transformer_options = {} + transformer_options = transformer_options.copy() + patches = transformer_options.get("patches", {}) + patches_replace = transformer_options.get("patches_replace", {}) + blocks_replace = patches_replace.get("dit", {}) + + B, C, h, w = x.shape + hidden_states = x.permute(0, 2, 3, 1).reshape(B, h * w, C) + + if self.multi_layer_encoder_feature: + L = len(self.selected_layer_index) + enc_dim = context.shape[-1] // L + encoder_hidden_states = list( + context.reshape(B, -1, L, enc_dim).unbind(dim=2) + ) + text_seq_len = encoder_hidden_states[0].shape[1] + else: + encoder_hidden_states = context + text_seq_len = context.shape[1] + + if attention_mask is None: + attention_mask = torch.ones( + (B, text_seq_len), dtype=torch.bool, device=x.device + ) + + img_len = h * w + joint_mask = self._build_joint_attention_mask(attention_mask, img_len) + + hidden_states = self.img_in(hidden_states) + timestep = timestep.to(hidden_states.dtype) + + if self.multi_layer_encoder_feature: + normed = [self.txt_norm[i](encoder_hidden_states[i]) for i in range(L)] + encoder_hidden_states = torch.cat(normed, 
dim=-1) + else: + encoder_hidden_states = self.txt_norm(encoder_hidden_states) + encoder_hidden_states = self.txt_in(encoder_hidden_states) + + if "post_input" in patches: + for p in patches["post_input"]: + out = p({ + "img": hidden_states, + "txt": encoder_hidden_states, + "transformer_options": transformer_options, + }) + hidden_states = out["img"] + encoder_hidden_states = out["txt"] + + temb = self.time_text_embed(timestep, hidden_states) + ids = _lens_position_ids(1, h, w, text_seq_len, device=hidden_states.device).unsqueeze(0) + freqs_cis = self.pos_embed(ids) + + transformer_options["total_blocks"] = len(self.transformer_blocks) + transformer_options["block_type"] = "double" + for i, block in enumerate(self.transformer_blocks): + transformer_options["block_index"] = i + if ("double_block", i) in blocks_replace: + def block_wrap(args): + out = {} + out["txt"], out["img"] = block( + hidden_states=args["img"], + encoder_hidden_states=args["txt"], + temb=args["vec"], + freqs_cis=args["pe"], + attention_mask=args.get("attn_mask"), + transformer_options=args.get("transformer_options"), + ) + return out + out = blocks_replace[("double_block", i)]( + { + "img": hidden_states, + "txt": encoder_hidden_states, + "vec": temb, + "pe": freqs_cis, + "attn_mask": joint_mask, + "transformer_options": transformer_options, + }, + {"original_block": block_wrap}, + ) + encoder_hidden_states = out["txt"] + hidden_states = out["img"] + else: + encoder_hidden_states, hidden_states = block( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + temb=temb, + freqs_cis=freqs_cis, + attention_mask=joint_mask, + transformer_options=transformer_options, + ) + + if "double_block" in patches: + for p in patches["double_block"]: + out = p({ + "img": hidden_states, + "txt": encoder_hidden_states, + "x": x, + "block_index": i, + "transformer_options": transformer_options, + }) + hidden_states = out["img"] + encoder_hidden_states = out["txt"] + + if control is not 
None: + control_i = control.get("input") + if control_i is not None and i < len(control_i): + add = control_i[i] + if add is not None: + hidden_states[:, :add.shape[1]] += add + + hidden_states = self.norm_out(hidden_states, temb) + out = self.proj_out(hidden_states) + return out.reshape(B, h, w, C).permute(0, 3, 1, 2).contiguous() + + @staticmethod + def _build_joint_attention_mask(text_mask: torch.Tensor, img_len: int) -> torch.Tensor: + if text_mask.dtype != torch.bool: + text_mask = text_mask.bool() + bsz = text_mask.shape[0] + img_ones = torch.ones((bsz, img_len), dtype=torch.bool, device=text_mask.device) + joint = torch.cat([img_ones, text_mask], dim=1) + additive = torch.zeros_like(joint, dtype=torch.float32) + additive.masked_fill_(~joint, torch.finfo(torch.float32).min) + return additive[:, None, None, :] diff --git a/comfy/ldm/lightricks/av_model.py b/comfy/ldm/lightricks/av_model.py index bc09fb77e..ef9938465 100644 --- a/comfy/ldm/lightricks/av_model.py +++ b/comfy/ldm/lightricks/av_model.py @@ -767,25 +767,25 @@ class LTXAVModel(LTXVModel): # Cross-attention timesteps - compress these too av_ca_audio_scale_shift_timestep, _ = self.av_ca_audio_scale_shift_adaln_single( - timestep.max().expand_as(a_timestep_flat), + a_timestep_flat, {"resolution": None, "aspect_ratio": None}, batch_size=batch_size, hidden_dtype=hidden_dtype, ) av_ca_video_scale_shift_timestep, _ = self.av_ca_video_scale_shift_adaln_single( - a_timestep.max().expand_as(timestep_flat), + timestep_flat, {"resolution": None, "aspect_ratio": None}, batch_size=batch_size, hidden_dtype=hidden_dtype, ) av_ca_a2v_gate_noise_timestep, _ = self.av_ca_a2v_gate_adaln_single( - a_timestep.max().expand_as(timestep_flat) * av_ca_factor, + a_timestep_scaled.max().expand_as(timestep_flat) * av_ca_factor, {"resolution": None, "aspect_ratio": None}, batch_size=batch_size, hidden_dtype=hidden_dtype, ) av_ca_v2a_gate_noise_timestep, _ = self.av_ca_v2a_gate_adaln_single( - 
timestep.max().expand_as(a_timestep_flat) * av_ca_factor, + timestep_scaled.max().expand_as(a_timestep_flat) * av_ca_factor, {"resolution": None, "aspect_ratio": None}, batch_size=batch_size, hidden_dtype=hidden_dtype, diff --git a/comfy/ldm/lightricks/vae/causal_audio_autoencoder.py b/comfy/ldm/lightricks/vae/causal_audio_autoencoder.py index b556b128f..58b67d45a 100644 --- a/comfy/ldm/lightricks/vae/causal_audio_autoencoder.py +++ b/comfy/ldm/lightricks/vae/causal_audio_autoencoder.py @@ -1,4 +1,3 @@ -from __future__ import annotations import torch from torch import nn from torch.nn import functional as F diff --git a/comfy/ldm/lightricks/vae/causal_video_autoencoder.py b/comfy/ldm/lightricks/vae/causal_video_autoencoder.py index 998122c85..5975015e2 100644 --- a/comfy/ldm/lightricks/vae/causal_video_autoencoder.py +++ b/comfy/ldm/lightricks/vae/causal_video_autoencoder.py @@ -1,4 +1,3 @@ -from __future__ import annotations import threading import torch from torch import nn diff --git a/comfy/ldm/lumina/model.py b/comfy/ldm/lumina/model.py index 9e432d5c0..d0ee97d33 100644 --- a/comfy/ldm/lumina/model.py +++ b/comfy/ldm/lumina/model.py @@ -1,5 +1,4 @@ # Code from: https://github.com/Alpha-VLLM/Lumina-Image-2.0/blob/main/models/model.py -from __future__ import annotations from typing import List, Optional, Tuple diff --git a/comfy/ldm/modules/attention.py b/comfy/ldm/modules/attention.py index a68cb8439..55360535a 100644 --- a/comfy/ldm/modules/attention.py +++ b/comfy/ldm/modules/attention.py @@ -741,12 +741,12 @@ optimized_attention = attention_basic if model_management.sage_attention_enabled(): logging.info("Using sage attention") optimized_attention = attention_sage -elif model_management.xformers_enabled(): - logging.info("Using xformers attention") - optimized_attention = attention_xformers elif model_management.flash_attention_enabled(): logging.info("Using Flash Attention") optimized_attention = attention_flash +elif model_management.xformers_enabled(): + 
logging.info("Using xformers attention") + optimized_attention = attention_xformers elif model_management.pytorch_attention_enabled(): logging.info("Using pytorch attention") optimized_attention = attention_pytorch diff --git a/comfy/ldm/modules/diffusionmodules/mmdit.py b/comfy/ldm/modules/diffusionmodules/mmdit.py index 0dc8fe789..9ab3c463c 100644 --- a/comfy/ldm/modules/diffusionmodules/mmdit.py +++ b/comfy/ldm/modules/diffusionmodules/mmdit.py @@ -211,7 +211,7 @@ class TimestepEmbedder(nn.Module): Embeds scalar timesteps into vector representations. """ - def __init__(self, hidden_size, frequency_embedding_size=256, output_size=None, dtype=None, device=None, operations=None): + def __init__(self, hidden_size, frequency_embedding_size=256, output_size=None, dtype=None, device=None, operations=None, max_period=10000): super().__init__() if output_size is None: output_size = hidden_size @@ -221,9 +221,10 @@ class TimestepEmbedder(nn.Module): operations.Linear(hidden_size, output_size, bias=True, dtype=dtype, device=device), ) self.frequency_embedding_size = frequency_embedding_size + self.max_period = max_period def forward(self, t, dtype, **kwargs): - t_freq = timestep_embedding(t, self.frequency_embedding_size).to(dtype) + t_freq = timestep_embedding(t, self.frequency_embedding_size, max_period=self.max_period).to(dtype) t_emb = self.mlp(t_freq) return t_emb diff --git a/comfy/ldm/moge/geometry.py b/comfy/ldm/moge/geometry.py index 7fdc97871..d1a1e445f 100644 --- a/comfy/ldm/moge/geometry.py +++ b/comfy/ldm/moge/geometry.py @@ -1,6 +1,5 @@ """Pure-torch + scipy geometry helpers for MoGe inference and mesh export.""" -from __future__ import annotations from typing import Optional, Tuple diff --git a/comfy/ldm/moge/model.py b/comfy/ldm/moge/model.py index 6876c4af2..1695626bc 100644 --- a/comfy/ldm/moge/model.py +++ b/comfy/ldm/moge/model.py @@ -4,7 +4,6 @@ V1: DINOv2 backbone + multi-output head (points, mask). 
V2: DINOv2 encoder + neck + per-output heads (points, mask, normal, optional metric-scale MLP). """ -from __future__ import annotations from numbers import Number from typing import Any, Dict, List, Optional, Tuple, Union diff --git a/comfy/ldm/moge/modules.py b/comfy/ldm/moge/modules.py index 235a59212..f6443d65a 100644 --- a/comfy/ldm/moge/modules.py +++ b/comfy/ldm/moge/modules.py @@ -1,6 +1,5 @@ """Building blocks for MoGe: residual conv stack, resamplers, MLP, DINOv2 encoder, v1 head.""" -from __future__ import annotations from typing import List, Optional, Sequence, Tuple, Union diff --git a/comfy/ldm/moge/panorama.py b/comfy/ldm/moge/panorama.py index de53ebe68..18d0cb665 100644 --- a/comfy/ldm/moge/panorama.py +++ b/comfy/ldm/moge/panorama.py @@ -6,7 +6,6 @@ equirect distance map via a multi-scale Poisson + gradient sparse solve. Image sampling uses F.grid_sample (GPU); the sparse solve uses lsmr (CPU). """ -from __future__ import annotations from typing import Callable, List, Optional, Tuple diff --git a/comfy/ldm/pixeldit/model.py b/comfy/ldm/pixeldit/model.py new file mode 100644 index 000000000..b044b9b29 --- /dev/null +++ b/comfy/ldm/pixeldit/model.py @@ -0,0 +1,239 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +import comfy.ldm.common_dit +import comfy.patcher_extension +from comfy.ldm.flux.math import apply_rope, rope +from comfy.ldm.hidream.model import FeedForwardSwiGLU +from comfy.ldm.modules.attention import optimized_attention +from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder + +from .modules import ( + FinalLayer, + PatchTokenEmbedder, + PiTBlock, + PixelTokenEmbedder, + apply_adaln_, + precompute_freqs_cis_2d, +) + + +class MMDiTJointAttention(nn.Module): + """Joint MMDiT attention with separate Q/K/V/proj for image and text streams. + + RoPE is applied to each stream before concatenation so each stream uses its own + 2D/1D positional encoding. Concat order is [text, image] (text first). 
+ """ + def __init__(self, dim, num_heads=8, qkv_bias=False, dtype=None, device=None, operations=None): + super().__init__() + assert dim % num_heads == 0 + self.num_heads = num_heads + self.head_dim = dim // num_heads + + self.qkv_x = operations.Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype, device=device) + self.qkv_y = operations.Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype, device=device) + + self.q_norm_x = operations.RMSNorm(self.head_dim, eps=1e-6, dtype=dtype, device=device) + self.k_norm_x = operations.RMSNorm(self.head_dim, eps=1e-6, dtype=dtype, device=device) + self.q_norm_y = operations.RMSNorm(self.head_dim, eps=1e-6, dtype=dtype, device=device) + self.k_norm_y = operations.RMSNorm(self.head_dim, eps=1e-6, dtype=dtype, device=device) + + self.proj_x = operations.Linear(dim, dim, dtype=dtype, device=device) + self.proj_y = operations.Linear(dim, dim, dtype=dtype, device=device) + + def forward(self, x, y, pos_img, pos_txt=None, attn_mask=None, transformer_options={}): + B, Nx, _ = x.shape + _, Ny, _ = y.shape + H = self.num_heads + D = self.head_dim + + qkv_x = self.qkv_x(x).reshape(B, Nx, 3, H, D).permute(2, 0, 3, 1, 4) + qx, kx, vx = qkv_x.unbind(0) + qx = self.q_norm_x(qx) + kx = self.k_norm_x(kx) + + qkv_y = self.qkv_y(y).reshape(B, Ny, 3, H, D).permute(2, 0, 3, 1, 4) + qy, ky, vy = qkv_y.unbind(0) + qy = self.q_norm_y(qy) + ky = self.k_norm_y(ky) + + qx, kx = apply_rope(qx, kx, pos_img[None, None]) + if pos_txt is not None: + qy, ky = apply_rope(qy, ky, pos_txt[None, None]) + + q_joint = torch.cat([qy, qx], dim=2) + k_joint = torch.cat([ky, kx], dim=2) + v_joint = torch.cat([vy, vx], dim=2) + + out_joint = optimized_attention( + q_joint, k_joint, v_joint, H, + mask=attn_mask, skip_reshape=True, skip_output_reshape=True, + transformer_options=transformer_options, + ) + + out_y = out_joint[:, :, :Ny, :].transpose(1, 2).reshape(B, Ny, H * D) + out_x = out_joint[:, :, Ny:, :].transpose(1, 2).reshape(B, Nx, H * D) + + return self.proj_x(out_x), 
class MMDiTBlockT2I(nn.Module):
    """Dual-stream MMDiT block: joint attention plus one SwiGLU MLP per stream.

    ``x`` is the image stream and ``y`` the text stream. Each stream has its
    own norms, MLP and adaLN modulation; the two only interact inside the
    shared joint attention.
    """

    def __init__(self, hidden_size, groups, mlp_ratio=4.0, dtype=None, device=None, operations=None):
        super().__init__()
        factory = dict(dtype=dtype, device=device)
        ff_dim = int(hidden_size * mlp_ratio)

        def rms_norm():
            return operations.RMSNorm(hidden_size, eps=1e-6, **factory)

        def swiglu():
            return FeedForwardSwiGLU(hidden_size, ff_dim, multiple_of=1, operations=operations, **factory)

        def modulation():
            # Single-element Sequential, so the Linear's state-dict keys carry
            # a ``.0`` index.
            return nn.Sequential(operations.Linear(hidden_size, 6 * hidden_size, bias=True, **factory))

        # Submodules are registered in the same order as before.
        self.norm_x1 = rms_norm()
        self.norm_y1 = rms_norm()
        self.attn = MMDiTJointAttention(hidden_size, num_heads=groups, qkv_bias=False, operations=operations, **factory)
        self.norm_x2 = rms_norm()
        self.norm_y2 = rms_norm()
        self.mlp_x = swiglu()
        self.mlp_y = swiglu()
        self.adaLN_modulation_img = modulation()
        self.adaLN_modulation_txt = modulation()

    def forward(self, x, y, c, pos_img, pos_txt=None, attn_mask=None, transformer_options={}):
        """Run one block and return the updated ``(x, y)`` streams.

        ``c`` is the conditioning fed to both adaLN projections; each yields
        six chunks: shift / scale / gate for attention, then for the MLP.
        """
        (attn_shift_x, attn_scale_x, attn_gate_x,
         ff_shift_x, ff_scale_x, ff_gate_x) = self.adaLN_modulation_img(c).chunk(6, dim=-1)
        (attn_shift_y, attn_scale_y, attn_gate_y,
         ff_shift_y, ff_scale_y, ff_gate_y) = self.adaLN_modulation_txt(c).chunk(6, dim=-1)

        # Attention sub-layer: modulate the normed streams, attend jointly,
        # then add the gated result back onto each stream.
        attn_in_x = apply_adaln_(self.norm_x1(x), attn_shift_x, attn_scale_x)
        attn_in_y = apply_adaln_(self.norm_y1(y), attn_shift_y, attn_scale_y)
        attn_out_x, attn_out_y = self.attn(
            attn_in_x, attn_in_y, pos_img, pos_txt, attn_mask,
            transformer_options=transformer_options,
        )
        x = torch.addcmul(x, attn_gate_x, attn_out_x)
        y = torch.addcmul(y, attn_gate_y, attn_out_y)

        # MLP sub-layer, applied to each stream separately.
        ff_in_x = apply_adaln_(self.norm_x2(x), ff_shift_x, ff_scale_x)
        x = torch.addcmul(x, ff_gate_x, self.mlp_x(ff_in_x))
        ff_in_y = apply_adaln_(self.norm_y2(y), ff_shift_y, ff_scale_y)
        y = torch.addcmul(y, ff_gate_y, self.mlp_y(ff_in_y))
        return x, y
PatchTokenEmbedder(self.txt_embed_dim, self.hidden_size, bias=True, use_norm=True, dtype=dtype, device=device, operations=operations) + self.y_pos_embedding = nn.Parameter(torch.empty(1, self.txt_max_length, self.hidden_size, dtype=dtype, device=device)) + + self.patch_blocks = nn.ModuleList([ + MMDiTBlockT2I(self.hidden_size, self.num_groups, + dtype=dtype, device=device, operations=operations) + for _ in range(self.patch_depth) + ]) + self.pixel_blocks = nn.ModuleList([ + PiTBlock( + self.pixel_hidden_size, + self.hidden_size, + patch_size=self.patch_size, + num_heads=self.num_groups, + attn_hidden_size=self.pixel_attn_hidden_size, + attn_num_heads=self.pixel_num_groups, + dtype=dtype, device=device, operations=operations, + mlp_chunks=pixel_mlp_chunks, + ) + for _ in range(self.pixel_depth) + ]) + + self.final_layer = FinalLayer(self.pixel_hidden_size, self.out_channels, dtype=dtype, device=device, operations=operations) + + def _fetch_patch_pos(self, height, width, device, dtype, **rope_opts): + return precompute_freqs_cis_2d(self.hidden_size // self.num_groups, height, width, device=device, dtype=dtype, **rope_opts) + + def _fetch_text_pos(self, length, device, dtype): + return rope(torch.arange(length, dtype=torch.float32, device=device).reshape(1, -1), self.hidden_size // self.num_groups, self.text_rope_theta).squeeze(0).to(dtype=dtype) + + def forward(self, x, timesteps, context=None, attention_mask=None, transformer_options={}, **kwargs): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, self, comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options), + ).execute(x, timesteps, context, attention_mask, transformer_options, **kwargs) + + def _pre_patch_block(self, s, i, **kwargs): + """Hook for subclasses to inject per-block state into the patch stream (e.g. 
PiD's LQ gate).""" + return s + + def _forward(self, x, timesteps, context=None, attention_mask=None, transformer_options={}, **kwargs): + H_orig, W_orig = x.shape[2], x.shape[3] + x = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size)) + B, _, H, W = x.shape + Hs = H // self.patch_size + Ws = W // self.patch_size + L = Hs * Ws + + pos_img = self._fetch_patch_pos(Hs, Ws, x.device, x.dtype, **(transformer_options.get("rope_options") or {})) + x_patches = F.unfold(x, kernel_size=self.patch_size, stride=self.patch_size).transpose(1, 2) + + t_emb = self.t_embedder(timesteps.view(-1), x.dtype).view(B, -1, self.hidden_size) + + if context is None or context.dim() != 3: + raise ValueError("PixDiT_T2I requires context (text embeddings) of shape [B, L, D]") + Ltxt = min(context.shape[1], self.txt_max_length) + y = context[:, :Ltxt, :] + y_emb = self.y_embedder(y).view(B, Ltxt, self.hidden_size) + y_emb = y_emb + self.y_pos_embedding[:, :Ltxt, :].to(y_emb) # y_pos_embedding is a raw nn.Parameter + + condition = F.silu(t_emb) + pos_txt = self._fetch_text_pos(Ltxt, x.device, x.dtype) if self.use_text_rope else None + + s = self.s_embedder(x_patches) + for i, blk in enumerate(self.patch_blocks): + s = self._pre_patch_block(s, i, **kwargs) + s, y_emb = blk(s, y_emb, condition, pos_img, pos_txt, None, transformer_options=transformer_options) + s = F.silu(t_emb + s) + + s_cond = s.view(B * L, self.hidden_size) + x_pixels = self.pixel_embedder(x, patch_size=self.patch_size) + for blk in self.pixel_blocks: + x_pixels = blk(x_pixels, s_cond, H, W, self.patch_size, mask=None, transformer_options=transformer_options) + + x_pixels = self.final_layer(x_pixels) + C_out = self.out_channels + P2 = self.patch_size * self.patch_size + x_pixels = x_pixels.view(B, L, P2, C_out).permute(0, 3, 2, 1).reshape(B, C_out * P2, L) + out = F.fold(x_pixels, (H, W), kernel_size=self.patch_size, stride=self.patch_size) + return out[:, :, :H_orig, :W_orig] diff --git 
a/comfy/ldm/pixeldit/modules.py b/comfy/ldm/pixeldit/modules.py new file mode 100644 index 000000000..4b1e538c7 --- /dev/null +++ b/comfy/ldm/pixeldit/modules.py @@ -0,0 +1,187 @@ +import torch +import torch.nn as nn + +from comfy.ldm.flux.math import apply_rope, rope +from comfy.ldm.modules.attention import optimized_attention +from comfy.ldm.modules.diffusionmodules.mmdit import Mlp, get_1d_sincos_pos_embed_from_grid_torch + + +def apply_adaln_(x, shift, scale): + return x.addcmul_(x, scale).add_(shift) + + +def precompute_freqs_cis_2d(dim, height, width, theta=10000.0, scale=16.0, + ref_grid_h=None, ref_grid_w=None, + scale_x=1.0, scale_y=1.0, shift_x=0.0, shift_y=0.0, + device=None, dtype=torch.float32, **kwargs): + """2D RoPE with x/y axis frequencies interleaved at stride 2 across head dim. + + rope_options: + scale_x / scale_y multiply the position range (RoPE extrapolation). + shift_x / shift_y offset the position origin (tiled / regional inference). + With ref_grid_h/w set, also applies NTK-aware per-axis theta scaling + (rope_mode='ntk_aware'): theta_axis = theta * (current/ref)^(dim_axis/(dim_axis-2)). + Returns Flux-format rotation matrices of shape [H*W, dim/2, 2, 2]. + Layout of head-dim pairs: [x_0, y_0, x_1, y_1, ..., x_{dim/4-1}, y_{dim/4-1}]. 
+ """ + dim_axis = dim // 2 + if ref_grid_h is not None and dim_axis > 2: + h_ntk = (height / ref_grid_h) ** (dim_axis / (dim_axis - 2)) + w_ntk = (width / ref_grid_w) ** (dim_axis / (dim_axis - 2)) + else: + h_ntk = w_ntk = 1.0 + + x_lin = torch.linspace(shift_x, scale * scale_x + shift_x, width, device=device) + y_lin = torch.linspace(shift_y, scale * scale_y + shift_y, height, device=device) + y_grid, x_grid = torch.meshgrid(y_lin, x_lin, indexing="ij") + x_rope = rope(x_grid.reshape(1, -1), dim_axis, theta * w_ntk).squeeze(0) + y_rope = rope(y_grid.reshape(1, -1), dim_axis, theta * h_ntk).squeeze(0) + out = torch.stack([x_rope, y_rope], dim=2).reshape(height * width, dim // 2, 2, 2) + return out.to(dtype=dtype) + + +def get_2d_sincos_pos_embed(embed_dim, height, width, device=None, dtype=torch.float32): + """Standard 2D sin/cos absolute positional embedding (ViT-style). + + first half encodes W-coordinates, second half H. + """ + assert embed_dim % 4 == 0 + grid_h = torch.arange(height, dtype=torch.float32, device=device) + grid_w = torch.arange(width, dtype=torch.float32, device=device) + grid_y, grid_x = torch.meshgrid(grid_h, grid_w, indexing="ij") + emb_w = get_1d_sincos_pos_embed_from_grid_torch(embed_dim // 2, grid_x.reshape(-1), device=device) + emb_h = get_1d_sincos_pos_embed_from_grid_torch(embed_dim // 2, grid_y.reshape(-1), device=device) + return torch.cat([emb_w, emb_h], dim=1).to(dtype=dtype) + + +class RotaryAttention(nn.Module): + """Single-stream self-attention with rotary positional encoding (used inside PiTBlock).""" + def __init__(self, dim, num_heads=8, qkv_bias=False, dtype=None, device=None, operations=None): + super().__init__() + assert dim % num_heads == 0 + self.num_heads = num_heads + self.head_dim = dim // num_heads + self.qkv = operations.Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype, device=device) + self.q_norm = operations.RMSNorm(self.head_dim, eps=1e-6, dtype=dtype, device=device) + self.k_norm = 
operations.RMSNorm(self.head_dim, eps=1e-6, dtype=dtype, device=device) + self.proj = operations.Linear(dim, dim, dtype=dtype, device=device) + + def forward(self, x, pos, mask=None, transformer_options={}): + B, N, C = x.shape + H = self.num_heads + D = self.head_dim + qkv = self.qkv(x).reshape(B, N, 3, H, D).permute(2, 0, 3, 1, 4) + q, k, v = qkv.unbind(0) + q, k = apply_rope(self.q_norm(q), self.k_norm(k), pos[None, None]) + x = optimized_attention(q, k, v, H, mask=mask, skip_reshape=True, transformer_options=transformer_options) + return self.proj(x) + + +class FinalLayer(nn.Module): + def __init__(self, hidden_size, out_channels, dtype=None, device=None, operations=None): + super().__init__() + self.norm = operations.RMSNorm(hidden_size, eps=1e-6, dtype=dtype, device=device) + self.linear = operations.Linear(hidden_size, out_channels, bias=True, dtype=dtype, device=device) + + def forward(self, x): + return self.linear(self.norm(x)) + + +class PatchTokenEmbedder(nn.Module): + """Linear projection used both for patchified-image tokens and text-feature tokens.""" + def __init__(self, in_chans, embed_dim, use_norm=False, bias=True, dtype=None, device=None, operations=None): + super().__init__() + self.proj = operations.Linear(in_chans, embed_dim, bias=bias, dtype=dtype, device=device) + self.norm = operations.RMSNorm(embed_dim, eps=1e-6, dtype=dtype, device=device) if use_norm else nn.Identity() + + def forward(self, x): + return self.norm(self.proj(x)) + + +class PixelTokenEmbedder(nn.Module): + """Pixel-level embedder: lifts each RGB pixel to hidden_size and packs into per-patch sequences.""" + def __init__(self, in_channels, hidden_size_output, dtype=None, device=None, operations=None): + super().__init__() + self.in_channels = in_channels + self.hidden_size_output = hidden_size_output + self.proj = operations.Linear(self.in_channels, self.hidden_size_output, bias=True, dtype=dtype, device=device) + + def forward(self, inputs, patch_size): + B, _, H, W = 
inputs.shape + Hs, Ws = H // patch_size, W // patch_size + P2 = patch_size * patch_size + x = inputs.permute(0, 2, 3, 1).contiguous() + x = self.proj(x) + pos_full = get_2d_sincos_pos_embed(self.hidden_size_output, H, W, device=x.device, dtype=x.dtype).view(H, W, self.hidden_size_output) + x = x + pos_full.unsqueeze(0) + x = x.view(B, Hs, patch_size, Ws, patch_size, self.hidden_size_output) + return x.permute(0, 1, 3, 2, 4, 5).reshape(B * Hs * Ws, P2, self.hidden_size_output) + + +class PiTBlock(nn.Module): + """Pixel-level transformer block. + + Compresses each patch's P^2 pixel tokens → 1 attention token via a linear, + runs global self-attention across patches with 2D RoPE, then expands back to P^2 tokens. + Conditioning is per-pixel adaLN from the patch-level features. + """ + def __init__(self, pixel_hidden_size, patch_hidden_size, patch_size, num_heads, mlp_ratio=4.0, + attn_hidden_size=None, attn_num_heads=None, dtype=None, device=None, operations=None, mlp_chunks=1): + super().__init__() + self.pixel_dim = pixel_hidden_size + self.context_dim = patch_hidden_size + self.attn_dim = attn_hidden_size if attn_hidden_size is not None else patch_hidden_size + self.num_heads = attn_num_heads if attn_num_heads is not None else num_heads + assert self.attn_dim % self.num_heads == 0 + + p2 = patch_size * patch_size + self.compress_to_attn = operations.Linear(p2 * self.pixel_dim, self.attn_dim, bias=True, dtype=dtype, device=device) + self.expand_from_attn = operations.Linear(self.attn_dim, p2 * self.pixel_dim, bias=True, dtype=dtype, device=device) + + self.norm1 = operations.RMSNorm(self.pixel_dim, eps=1e-6, dtype=dtype, device=device) + self.attn = RotaryAttention(self.attn_dim, num_heads=self.num_heads, qkv_bias=False, dtype=dtype, device=device, operations=operations) + self.norm2 = operations.RMSNorm(self.pixel_dim, eps=1e-6, dtype=dtype, device=device) + self.mlp = Mlp(self.pixel_dim, hidden_features=int(self.pixel_dim * mlp_ratio), dtype=dtype, device=device, 
operations=operations) + + self.adaLN_modulation_msa = operations.Linear(self.context_dim, 3 * self.pixel_dim * p2, bias=True, dtype=dtype, device=device) + self.adaLN_modulation_mlp = operations.Linear(self.context_dim, 3 * self.pixel_dim * p2, bias=True, dtype=dtype, device=device) + + self._rope_fn = precompute_freqs_cis_2d + self.mlp_chunks = max(1, int(mlp_chunks)) + + def _fetch_pos(self, height, width, device, dtype, **rope_opts): + return self._rope_fn(self.attn_dim // self.num_heads, height, width, device=device, dtype=dtype, **rope_opts) + + def forward(self, x, s_cond, image_height, image_width, patch_size, mask=None, transformer_options={}): + BL, P2, _ = x.shape + Hs, Ws = image_height // patch_size, image_width // patch_size + L = Hs * Ws + B = BL // L + + # Attention path uses only msa params; compute, use, free before mlp params allocate. + msa_params = self.adaLN_modulation_msa(s_cond).view(BL, P2, 3 * self.pixel_dim) + shift_msa, scale_msa, gate_msa = msa_params.chunk(3, dim=-1) + + x_norm = apply_adaln_(self.norm1(x), shift_msa, scale_msa) + x_flat = x_norm.view(BL, P2 * self.pixel_dim) + + x_comp = self.compress_to_attn(x_flat).view(B, L, self.attn_dim) + pos_comp = self._fetch_pos(Hs, Ws, x.device, x.dtype, **(transformer_options.get("rope_options") or {})) + attn_out = self.attn(x_comp, pos_comp, mask=mask, transformer_options=transformer_options) + attn_flat = self.expand_from_attn(attn_out.view(B * L, self.attn_dim)) + attn_exp = attn_flat.view(BL, P2, self.pixel_dim) + x = torch.addcmul(x, gate_msa, attn_exp) + del msa_params, shift_msa, scale_msa, gate_msa + + mlp_params = self.adaLN_modulation_mlp(s_cond).view(BL, P2, 3 * self.pixel_dim) + shift_mlp, scale_mlp, gate_mlp = mlp_params.chunk(3, dim=-1) + gate_mlp = gate_mlp.contiguous() # detach from mlp_params so the del below frees shift+scale storage before the MLP + mlp_input = apply_adaln_(self.norm2(x), shift_mlp, scale_mlp) + del mlp_params, shift_mlp, scale_mlp + + # MLP in chunks 
since the peak memory usage is huge here + chunk_size = (BL + self.mlp_chunks - 1) // self.mlp_chunks + for s in range(0, BL, chunk_size): + e = min(s + chunk_size, BL) + x[s:e].addcmul_(gate_mlp[s:e], self.mlp(mlp_input[s:e])) + return x diff --git a/comfy/ldm/pixeldit/pid.py b/comfy/ldm/pixeldit/pid.py new file mode 100644 index 000000000..21b73907a --- /dev/null +++ b/comfy/ldm/pixeldit/pid.py @@ -0,0 +1,227 @@ +"""PiD — Pixel Diffusion Decoder. Decodes a Flux/SD3/Flux2/Z-Image latent +directly to a 4x-upscaled image in 4 distilled flow-matching steps. PixDiT_T2I +body + LQ projection branch injected before each MMDiT patch block. +""" + +from typing import List + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .model import PixDiT_T2I +from .modules import precompute_freqs_cis_2d + + +class SigmaAwareGatePerTokenPerDim(nn.Module): + """gate = sigmoid(content_proj(cat[x, lq]) - exp(log_alpha) * sigma); out = x + gate * lq. + + Trained init gives ~0.88 gate at sigma=0, ~0.05 at sigma=1. + """ + + def __init__(self, dim: int, dtype=None, device=None, operations=None): + super().__init__() + self.content_proj = operations.Linear(dim * 2, dim, dtype=dtype, device=device) + self.log_alpha = nn.Parameter(torch.empty((), dtype=dtype, device=device)) + + def forward(self, x: torch.Tensor, lq: torch.Tensor, sigma: torch.Tensor) -> torch.Tensor: + content_logit = self.content_proj(torch.cat([x, lq], dim=-1)) + # log_alpha is a raw nn.Parameter -> doesn't auto-cast under dynamic VRAM. 
+ log_alpha = self.log_alpha.to(device=x.device, dtype=torch.float32) + sigma_offset = -log_alpha.exp() * sigma.float().view(-1, 1, 1) + gate = torch.sigmoid(content_logit + sigma_offset) + return x + (gate * lq).to(x.dtype) + + +class ResBlock(nn.Module): + """Pre-activation ResNet block: GN -> SiLU -> Conv -> GN -> SiLU -> Conv + skip.""" + + def __init__(self, channels: int, num_groups: int = 4, dtype=None, device=None, operations=None): + super().__init__() + self.block = nn.Sequential( + operations.GroupNorm(num_groups, channels, dtype=dtype, device=device), + nn.SiLU(), + operations.Conv2d(channels, channels, kernel_size=3, padding=1, dtype=dtype, device=device), + operations.GroupNorm(num_groups, channels, dtype=dtype, device=device), + nn.SiLU(), + operations.Conv2d(channels, channels, kernel_size=3, padding=1, dtype=dtype, device=device), + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x + self.block(x) + + +class LQProjection2D(nn.Module): + """LQ latent -> per-block patch-aligned features for controlnet-style injection.""" + + def __init__( + self, + latent_channels: int, + hidden_dim: int = 512, + out_dim: int = 1536, + patch_size: int = 16, + sr_scale: int = 4, + latent_spatial_down_factor: int = 8, + num_res_blocks: int = 4, + num_outputs: int = 7, + interval: int = 2, + dtype=None, device=None, operations=None, + ): + super().__init__() + self.latent_channels = latent_channels + self.hidden_dim = hidden_dim + self.out_dim = out_dim + self.patch_size = patch_size + self.sr_scale = sr_scale + self.latent_spatial_down_factor = latent_spatial_down_factor + self.num_outputs = num_outputs + self.interval = interval + + z_to_patch_ratio = (sr_scale * latent_spatial_down_factor) / patch_size + self.z_to_patch_ratio = z_to_patch_ratio + if z_to_patch_ratio >= 1: + self.latent_fold_factor = 0 + latent_proj_in_ch = latent_channels + else: + fold_factor = int(1 / z_to_patch_ratio) + assert fold_factor * z_to_patch_ratio == 1.0 + 
self.latent_fold_factor = fold_factor + latent_proj_in_ch = latent_channels * fold_factor * fold_factor + + layers = [ + operations.Conv2d(latent_proj_in_ch, hidden_dim, kernel_size=3, padding=1, dtype=dtype, device=device), + nn.SiLU(), + operations.Conv2d(hidden_dim, hidden_dim, kernel_size=3, padding=1, dtype=dtype, device=device), + ] + for _ in range(num_res_blocks): + layers.append(ResBlock(hidden_dim, dtype=dtype, device=device, operations=operations)) + self.latent_proj = nn.Sequential(*layers) + + self.output_heads = nn.ModuleList( + [operations.Linear(hidden_dim, out_dim, dtype=dtype, device=device) for _ in range(num_outputs)] + ) + self.gate_modules = nn.ModuleList( + [SigmaAwareGatePerTokenPerDim(out_dim, dtype=dtype, device=device, operations=operations) + for _ in range(num_outputs)] + ) + + def is_gate_active(self, block_idx: int) -> bool: + return block_idx % self.interval == 0 + + def output_index(self, block_idx: int) -> int: + return block_idx // self.interval + + def gate(self, x: torch.Tensor, lq_feature: torch.Tensor, sigma: torch.Tensor, out_idx: int) -> torch.Tensor: + return self.gate_modules[out_idx](x, lq_feature, sigma) + + def _align_latent_to_patch_grid(self, lq_latent: torch.Tensor, pH: int, pW: int) -> torch.Tensor: + B, z_dim = lq_latent.shape[:2] + if self.z_to_patch_ratio >= 1: + if lq_latent.shape[2] != pH or lq_latent.shape[3] != pW: + z_aligned = F.interpolate(lq_latent, size=(pH, pW), mode="nearest") + else: + z_aligned = lq_latent + else: + f = self.latent_fold_factor + zH_expected, zW_expected = pH * f, pW * f + if lq_latent.shape[2] != zH_expected or lq_latent.shape[3] != zW_expected: + lq_latent = F.interpolate(lq_latent, size=(zH_expected, zW_expected), mode="nearest") + z_aligned = lq_latent.reshape(B, z_dim, pH, f, pW, f).permute(0, 1, 3, 5, 2, 4) + z_aligned = z_aligned.reshape(B, z_dim * f * f, pH, pW) + return self.latent_proj(z_aligned) + + def forward(self, lq_latent: torch.Tensor, target_pH: int, target_pW: int) 
-> List[torch.Tensor]: + feat = self._align_latent_to_patch_grid(lq_latent, target_pH, target_pW) + B, C, H, W = feat.shape + tokens = feat.permute(0, 2, 3, 1).contiguous().view(B, H * W, C) + return [head(tokens) for head in self.output_heads] + + +class PidNet(PixDiT_T2I): + """PixDiT_T2I + LQ injection (one sigma-gated feature inserted before each patch block).""" + + def __init__( + self, + lq_latent_channels: int = 16, + lq_hidden_dim: int = 512, + lq_num_res_blocks: int = 4, + lq_interval: int = 2, + sr_scale: int = 4, + latent_spatial_down_factor: int = 8, + rope_ref_h: int = 1024, # NTK ref resolution in PIXEL units: 1024px / patch=16 -> grid_ref=64. + rope_ref_w: int = 1024, + image_model=None, + dtype=None, device=None, operations=None, + **pixdit_kwargs, + ): + super().__init__(dtype=dtype, device=device, operations=operations, **pixdit_kwargs) + + self.rope_ref_grid_h = rope_ref_h // self.patch_size + self.rope_ref_grid_w = rope_ref_w // self.patch_size + + # Parent's PiTBlocks were built with plain RoPE — swap in NTK-aware. 
+ def _pit_rope_fn(head_dim, h, w, device=None, dtype=torch.float32, **rope_opts): + return precompute_freqs_cis_2d(head_dim, h, w, ref_grid_h=self.rope_ref_grid_h, ref_grid_w=self.rope_ref_grid_w, device=device, dtype=dtype, **rope_opts) + for blk in self.pixel_blocks: + blk._rope_fn = _pit_rope_fn + + num_lq_outputs = (self.patch_depth + lq_interval - 1) // lq_interval + self.lq_proj = LQProjection2D( + latent_channels=lq_latent_channels, + hidden_dim=lq_hidden_dim, + out_dim=self.hidden_size, + patch_size=self.patch_size, + sr_scale=sr_scale, + latent_spatial_down_factor=latent_spatial_down_factor, + num_res_blocks=lq_num_res_blocks, + num_outputs=num_lq_outputs, + interval=lq_interval, + dtype=dtype, + device=device, + operations=operations, + ) + + def _fetch_patch_pos(self, height, width, device, dtype, **rope_opts): + return precompute_freqs_cis_2d( + self.hidden_size // self.num_groups, + height, width, + ref_grid_h=self.rope_ref_grid_h, ref_grid_w=self.rope_ref_grid_w, + device=device, dtype=dtype, **rope_opts, + ) + + def _pre_patch_block(self, s, i, pid_lq_features, pid_degrade_sigma, **kwargs): + if not self.lq_proj.is_gate_active(i): + return s + out_idx = self.lq_proj.output_index(i) + if out_idx >= len(pid_lq_features): + return s + return self.lq_proj.gate(s, pid_lq_features[out_idx], pid_degrade_sigma, out_idx) + + def _forward(self, x, timesteps, context=None, attention_mask=None, transformer_options={}, lq_latent=None, degrade_sigma=None, **kwargs): + if lq_latent is None: + raise ValueError("PidNet requires lq_latent — attach via PiDConditioning") + expected_c = self.lq_proj.latent_channels + if lq_latent.shape[1] != expected_c: + raise ValueError( + f"Input latent has {lq_latent.shape[1]} channels, this model variant expects {expected_c}. " + f"Flux1/SD3 = 16 channels, Flux2 = 128 channels." + ) + B = x.shape[0] + # Match the backbone's pad_to_patch_size (round up) so the LQ grid lines up with the patch stream. 
+ Hs = -(-x.shape[2] // self.patch_size) + Ws = -(-x.shape[3] // self.patch_size) + + degrade_sigma = degrade_sigma.to(device=x.device, dtype=torch.float32).reshape(-1) + if degrade_sigma.numel() == 1 and B > 1: + degrade_sigma = degrade_sigma.expand(B).contiguous() + + lq_features = self.lq_proj(lq_latent=lq_latent.to(x), target_pH=Hs, target_pW=Ws) + + return super()._forward( + x, timesteps, + context=context, attention_mask=attention_mask, + transformer_options=transformer_options, + pid_lq_features=lq_features, + pid_degrade_sigma=degrade_sigma, + **kwargs, + ) diff --git a/comfy/lora.py b/comfy/lora.py index f11e26ec9..4e0ea29e0 100644 --- a/comfy/lora.py +++ b/comfy/lora.py @@ -16,7 +16,6 @@ along with this program. If not, see . """ -from __future__ import annotations import comfy.memory_management import comfy.utils import comfy.model_management @@ -484,16 +483,23 @@ def calculate_weight(patches, weight, key, intermediate_dtype=torch.float32, ori return weight -def prefetch_prepared_value(value, allocate_buffer, stream): +def prefetch_prepared_value(value, counter, destination, stream, copy): if isinstance(value, torch.Tensor): - dest = allocate_buffer(comfy.memory_management.vram_aligned_size(value)) - comfy.model_management.cast_to_gathered([value], dest, non_blocking=True, stream=stream) + size = comfy.memory_management.vram_aligned_size(value) + offset = counter[0] + counter[0] += size + if destination is None: + return value + + dest = destination[offset:offset + size] + if copy: + comfy.model_management.cast_to_gathered([value], dest, non_blocking=True, stream=stream) return comfy.memory_management.interpret_gathered_like([value], dest)[0] elif isinstance(value, weight_adapter.WeightAdapterBase): - return type(value)(value.loaded_keys, prefetch_prepared_value(value.weights, allocate_buffer, stream)) + return type(value)(value.loaded_keys, prefetch_prepared_value(value.weights, counter, destination, stream, copy)) elif isinstance(value, tuple): - 
return tuple(prefetch_prepared_value(item, allocate_buffer, stream) for item in value) + return tuple(prefetch_prepared_value(item, counter, destination, stream, copy) for item in value) elif isinstance(value, list): - return [prefetch_prepared_value(item, allocate_buffer, stream) for item in value] + return [prefetch_prepared_value(item, counter, destination, stream, copy) for item in value] return value diff --git a/comfy/memory_management.py b/comfy/memory_management.py index 48e3c11da..962addb27 100644 --- a/comfy/memory_management.py +++ b/comfy/memory_management.py @@ -1,6 +1,5 @@ import math import ctypes -import threading import dataclasses import torch from typing import NamedTuple @@ -10,12 +9,12 @@ from comfy.quant_ops import QuantizedTensor class TensorFileSlice(NamedTuple): file_ref: object - thread_id: int + lock: object offset: int size: int -def read_tensor_file_slice_into(tensor, destination): +def read_tensor_file_slice_into(tensor, destination, stream=None, destination2=None): if isinstance(tensor, QuantizedTensor): if not isinstance(destination, QuantizedTensor): @@ -23,12 +22,17 @@ def read_tensor_file_slice_into(tensor, destination): if tensor._layout_cls != destination._layout_cls: return False - if not read_tensor_file_slice_into(tensor._qdata, destination._qdata): + if not read_tensor_file_slice_into(tensor._qdata, destination._qdata, stream=stream, + destination2=(destination2._qdata if destination2 is not None else None)): return False dst_orig_dtype = destination._params.orig_dtype destination._params.copy_from(tensor._params, non_blocking=False) destination._params = dataclasses.replace(destination._params, orig_dtype=dst_orig_dtype) + if destination2 is not None: + dst_orig_dtype = destination2._params.orig_dtype + destination2._params.copy_from(destination._params, non_blocking=True) + destination2._params = dataclasses.replace(destination2._params, orig_dtype=dst_orig_dtype) return True info = getattr(tensor.untyped_storage(), 
"_comfy_tensor_file_slice", None) @@ -38,7 +42,6 @@ def read_tensor_file_slice_into(tensor, destination): file_obj = info.file_ref if (destination.device.type != "cpu" or file_obj is None - or threading.get_ident() != info.thread_id or destination.numel() * destination.element_size() < info.size or tensor.numel() * tensor.element_size() != info.size or tensor.storage_offset() != 0 @@ -48,20 +51,33 @@ def read_tensor_file_slice_into(tensor, destination): if info.size == 0: return True + hostbuf = getattr(destination.untyped_storage(), "_comfy_hostbuf", None) + if hostbuf is not None: + stream_ptr = getattr(stream, "cuda_stream", 0) if stream is not None else 0 + device_ptr = destination2.data_ptr() if destination2 is not None else 0 + with info.lock: + hostbuf.read_file_slice(file_obj, info.offset, info.size, + offset=destination.data_ptr() - hostbuf.get_raw_address(), + stream=stream_ptr, + device_ptr=device_ptr, + device=None if destination2 is None else destination2.device.index) + return True + buf_type = ctypes.c_ubyte * info.size view = memoryview(buf_type.from_address(destination.data_ptr())) try: - file_obj.seek(info.offset) - done = 0 - while done < info.size: - try: - n = file_obj.readinto(view[done:]) - except OSError: - return False - if n <= 0: - return False - done += n + with info.lock: + file_obj.seek(info.offset) + done = 0 + while done < info.size: + try: + n = file_obj.readinto(view[done:]) + except OSError: + return False + if n <= 0: + return False + done += n return True finally: view.release() @@ -151,7 +167,7 @@ def set_ram_cache_release_state(callback, headroom): extra_ram_release_callback = callback RAM_CACHE_HEADROOM = max(0, int(headroom)) -def extra_ram_release(target): +def extra_ram_release(target, free_active=False): if extra_ram_release_callback is None: return 0 - return extra_ram_release_callback(target) + return extra_ram_release_callback(target, free_active=free_active) diff --git a/comfy/model_base.py b/comfy/model_base.py index 
c22705655..205178911 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -35,6 +35,7 @@ import comfy.ldm.hydit.models import comfy.ldm.audio.dit import comfy.ldm.audio.embedders import comfy.ldm.flux.model +import comfy.ldm.lens.model import comfy.ldm.lightricks.model import comfy.ldm.hunyuan_video.model import comfy.ldm.cosmos.model @@ -48,6 +49,8 @@ import comfy.ldm.hunyuan3d.model import comfy.ldm.hidream.model import comfy.ldm.chroma.model import comfy.ldm.chroma_radiance.model +import comfy.ldm.pixeldit.model +import comfy.ldm.pixeldit.pid import comfy.ldm.ace.model import comfy.ldm.omnigen.omnigen2 import comfy.ldm.qwen_image.model @@ -813,6 +816,85 @@ class StableAudio1(BaseModel): sd["{}{}".format(k, l)] = s[l] return sd +class StableAudio3(BaseModel): + def __init__(self, model_config, seconds_total_embedder_weights, padding_embedding=None, model_type=ModelType.FLOW, device=None): + super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.audio.dit.AudioDiffusionTransformer) + self.seconds_total_embedder = comfy.ldm.audio.embedders.NumberConditioner(768, min_val=0, max_val=384, fourier_features_type=model_config.unet_config["timestep_features_type"]) + self.seconds_total_embedder.load_state_dict(seconds_total_embedder_weights) + if padding_embedding is not None: + self.padding_embedding = torch.nn.Parameter(padding_embedding, requires_grad=False) + else: + self.padding_embedding = None + + def concat_cond(self, **kwargs): + noise = kwargs.get("noise", None) + image = kwargs.get("concat_latent_image", None) + + if image is None: + shape_image = list(noise.shape) + image = torch.zeros(shape_image, dtype=noise.dtype, layout=noise.layout, device=noise.device) + else: + image = self.process_latent_in(image) + # TODO: scale if not match + image = utils.resize_to_batch_size(image, noise.shape[0]) + + mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None)) + if mask is None: + mask = torch.zeros_like(noise)[:, :1] + else: 
+ if mask.shape[1] != 1: + mask = torch.mean(mask, dim=1, keepdim=True) + mask = 1.0 - mask + # TODO: scale if not match + mask = utils.resize_to_batch_size(mask, noise.shape[0]) + + return torch.cat((mask, image), dim=1) + + def extra_conds(self, **kwargs): + out = {} + + concat_cond = self.concat_cond(**kwargs) + if concat_cond is not None: + out['local_add_cond'] = comfy.conds.CONDNoiseShape(concat_cond) + + noise = kwargs.get("noise", None) + device = kwargs["device"] + + seconds_total = kwargs.get("seconds_total", int(noise.shape[-1] / 10.7666)) + seconds_total_embed = self.seconds_total_embedder([seconds_total])[0].to(device) + + global_embed = seconds_total_embed.reshape((1, -1)) + out['global_embed'] = comfy.conds.CONDRegular(global_embed) + + cross_attn = kwargs.get("cross_attn", None) + if cross_attn is not None: + cross_attn = cross_attn.to(device) + if self.padding_embedding is not None: + pe = self.padding_embedding.to(device=device, dtype=cross_attn.dtype) + max_text_tokens = self.model_config.unet_config.get("max_text_tokens", 256) + n_text = cross_attn.shape[1] + if n_text < max_text_tokens: + pad = pe.view(1, 1, -1).expand(cross_attn.shape[0], max_text_tokens - n_text, -1) + cross_attn = torch.cat([cross_attn, pad], dim=1) + cross_attn = torch.cat([cross_attn, seconds_total_embed.repeat((cross_attn.shape[0], 1, 1))], dim=1) + out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) + + return out + + def state_dict_for_saving(self, unet_state_dict, clip_state_dict=None, vae_state_dict=None, clip_vision_state_dict=None): + sd = super().state_dict_for_saving(unet_state_dict, clip_state_dict=clip_state_dict, vae_state_dict=vae_state_dict, clip_vision_state_dict=clip_vision_state_dict) + + d = {"conditioner.conditioners.seconds_total.": self.seconds_total_embedder.state_dict()} + + for k in d: + s = d[k] + for l in s: + sd["{}{}".format(k, l)] = s[l] + + if self.padding_embedding is not None: + sd["conditioner.conditioners.prompt.padding_embedding"] = 
self.padding_embedding.data + return sd + class HunyuanDiT(BaseModel): def __init__(self, model_config, model_type=ModelType.V_PREDICTION, device=None): @@ -979,6 +1061,27 @@ class Flux2(Flux): out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) return out + +class Lens(BaseModel): + def __init__(self, model_config, model_type=ModelType.FLUX, device=None): + super().__init__( + model_config, model_type, device=device, + unet_model=comfy.ldm.lens.model.LensTransformer2DModel, + ) + + def encode_adm(self, **kwargs): + return None # Lens has no pooled/ADM conditioning. + + def extra_conds(self, **kwargs): + out = super().extra_conds(**kwargs) + cross_attn = kwargs.get("cross_attn", None) + if cross_attn is not None: + out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) + attention_mask = kwargs.get("attention_mask", None) + if attention_mask is not None: + out['attention_mask'] = comfy.conds.CONDRegular(attention_mask) + return out + class GenmoMochi(BaseModel): def __init__(self, model_config, model_type=ModelType.FLOW, device=None): super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.genmo.joint_model.asymm_models_joint.AsymmDiTJoint) @@ -1296,6 +1399,53 @@ class ZImagePixelSpace(Lumina2): BaseModel.__init__(self, model_config, model_type, device=device, unet_model=comfy.ldm.lumina.model.NextDiTPixelSpace) self.memory_usage_factor_conds = ("ref_latents",) + +class PixelDiTT2I(BaseModel): + def __init__(self, model_config, model_type=ModelType.FLOW, device=None): + super().__init__(model_config, model_type, device=device, + unet_model=comfy.ldm.pixeldit.model.PixDiT_T2I) + + def extra_conds(self, **kwargs): + out = super().extra_conds(**kwargs) + attention_mask = kwargs.get("attention_mask", None) + if attention_mask is not None: + out["attention_mask"] = comfy.conds.CONDRegular(attention_mask) + return out + + +class PiD(PixelDiTT2I): + def __init__(self, model_config, model_type=ModelType.FLOW, device=None): + 
BaseModel.__init__(self, model_config, model_type, device=device, + unet_model=comfy.ldm.pixeldit.pid.PidNet) + + def extra_conds(self, **kwargs): + out = super().extra_conds(**kwargs) + lq_latent = kwargs.get("lq_latent", None) + if lq_latent is not None: + out["lq_latent"] = comfy.conds.CONDRegular(lq_latent) + degrade_sigma = kwargs.get("degrade_sigma", None) + if degrade_sigma is not None: + out["degrade_sigma"] = comfy.conds.CONDRegular(degrade_sigma) + return out + + def resize_cond_for_context_window(self, cond_key, cond_value, window, x_in, device, retain_index_list=[]): + if cond_key == "lq_latent" and hasattr(cond_value, "cond") and isinstance(cond_value.cond, torch.Tensor): + lq = cond_value.cond + dim = window.dim + if dim >= lq.ndim: + return None + lq_proj = self.diffusion_model.lq_proj + ratio = lq_proj.sr_scale * lq_proj.latent_spatial_down_factor + # Map x window indices -> lq indices (deduplicated, sorted, in-bounds). + lq_size = lq.size(dim) + lq_indices = sorted({i // ratio for i in window.index_list if 0 <= i // ratio < lq_size}) + if not lq_indices: + return None + idx = tuple([slice(None)] * dim + [lq_indices]) + return cond_value._copy_with(lq[idx].to(device)) + return super().resize_cond_for_context_window(cond_key, cond_value, window, x_in, device, retain_index_list=retain_index_list) + + class WAN21(BaseModel): def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=False, device=None): super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.WanModel) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index bc0b933bc..f0db7d388 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -116,6 +116,45 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): if '{}transformer.rotary_pos_emb.inv_freq'.format(key_prefix) in state_dict_keys: #stable audio dit unet_config = {} unet_config["audio_model"] = "dit1.0" + unet_config["global_cond_dim"] = 
state_dict['{}to_global_embed.0.weight'.format(key_prefix)].shape[1] + cond_embed = state_dict['{}to_cond_embed.0.weight'.format(key_prefix)] + unet_config["project_cond_tokens"] = cond_embed.shape[0] != cond_embed.shape[1] + unet_config["embed_dim"] = state_dict['{}to_timestep_embed.0.weight'.format(key_prefix)].shape[0] + mem_tokens = state_dict.get('{}transformer.memory_tokens'.format(key_prefix), None) + to_qkv = state_dict.get('{}transformer.layers.0.self_attn.to_qkv.weight'.format(key_prefix), None) + differential = False + if to_qkv is not None: + if to_qkv.shape[0] == to_qkv.shape[1] * 5: + differential = True + if mem_tokens is not None: + unet_config["num_memory_tokens"] = mem_tokens.shape[0] + if '{}transformer.layers.0.self_attn.q_norm.weight'.format(key_prefix) in state_dict: + unet_config["attn_kwargs"] = {"qk_norm": "ln", "feat_scale": True} + rms_norm = state_dict.get('{}transformer.layers.0.self_attn.q_norm.gamma'.format(key_prefix), None) + if rms_norm is not None: + unet_config["attn_kwargs"] = {"qk_norm": "rms", "differential": differential} + unet_config["norm_type"] = "rms_norm" + unet_config["num_heads"] = unet_config["embed_dim"] // rms_norm.shape[0] + + if '{}timestep_features.weight'.format(key_prefix) in state_dict: + unet_config["timestep_features_type"] = "learned" + else: + unet_config["timestep_features_type"] = "expo" + + io_channels = state_dict['{}postprocess_conv.weight'.format(key_prefix)].shape[0] + unet_config["io_channels"] = io_channels + unet_config["input_concat_dim"] = state_dict['{}transformer.project_in.weight'.format(key_prefix)].shape[1] - io_channels + + local_add_cond = state_dict.get('{}transformer.layers.0.to_local_embed.0.weight'.format(key_prefix), None) + if local_add_cond is not None: + unet_config["local_add_cond_dim"] = local_add_cond.shape[1] + + global_cond_embed = state_dict.get('{}transformer.global_cond_embedder.0.weight'.format(key_prefix), None) + if global_cond_embed is not None: + 
unet_config["global_cond_shared_embed"] = True + unet_config["global_cond_type"] = "adaLN" + + unet_config["depth"] = count_blocks(state_dict_keys, '{}transformer.layers.'.format(key_prefix) + '{}.') return unet_config if '{}double_layers.0.attn.w1q.weight'.format(key_prefix) in state_dict_keys: #aura flow dit @@ -424,6 +463,23 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["extra_per_block_abs_pos_emb_type"] = "learnable" return dit_config + # PiD (Pixel Diffusion Decoder). Must check BEFORE plain PixelDiT_T2I. + _lq_w_key = '{}lq_proj.latent_proj.0.weight'.format(key_prefix) + if _lq_w_key in state_dict_keys: + in_ch = int(state_dict[_lq_w_key].shape[1]) + _gate_prefix = '{}lq_proj.gate_modules.'.format(key_prefix) + num_gates = len({k[len(_gate_prefix):].split('.')[0] + for k in state_dict_keys if k.startswith(_gate_prefix)}) + dit_config = {"image_model": "pid", + "lq_latent_channels": in_ch, + "latent_spatial_down_factor": 16 if in_ch >= 64 else 8} + if num_gates > 0: + dit_config["lq_interval"] = (14 + num_gates - 1) // num_gates + return dit_config + + if '{}core.pixel_embedder.proj.weight'.format(key_prefix) in state_dict_keys: # PixelDiT T2I + return {"image_model": "pixeldit_t2i"} + if '{}cap_embedder.1.weight'.format(key_prefix) in state_dict_keys and '{}noise_refiner.0.attention.k_norm.weight'.format(key_prefix) in state_dict_keys: # Lumina 2 dit_config = {} dit_config["image_model"] = "lumina2" @@ -716,6 +772,30 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["timestep_scale"] = 1000.0 return dit_config + if '{}transformer_blocks.0.attn.norm_added_q.weight'.format(key_prefix) in state_dict_keys \ + and '{}transformer_blocks.0.img_mlp.w1.weight'.format(key_prefix) in state_dict_keys: # Lens + img_in_w = state_dict['{}img_in.weight'.format(key_prefix)] + proj_out_w = state_dict['{}proj_out.weight'.format(key_prefix)] + multi_layer = '{}txt_norm.0.weight'.format(key_prefix) in state_dict_keys + if 
multi_layer: + enc_hidden_dim = state_dict['{}txt_norm.0.weight'.format(key_prefix)].shape[0] + # Indices are TE-side; the DiT just consumes L layers in order. + selected_layer_index = tuple(range(count_blocks(state_dict_keys, '{}txt_norm.'.format(key_prefix) + '{}.'))) + else: + enc_hidden_dim = state_dict['{}txt_norm.weight'.format(key_prefix)].shape[0] + selected_layer_index = (0,) + + return { + "image_model": "lens", + "in_channels": img_in_w.shape[1], + "out_channels": proj_out_w.shape[0] // 4, # patch_size ** 2 (=2² default) + "num_layers": count_blocks(state_dict_keys, '{}transformer_blocks.'.format(key_prefix) + '{}.'), + "num_attention_heads": img_in_w.shape[0] // 64, # // attention_head_dim default + "enc_hidden_dim": enc_hidden_dim, + "multi_layer_encoder_feature": multi_layer, + "selected_layer_index": selected_layer_index, + } + if '{}txt_norm.weight'.format(key_prefix) in state_dict_keys: # Qwen Image dit_config = {} dit_config["image_model"] = "qwen_image" diff --git a/comfy/model_management.py b/comfy/model_management.py index 21738a4c7..b01c4d7fa 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -15,6 +15,7 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . 
""" +from __future__ import annotations import psutil import logging @@ -27,12 +28,18 @@ import platform import weakref import gc import os -from contextlib import nullcontext +from contextlib import contextmanager, nullcontext import comfy.memory_management import comfy.utils import comfy.quant_ops +import comfy_aimdo.host_buffer import comfy_aimdo.vram_buffer +from typing import TYPE_CHECKING +if TYPE_CHECKING: + from comfy.model_patcher import ModelPatcher + + class VRAMState(Enum): DISABLED = 0 #No vram present: no need to move models to vram NO_VRAM = 1 #Very low vram: enable all the options to save vram @@ -203,6 +210,107 @@ def get_torch_device(): else: return torch.device(torch.cuda.current_device()) +def get_all_torch_devices(exclude_current=False): + global cpu_state + devices = [] + if cpu_state == CPUState.GPU: + # NVIDIA + AMD/ROCm both expose their GPUs through torch.cuda.*; + # without the AMD arm, single-GPU ROCm users get an empty list + # which silently turns unload_all_models() into a no-op. + if is_nvidia() or is_amd(): + for i in range(torch.cuda.device_count()): + devices.append(torch.device("cuda", i)) + elif is_intel_xpu(): + for i in range(torch.xpu.device_count()): + devices.append(torch.device("xpu", i)) + elif is_ascend_npu(): + for i in range(torch.npu.device_count()): + devices.append(torch.device("npu", i)) + elif is_mlu(): + for i in range(torch.mlu.device_count()): + devices.append(torch.device("mlu", i)) + else: + # Fallback for unhandled GPU backends (e.g. DirectML): at least + # report the current device so callers like unload_all_models() + # do not silently no-op. + devices.append(get_torch_device()) + else: + devices.append(get_torch_device()) + if exclude_current: + current = get_torch_device() + if current in devices: + devices.remove(current) + return devices + +def get_gpu_device_options(): + """Return list of device option strings for node widgets. + + Always includes "default" and "cpu". 
When multiple GPUs are present, + adds "gpu:0", "gpu:1", etc. (vendor-agnostic labels). + """ + options = ["default", "cpu"] + devices = get_all_torch_devices() + if len(devices) > 1: + for i in range(len(devices)): + options.append(f"gpu:{i}") + return options + +def get_gpu_device_options_no_cpu(): + """Variant of get_gpu_device_options that omits "cpu". + + Intended for components like the VAE selector where running on CPU + is impractical and should not be offered as a choice. + """ + return [o for o in get_gpu_device_options() if o != "cpu"] + +def resolve_gpu_device_option(option: str): + """Resolve a device option string to a torch.device. + + Returns None for "default" (let the caller use its normal default). + Returns torch.device("cpu") for "cpu". + For "gpu:N", returns the Nth torch device. Returns None if the + index is out of range, the option string is malformed, or + unrecognized (callers are expected to log their own context-rich + message before falling back to the default device). + """ + if option is None or option == "default": + return None + if option == "cpu": + return torch.device("cpu") + if option.startswith("gpu:"): + try: + idx = int(option[4:]) + except ValueError: + return None + devices = get_all_torch_devices() + if 0 <= idx < len(devices): + return devices[idx] + return None + +@contextmanager +def cuda_device_context(device): + """Context manager that sets torch.cuda.current_device to match *device*. + + Used when running operations on a non-default CUDA device so that custom + CUDA kernels (e.g. comfy_kitchen fp8 quantization) pick up the correct + device index. The previous device is restored on exit. + + No-op when *device* is not CUDA, has no explicit index, or already matches + the current device. 
+ """ + prev = None + if device.type == "cuda" and device.index is not None: + prev = torch.cuda.current_device() + if prev != device.index: + torch.cuda.set_device(device) + else: + prev = None + try: + yield + finally: + if prev is not None: + torch.cuda.set_device(prev) + def get_total_memory(dev=None, torch_total_too=False): global directml_enabled if dev is None: @@ -491,9 +599,21 @@ try: logging.info("Device: {}".format(get_torch_device_name(get_torch_device()))) except: logging.warning("Could not pick default device.") +try: + for device in get_all_torch_devices(exclude_current=True): + logging.info("Device: {}".format(get_torch_device_name(device))) +except: + pass +current_loaded_models: list[LoadedModel] = [] -current_loaded_models = [] +DIRTY_MMAPS = set() + +PIN_PRESSURE_HYSTERESIS = 256 * 1024 * 1024 + +#Freeing registerables on pressure does imply a GPU sync, so go big on +#the hysteresis so each expensive sync gives us back a good chunk. +REGISTERABLE_PIN_HYSTERESIS = 2048 * 1024 * 1024 def module_size(module): module_mem = 0 @@ -503,30 +623,49 @@ def module_size(module): module_mem += t.nbytes return module_mem -def module_mmap_residency(module, free=False): - mmap_touched_mem = 0 - module_mem = 0 - bounced_mmaps = set() - sd = module.state_dict() - for k in sd: - t = sd[k] - module_mem += t.nbytes - storage = t._qdata.untyped_storage() if isinstance(t, comfy.quant_ops.QuantizedTensor) else t.untyped_storage() - if not getattr(storage, "_comfy_tensor_mmap_touched", False): - continue - mmap_touched_mem += t.nbytes - if not free: - continue - storage._comfy_tensor_mmap_touched = False - mmap_obj = storage._comfy_tensor_mmap_refs[0] - if mmap_obj in bounced_mmaps: - continue - mmap_obj.bounce() - bounced_mmaps.add(mmap_obj) - return mmap_touched_mem, module_mem +def mark_mmap_dirty(storage): + mmap_refs = getattr(storage, "_comfy_tensor_mmap_refs", None) + if mmap_refs is not None: + DIRTY_MMAPS.add(mmap_refs[0]) + +def free_pins(size, 
evict_active=False): + freed_total = 0 + for loaded_model in reversed(current_loaded_models): + if size <= 0: + return freed_total + model = loaded_model.model + if model is not None and model.is_dynamic() and (evict_active or not model.model.dynamic_pins[model.load_device]["active"]): + freed = model.partially_unload_ram(size) + freed_total += freed + size -= freed + return freed_total + +def ensure_pin_budget(size, evict_active=False): + shortfall = size + comfy.memory_management.RAM_CACHE_HEADROOM / 2 - psutil.virtual_memory().available + if shortfall <= 0: + return True + + to_free = shortfall + PIN_PRESSURE_HYSTERESIS + return free_pins(to_free, evict_active=evict_active) >= shortfall + +def ensure_pin_registerable(size, evict_active=False): + shortfall = TOTAL_PINNED_MEMORY + size - MAX_PINNED_MEMORY + if MAX_PINNED_MEMORY <= 0: + return False + if shortfall <= 0: + return True + + shortfall += REGISTERABLE_PIN_HYSTERESIS + for loaded_model in reversed(current_loaded_models): + model = loaded_model.model + if model is not None and model.is_dynamic() and (evict_active or not model.model.dynamic_pins[model.load_device]["active"]): + shortfall -= model.unregister_inactive_pins(shortfall) + if shortfall <= 0: + return True + return shortfall <= REGISTERABLE_PIN_HYSTERESIS class LoadedModel: - def __init__(self, model): + def __init__(self, model: ModelPatcher): self._set_model(model) self.device = model.load_device self.real_model = None @@ -534,7 +673,7 @@ class LoadedModel: self.model_finalizer = None self._patcher_finalizer = None - def _set_model(self, model): + def _set_model(self, model: ModelPatcher): self._model = weakref.ref(model) if model.parent is not None: self._parent_model = weakref.ref(model.parent) @@ -545,6 +684,7 @@ class LoadedModel: model = self._parent_model() if model is not None: self._set_model(model) + self.device = model.load_device @property def model(self): @@ -553,9 +693,6 @@ class LoadedModel: def model_memory(self): return 
self.model.model_size() - def model_mmap_residency(self, free=False): - return self.model.model_mmap_residency(free=free) - def model_loaded_memory(self): return self.model.loaded_size() @@ -635,15 +772,9 @@ WINDOWS = any(platform.win32_ver()) EXTRA_RESERVED_VRAM = 400 * 1024 * 1024 if WINDOWS: - import comfy.windows EXTRA_RESERVED_VRAM = 600 * 1024 * 1024 #Windows is higher because of the shared vram issue if total_vram > (15 * 1024): # more extra reserved vram on 16GB+ cards EXTRA_RESERVED_VRAM += 100 * 1024 * 1024 - def get_free_ram(): - return comfy.windows.get_free_ram() -else: - def get_free_ram(): - return psutil.virtual_memory().available if args.reserve_vram is not None: EXTRA_RESERVED_VRAM = args.reserve_vram * 1024 * 1024 * 1024 @@ -657,7 +788,6 @@ def minimum_inference_memory(): def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins_required=0, ram_required=0): cleanup_models_gc() - comfy.memory_management.extra_ram_release(max(pins_required, ram_required)) unloaded_model = [] can_unload = [] unloaded_models = [] @@ -673,11 +803,9 @@ def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins for x in can_unload_sorted: i = x[-1] memory_to_free = 1e32 - pins_to_free = 1e32 - if not DISABLE_SMART_MEMORY or device is None: + if current_loaded_models[i].model.is_dynamic() and (not DISABLE_SMART_MEMORY or device is None): memory_to_free = 0 if device is None else memory_required - get_free_memory(device) - pins_to_free = pins_required - get_free_ram() - if current_loaded_models[i].model.is_dynamic() and for_dynamic: + if for_dynamic: #don't actually unload dynamic models for the sake of other dynamic models #as that works on-demand. 
memory_required -= current_loaded_models[i].model.loaded_size() @@ -685,18 +813,6 @@ def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins if memory_to_free > 0 and current_loaded_models[i].model_unload(memory_to_free): logging.debug(f"Unloading {current_loaded_models[i].model.model.__class__.__name__}") unloaded_model.append(i) - if pins_to_free > 0: - logging.debug(f"PIN Unloading {current_loaded_models[i].model.model.__class__.__name__}") - current_loaded_models[i].model.partially_unload_ram(pins_to_free) - - for x in can_unload_sorted: - i = x[-1] - ram_to_free = ram_required - psutil.virtual_memory().available - if ram_to_free <= 0 and i not in unloaded_model: - continue - resident_memory, _ = current_loaded_models[i].model_mmap_residency(free=True) - if resident_memory > 0: - logging.debug(f"RAM Unloading {current_loaded_models[i].model.model.__class__.__name__}") for i in sorted(unloaded_model, reverse=True): unloaded_models.append(current_loaded_models.pop(i)) @@ -762,29 +878,16 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu model_to_unload.model.detach(unpatch_all=False) model_to_unload.model_finalizer.detach() - total_memory_required = {} - total_pins_required = {} - total_ram_required = {} for loaded_model in models_to_load: device = loaded_model.device total_memory_required[device] = total_memory_required.get(device, 0) + loaded_model.model_memory_required(device) - resident_memory, model_memory = loaded_model.model_mmap_residency() - pinned_memory = loaded_model.model.pinned_memory_size() - #FIXME: This can over-free the pins as it budgets to pin the entire model. We should - #make this JIT to keep as much pinned as possible. 
- pins_required = model_memory - pinned_memory - ram_required = model_memory - resident_memory - total_pins_required[device] = total_pins_required.get(device, 0) + pins_required - total_ram_required[device] = total_ram_required.get(device, 0) + ram_required for device in total_memory_required: if device != torch.device("cpu"): free_memory(total_memory_required[device] * 1.1 + extra_mem, device, - for_dynamic=free_for_dynamic, - pins_required=total_pins_required[device], - ram_required=total_ram_required[device]) + for_dynamic=free_for_dynamic) for device in total_memory_required: if device != torch.device("cpu"): @@ -1180,6 +1283,7 @@ STREAM_CAST_BUFFERS = {} LARGEST_CASTED_WEIGHT = (None, 0) STREAM_AIMDO_CAST_BUFFERS = {} LARGEST_AIMDO_CASTED_WEIGHT = (None, 0) +STREAM_PIN_BUFFERS = {} DEFAULT_AIMDO_CAST_BUFFER_RESERVATION_SIZE = 16 * 1024 ** 3 @@ -1220,21 +1324,66 @@ def get_aimdo_cast_buffer(offload_stream, device): if cast_buffer is None: cast_buffer = comfy_aimdo.vram_buffer.VRAMBuffer(DEFAULT_AIMDO_CAST_BUFFER_RESERVATION_SIZE, device.index) STREAM_AIMDO_CAST_BUFFERS[offload_stream] = cast_buffer - return cast_buffer + +def get_pin_buffer(offload_stream): + pin_buffer = STREAM_PIN_BUFFERS.get(offload_stream, None) + if pin_buffer is None: + pin_buffer = comfy_aimdo.host_buffer.HostBuffer(0, 0, pinned_hostbuf_size(8 * 1024**3), mark_cold=False) + STREAM_PIN_BUFFERS[offload_stream] = pin_buffer + elif offload_stream is not None: + event = getattr(pin_buffer, "_comfy_event", None) + if event is not None: + event.synchronize() + delattr(pin_buffer, "_comfy_event") + return pin_buffer + +def resize_pin_buffer(pin_buffer, size): + global TOTAL_PINNED_MEMORY + old_size = pin_buffer.size + if size <= old_size: + return True + growth = size - old_size + comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM) + ensure_pin_budget(growth, evict_active=True) + ensure_pin_registerable(growth, evict_active=True) + try: + 
pin_buffer.extend(size=size, reallocate=True) + except RuntimeError: + return False + TOTAL_PINNED_MEMORY += pin_buffer.size - old_size + return True + def reset_cast_buffers(): + global TOTAL_PINNED_MEMORY global LARGEST_CASTED_WEIGHT global LARGEST_AIMDO_CASTED_WEIGHT LARGEST_CASTED_WEIGHT = (None, 0) LARGEST_AIMDO_CASTED_WEIGHT = (None, 0) - for offload_stream in set(STREAM_CAST_BUFFERS) | set(STREAM_AIMDO_CAST_BUFFERS): + for offload_stream in set(STREAM_CAST_BUFFERS) | set(STREAM_AIMDO_CAST_BUFFERS) | set(STREAM_PIN_BUFFERS): if offload_stream is not None: offload_stream.synchronize() synchronize() + for mmap_obj in DIRTY_MMAPS: + mmap_obj.bounce() + DIRTY_MMAPS.clear() + + for pin_buffer in STREAM_PIN_BUFFERS.values(): + TOTAL_PINNED_MEMORY -= pin_buffer.size + TOTAL_PINNED_MEMORY = max(0, TOTAL_PINNED_MEMORY) + + for loaded_model in current_loaded_models: + model = loaded_model.model + if model is not None and model.is_dynamic(): + model.model.dynamic_pins[model.load_device]["active"] = False + model.partially_unload_ram(1e30, subsets=[ "patches" ]) + model.model.dynamic_pins[model.load_device]["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024, pinned_hostbuf_size(model.model_size())), [], [-1], [0]) + STREAM_CAST_BUFFERS.clear() STREAM_AIMDO_CAST_BUFFERS.clear() + STREAM_PIN_BUFFERS.clear() soft_empty_cache() def get_offload_stream(device): @@ -1280,7 +1429,7 @@ def sync_stream(device, stream): current_stream(device).wait_stream(stream) -def cast_to_gathered(tensors, r, non_blocking=False, stream=None): +def cast_to_gathered(tensors, r, non_blocking=False, stream=None, r2=None): wf_context = nullcontext() if stream is not None: wf_context = stream @@ -1288,17 +1437,20 @@ def cast_to_gathered(tensors, r, non_blocking=False, stream=None): wf_context = wf_context.as_context(stream) dest_views = comfy.memory_management.interpret_gathered_like(tensors, r) + dest2_views = comfy.memory_management.interpret_gathered_like(tensors, r2) if r2 is not 
None else None with wf_context: for tensor in tensors: dest_view = dest_views.pop(0) + dest2_view = dest2_views.pop(0) if dest2_views is not None else None if tensor is None: continue - if comfy.memory_management.read_tensor_file_slice_into(tensor, dest_view): + if comfy.memory_management.read_tensor_file_slice_into(tensor, dest_view, stream=stream, destination2=dest2_view): continue storage = tensor._qdata.untyped_storage() if isinstance(tensor, comfy.quant_ops.QuantizedTensor) else tensor.untyped_storage() - if hasattr(storage, "_comfy_tensor_mmap_touched"): - storage._comfy_tensor_mmap_touched = True + mark_mmap_dirty(storage) dest_view.copy_(tensor, non_blocking=non_blocking) + if dest2_view is not None: + dest2_view.copy_(dest_view, non_blocking=non_blocking) def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, stream=None, r=None): @@ -1339,14 +1491,18 @@ TOTAL_PINNED_MEMORY = 0 MAX_PINNED_MEMORY = -1 if not args.disable_pinned_memory: if is_nvidia() or is_amd(): + ram = get_total_memory(torch.device("cpu")) if WINDOWS: - MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.40 # Windows limit is apparently 50% + MAX_PINNED_MEMORY = ram * 0.40 # Windows limit is apparently 50% else: - MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.90 + MAX_PINNED_MEMORY = ram * 0.90 logging.info("Enabled pinned memory {}".format(MAX_PINNED_MEMORY // (1024 * 1024))) PINNING_ALLOWED_TYPES = set(["Tensor", "Parameter", "QuantizedTensor"]) +def pinned_hostbuf_size(size): + return max(0, int(min(size, MAX_PINNED_MEMORY) * 2)) + def discard_cuda_async_error(): try: a = torch.tensor([1], dtype=torch.uint8, device=get_torch_device()) @@ -1378,8 +1534,8 @@ def pin_memory(tensor): return False size = tensor.nbytes - if (TOTAL_PINNED_MEMORY + size) > MAX_PINNED_MEMORY: - return False + comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM) + ensure_pin_registerable(size) ptr = tensor.data_ptr() if ptr == 0: @@ 
-1416,7 +1572,8 @@ def unpin_memory(tensor): return False if torch.cuda.cudart().cudaHostUnregister(ptr) == 0: - TOTAL_PINNED_MEMORY -= PINNED_MEMORY.pop(ptr) + size = PINNED_MEMORY.pop(ptr) + TOTAL_PINNED_MEMORY -= size return True else: logging.warning("Unpin error.") @@ -1803,7 +1960,34 @@ def soft_empty_cache(force=False): torch.cuda.ipc_collect() def unload_all_models(): - free_memory(1e30, get_torch_device()) + for device in get_all_torch_devices(): + free_memory(1e30, device) + +def unload_model_and_clones(model: ModelPatcher, unload_additional_models=True, all_devices=False): + 'Unload only model and its clones - primarily for multigpu cloning purposes.' + initial_keep_loaded: list[LoadedModel] = current_loaded_models.copy() + additional_models = [] + if unload_additional_models: + additional_models = model.get_nested_additional_models() + keep_loaded = [] + for loaded_model in initial_keep_loaded: + if loaded_model.model is not None: + if model.clone_base_uuid == loaded_model.model.clone_base_uuid: + continue + # check additional models if they are a match + skip = False + for add_model in additional_models: + if add_model.clone_base_uuid == loaded_model.model.clone_base_uuid: + skip = True + break + if skip: + continue + keep_loaded.append(loaded_model) + if not all_devices: + free_memory(1e30, get_torch_device(), keep_loaded) + else: + for device in get_all_torch_devices(): + free_memory(1e30, device, keep_loaded) def debug_memory_summary(): if is_amd() or is_nvidia(): diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 4f9d8403e..00a15fa63 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -35,6 +35,7 @@ import comfy.model_management import comfy.ops import comfy.patcher_extension import comfy.utils +import comfy_aimdo.host_buffer from comfy.comfy_types import UnetWrapperFunction from comfy.quant_ops import QuantizedTensor from comfy.patcher_extension import CallbacksMP, PatcherInjection, WrappersMP @@ -77,12 +78,15 @@ 
def set_model_options_pre_cfg_function(model_options, pre_cfg_function, disable_ def create_model_options_clone(orig_model_options: dict): return comfy.patcher_extension.copy_nested_dicts(orig_model_options) -def create_hook_patches_clone(orig_hook_patches): +def create_hook_patches_clone(orig_hook_patches, copy_tuples=False): new_hook_patches = {} for hook_ref in orig_hook_patches: new_hook_patches[hook_ref] = {} for k in orig_hook_patches[hook_ref]: new_hook_patches[hook_ref][k] = orig_hook_patches[hook_ref][k][:] + if copy_tuples: + for i in range(len(new_hook_patches[hook_ref][k])): + new_hook_patches[hook_ref][k][i] = tuple(new_hook_patches[hook_ref][k][i]) return new_hook_patches def wipe_lowvram_weight(m): @@ -117,6 +121,8 @@ def string_to_seed(data): return comfy.utils.string_to_seed(data) class LowVramPatch: + is_lowvram_patch = True + def __init__(self, key, patches, convert_func=None, set_func=None): self.key = key self.patches = patches @@ -124,11 +130,21 @@ class LowVramPatch: self.set_func = set_func self.prepared_patches = None - def prepare(self, allocate_buffer, stream): - self.prepared_patches = [ - (patch[0], comfy.lora.prefetch_prepared_value(patch[1], allocate_buffer, stream), patch[2], patch[3], patch[4]) + def memory_required(self): + counter = [0] + for patch in self.patches[self.key]: + comfy.lora.prefetch_prepared_value(patch[1], counter, None, None, False) + return counter[0] + + def prepare(self, destination, stream, copy=True, commit=True): + counter = [0] + prepared_patches = [ + (patch[0], comfy.lora.prefetch_prepared_value(patch[1], counter, destination, stream, copy), patch[2], patch[3], patch[4]) for patch in self.patches[self.key] ] + if commit: + self.prepared_patches = prepared_patches + return prepared_patches def clear_prepared(self): self.prepared_patches = None @@ -316,7 +332,10 @@ class ModelPatcher: self.is_clip = False self.hook_mode = comfy.hooks.EnumHookMode.MaxSpeed - self.cached_patcher_init: tuple[Callable, tuple] | 
None = None + self.cached_patcher_init: tuple[Callable, tuple] | tuple[Callable, tuple, int] | None = None + self.is_multigpu_base_clone = False + self.clone_base_uuid = uuid.uuid4() + if not hasattr(self.model, 'model_loaded_weight_memory'): self.model.model_loaded_weight_memory = 0 @@ -341,9 +360,6 @@ class ModelPatcher: self.size = comfy.model_management.module_size(self.model) return self.size - def model_mmap_residency(self, free=False): - return comfy.model_management.module_mmap_residency(self.model, free=free) - def loaded_size(self): return self.model.model_loaded_weight_memory @@ -356,7 +372,8 @@ class ModelPatcher: #than pays for CFG. So return everything both torch and Aimdo could give us aimdo_mem = 0 if comfy.memory_management.aimdo_enabled: - aimdo_mem = comfy_aimdo.model_vbar.vbars_analyze() + aimdo_device = device.index if getattr(device, "type", None) == "cuda" else None + aimdo_mem = comfy_aimdo.model_vbar.vbars_analyze(aimdo_device) return comfy.model_management.get_free_memory(device) + aimdo_mem def get_clone_model_override(self): @@ -370,6 +387,8 @@ class ModelPatcher: if self.cached_patcher_init is None: raise RuntimeError("Cannot create non-dynamic delegate: cached_patcher_init is not initialized.") temp_model_patcher = self.cached_patcher_init[0](*self.cached_patcher_init[1], disable_dynamic=True) + if len(self.cached_patcher_init) > 2: + temp_model_patcher = temp_model_patcher[self.cached_patcher_init[2]] model_override = temp_model_patcher.get_clone_model_override() if model_override is None: model_override = self.get_clone_model_override() @@ -428,19 +447,113 @@ class ModelPatcher: n.hook_mode = self.hook_mode n.cached_patcher_init = self.cached_patcher_init + n.is_multigpu_base_clone = self.is_multigpu_base_clone + n.clone_base_uuid = self.clone_base_uuid for callback in self.get_all_callbacks(CallbacksMP.ON_CLONE): callback(self, n) return n + def deepclone_multigpu(self, new_load_device=None, models_cache: 
dict[uuid.UUID,ModelPatcher]=None): + logging.info(f"Creating deepclone of {self.model.__class__.__name__} for {new_load_device if new_load_device else self.load_device}.") + if self.cached_patcher_init is None: + raise RuntimeError( + f"Cannot create multigpu deepclone of {self.model.__class__.__name__}: " + "the loader that produced this model does not support multigpu " + "(cached_patcher_init is not initialized). Use a core loader " + "(CheckpointLoaderSimple, UNETLoader, CLIPLoader/DualCLIPLoader, VAELoader), " + "or have the custom loader register a cached_patcher_init factory." + ) + comfy.model_management.unload_model_and_clones(self) + # Produce a freshly-loaded patcher from the loader factory so the multigpu + # clone owns its own untainted model weights (rather than relying on + # copy.deepcopy of an already-patched/already-loaded module). + temp_model_patcher: ModelPatcher | list[ModelPatcher] = self.cached_patcher_init[0](*self.cached_patcher_init[1]) + if len(self.cached_patcher_init) > 2: + temp_model_patcher = temp_model_patcher[self.cached_patcher_init[2]] + # Override clone()'s normal "share self.model + share backup containers" with + # the pristine model from temp_model_patcher plus empty backup containers -- + # the fresh model has no patches applied, so any deepcopy of self's stale + # backup/object_patches_backup/pinned would just propagate dead state that + # no longer corresponds to anything in n.model. + model_override = (temp_model_patcher.model, ({}, {}, {}, set())) + n = self.clone(model_override=model_override) + # clone() copies hook_backup by reference from self; reset since model is pristine. + n.hook_backup = {} + # set load device, if present + if new_load_device is not None: + n.load_device = new_load_device + # Ensure any per-device bookkeeping (e.g. ModelPatcherDynamic.dynamic_pins) + # has an entry for n.load_device on the freshly-loaded n.model. temp_model_patcher's + # __init__ only registered its own (default) load_device. 
def match_multigpu_clones(self):
    """Rebuild every stored "multigpu" clone so it mirrors this patcher again.

    For each patcher currently registered under the "multigpu" additional-models
    key, a fresh ``self.clone()`` is taken and the existing clone's load device,
    model, backup containers, hook backup and ``is_multigpu_base_clone`` flag are
    copied onto it.  Additional models that this patcher has but the existing
    clone lacks are deep-cloned for the clone's load device; additional models
    the existing clone has but this patcher no longer lists are filtered out.
    The rebuilt list then replaces the "multigpu" entry on ``self``.

    Nothing happens when no "multigpu" additional models are registered.
    """
    multigpu_models = self.get_additional_models_with_key("multigpu")
    if len(multigpu_models) > 0:
        new_multigpu_models = []
        for mm in multigpu_models:
            # clone main model, but bring over relevant props from existing multigpu clone
            n = self.clone()
            n.load_device = mm.load_device
            n.backup = mm.backup
            n.object_patches_backup = mm.object_patches_backup
            n.hook_backup = mm.hook_backup
            n.model = mm.model
            n.is_multigpu_base_clone = mm.is_multigpu_base_clone
            # the rebuilt clone must not itself carry a "multigpu" list
            n.remove_additional_models("multigpu")
            # snapshot what the base clone lists, then start from what the existing multigpu clone has
            orig_additional_models: dict[str, list[ModelPatcher]] = comfy.patcher_extension.copy_nested_dicts(n.additional_models)
            n.additional_models = comfy.patcher_extension.copy_nested_dicts(mm.additional_models)
            # figure out which additional models are not present in multigpu clone
            models_cache = {}
            for mm_add_model in mm.get_additional_models():
                models_cache[mm_add_model.clone_base_uuid] = mm_add_model
            # every uuid starts as a removal candidate; uuids still listed by the base are struck off below
            remove_models_uuids = set(list(models_cache.keys()))
            for key, model_list in orig_additional_models.items():
                for orig_add_model in model_list:
                    if orig_add_model.clone_base_uuid not in models_cache:
                        # missing on the multigpu clone: deepclone it for the clone's device and register it
                        models_cache[orig_add_model.clone_base_uuid] = orig_add_model.deepclone_multigpu(new_load_device=n.load_device, models_cache=models_cache)
                        existing_list = n.get_additional_models_with_key(key)
                        existing_list.append(models_cache[orig_add_model.clone_base_uuid])
                        n.set_additional_models(key, existing_list)
                    if orig_add_model.clone_base_uuid in remove_models_uuids:
                        remove_models_uuids.remove(orig_add_model.clone_base_uuid)
            # remove duplicate additional models
            for key, model_list in n.additional_models.items():
                new_model_list = [x for x in model_list if x.clone_base_uuid not in remove_models_uuids]
                n.set_additional_models(key, new_model_list)
            # callbacks receive (base patcher, rebuilt clone)
            for callback in self.get_all_callbacks(CallbacksMP.ON_MATCH_MULTIGPU_CLONES):
                callback(self, n)
            new_multigpu_models.append(n)
        self.set_additional_models("multigpu", new_multigpu_models)
def prepare_state(self, timestep, model_options):
    """Run the ON_PREPARE_STATE callbacks here and, once, on every multigpu clone.

    Args:
        timestep: forwarded unchanged to each callback and each clone.
        model_options: options dict; read for "ignore_multigpu" and
            "multigpu_clones", and temporarily given an "ignore_multigpu"
            entry while the clones are being prepared.
    """
    # Sample the flag before the callbacks run, exactly as the fan-out decision requires.
    skip_fan_out = model_options.get("ignore_multigpu", False)

    for callback in self.get_all_callbacks(CallbacksMP.ON_PREPARE_STATE):
        callback(self, timestep, model_options)

    if skip_fan_out or "multigpu_clones" not in model_options:
        return

    # Mark the dict so the clones' own prepare_state calls do not fan out again.
    model_options["ignore_multigpu"] = True
    try:
        for clone_patcher in model_options["multigpu_clones"].values():
            clone_patcher.prepare_state(timestep, model_options)
    finally:
        model_options.pop("ignore_multigpu", None)
p._handle_changed_hook_keyframes(multigpu_kf_changed_cache) + + def _handle_changed_hook_keyframes(self, kf_changed_cache: list[comfy.hooks.Hook]): + 'Used to handle multigpu behavior inside prepare_hook_patches_current_keyframe.' + if kf_changed_cache is None: + return + reset_current_hooks = False + # reset current_hooks if contains hook that changed + for hook in kf_changed_cache: + if self.current_hooks is not None: + for current_hook in self.current_hooks.hooks: + if current_hook == hook: + reset_current_hooks = True + break + for cached_group in list(self.cached_hook_patches.keys()): + if cached_group.contains(hook): + self.cached_hook_patches.pop(cached_group) + if reset_current_hooks: + self.patch_hooks(None) def register_all_hook_patches(self, hooks: comfy.hooks.HookGroup, target_dict: dict[str], model_options: dict=None, registered: comfy.hooks.HookGroup = None): @@ -1550,9 +1704,30 @@ class ModelPatcherDynamic(ModelPatcher): super().__init__(model, load_device, offload_device, size, weight_inplace_update) if not hasattr(self.model, "dynamic_vbars"): self.model.dynamic_vbars = {} + if not hasattr(self.model, "dynamic_pins"): + self.model.dynamic_pins = {} + self.register_load_device(self.load_device) self.non_dynamic_delegate_model = None assert load_device is not None + def register_load_device(self, device): + """Ensure dynamic_pins has an entry for *device*. + + Called from __init__ and also from any code that retargets an + already-constructed patcher to a new load_device (e.g. the + Select{Model,CLIP,VAE}Device selector nodes); without this entry + partially_unload_ram() raises KeyError when it tries to read the + per-device pin state. 
def restore_loaded_backups(self):
    """Put every backed-up weight and buffer back on the model.

    Drains ``self.backup`` and ``self.backup_buffers`` (in insertion order),
    writing each saved value back onto ``self.model``, then zeroes
    ``model_loaded_weight_memory``.

    Returns:
        The value ``model_loaded_weight_memory`` held before it was reset.
    """
    model = self.model
    previously_loaded = model.model_loaded_weight_memory

    for param_key in tuple(self.backup):
        saved = self.backup.pop(param_key)
        comfy.utils.set_attr_param(model, param_key, saved.weight)

    for buffer_key in tuple(self.backup_buffers):
        saved_buffer = self.backup_buffers.pop(buffer_key)
        comfy.utils.set_attr_buffer(model, buffer_key, saved_buffer)

    model.model_loaded_weight_memory = 0
    return previously_loaded
comfy.lora.calculate_shape(self.patches[key], weight, key) != weight.shape: return (True, 0) - setattr(m, param_key + "_lowvram_function", LowVramPatch(key, self.patches)) + lowvram_patch = LowVramPatch(key, self.patches) + lowvram_patch._pin_state = pin_state + setattr(m, param_key + "_lowvram_function", lowvram_patch) num_patches += 1 else: setattr(m, param_key + "_lowvram_function", None) @@ -1653,6 +1848,9 @@ class ModelPatcherDynamic(ModelPatcher): def force_load_param(self, param_key, device_to): key = key_param_name_to_key(n, param_key) + weight, _, _ = get_key_weight(self.model, key) + if weight is None: + return if key in self.backup: comfy.utils.set_attr_param(self.model, key, self.backup[key].weight) self.patch_weight_to_device(key, device_to=device_to, force_cast=True) @@ -1662,17 +1860,26 @@ class ModelPatcherDynamic(ModelPatcher): if hasattr(m, "comfy_cast_weights"): m.comfy_cast_weights = True - m.pin_failed = False m.seed_key = n + m._pin_state = pin_state set_dirty(m, dirty) - force_load, v_weight_size = setup_param(self, m, n, "weight") - force_load_bias, v_weight_bias = setup_param(self, m, n, "bias") - force_load = force_load or force_load_bias - v_weight_size += v_weight_bias + #Models that mix tiny and giant weights can causing lopsided stream buffer + #rotations and stall. force the tinys over. 
+ if module_mem > 16 * 1024: + force_load, v_weight_size = setup_param(self, m, n, "weight") + force_load_bias, v_weight_bias = setup_param(self, m, n, "bias") + force_load = force_load or force_load_bias + v_weight_size += v_weight_bias + if force_load: + logging.info(f"Module {n} has resizing Lora - force loading") + else: + force_load=True if force_load: - logging.info(f"Module {n} has resizing Lora - force loading") + if hasattr(m, "_v"): + comfy_aimdo.model_vbar.vbar_unpin(m._v) + delattr(m, "_v") force_load_param(self, "weight", device_to) force_load_param(self, "bias", device_to) else: @@ -1730,33 +1937,62 @@ class ModelPatcherDynamic(ModelPatcher): freed = 0 if vbar is None else vbar.free_memory(memory_to_free) if freed < memory_to_free: - for key in list(self.backup.keys()): - bk = self.backup.pop(key) - comfy.utils.set_attr_param(self.model, key, bk.weight) - for key in list(self.backup_buffers.keys()): - comfy.utils.set_attr_buffer(self.model, key, self.backup_buffers.pop(key)) - freed += self.model.model_loaded_weight_memory - self.model.model_loaded_weight_memory = 0 + freed += self.restore_loaded_backups() return freed - def pinned_memory_size(self): - total = 0 - loading = self._load_list(for_dynamic=True) - for x in loading: - _, _, _, _, m, _ = x - pin = comfy.pinned_memory.get_pin(m) - if pin is not None: - total += pin.numel() * pin.element_size() - return total + def loaded_ram_size(self): + return (self.model.dynamic_pins[self.load_device]["weights"][0].size + + self.model.dynamic_pins[self.load_device]["patches"][0].size) - def partially_unload_ram(self, ram_to_unload): - loading = self._load_list(for_dynamic=True, default_device=self.offload_device) - for x in loading: - *_, m, _ = x - ram_to_unload -= comfy.pinned_memory.unpin_memory(m) - if ram_to_unload <= 0: - return + def pinned_memory_size(self): + return (self.model.dynamic_pins[self.load_device]["weights"][3][0] + + self.model.dynamic_pins[self.load_device]["patches"][3][0]) + + def 
def partially_unload_ram(self, ram_to_unload, subsets=("weights", "patches")):
    """Release pinned host-buffer entries until enough RAM has been given back.

    Entries are popped from the top of each subset's stack (most recently
    pushed first); the subsets are processed in the order given.

    Args:
        ram_to_unload: number of bytes the caller wants released.
        subsets: keys of the per-device pin state to drain.  The default is
            an immutable tuple so it cannot be shared-and-mutated across
            calls; any iterable of keys is accepted.

    Returns:
        Total size in bytes of the entries that were released (may exceed
        ``ram_to_unload`` because whole entries are released).
    """
    freed = 0
    pin_state = self.model.dynamic_pins[self.load_device]
    for subset in subsets:
        hostbuf, stack, stack_split, pinned_size = pin_state[subset]
        while stack:
            module, offset = stack.pop()
            size = module._pin.numel() * module._pin.element_size()
            del module._pin
            # NOTE(review): truncate() presumably shrinks the host buffer back to
            # `offset`, unregistering the range when asked - confirm in comfy_aimdo.
            hostbuf.truncate(offset, do_unregister=module._pin_registered)
            # keep the split index inside the shortened stack
            stack_split[0] = min(stack_split[0], len(stack) - 1)
            if module._pin_registered:
                # only registered pins were counted in the pinned-memory totals
                comfy.model_management.TOTAL_PINNED_MEMORY = max(0, comfy.model_management.TOTAL_PINNED_MEMORY - size)
                pinned_size[0] = max(0, pinned_size[0] - size)
            freed += size
            ram_to_unload -= size
            if ram_to_unload <= 0:
                return freed
    return freed
class MultiGPUThreadPool:
    """Persistent thread pool for multi-GPU work distribution.

    One daemon worker thread is started per device passed to the constructor.
    Each worker calls torch.cuda.set_device() once at startup (so compiled
    kernel caches stay warm across diffusion steps) and then serves jobs from
    its own work queue, answering on its own result queue with
    ``(result, None)`` on success or ``(None, exception)`` on failure.
    """

    def __init__(self, devices: list[torch.device]):
        self._workers: list[threading.Thread] = []
        self._work_queues: dict[torch.device, queue.Queue] = {}
        self._result_queues: dict[torch.device, queue.Queue] = {}

        for device in devices:
            jobs, results = queue.Queue(), queue.Queue()
            self._work_queues[device] = jobs
            self._result_queues[device] = results
            worker = threading.Thread(target=self._worker_loop, args=(device, jobs, results), daemon=True)
            worker.start()
            self._workers.append(worker)

    def _worker_loop(self, device: torch.device, work_q: queue.Queue, result_q: queue.Queue):
        """Serve jobs for *device* until the ``None`` sentinel arrives."""
        try:
            torch.cuda.set_device(device)
        except Exception as e:
            logging.error(f"MultiGPUThreadPool: failed to set device {device}: {e}")
            # The device is unusable: answer every job with the setup error so
            # callers blocked in get_result() are released.
            for _job in iter(work_q.get, None):
                result_q.put((None, e))
            return

        for fn, args, kwargs in iter(work_q.get, None):
            try:
                outcome = fn(*args, **kwargs)
            except Exception as job_error:
                result_q.put((None, job_error))
            else:
                result_q.put((outcome, None))

    def submit(self, device: torch.device, fn, *args, **kwargs):
        """Queue ``fn(*args, **kwargs)`` for the worker bound to *device*."""
        job = (fn, args, kwargs)
        self._work_queues[device].put(job)

    def get_result(self, device: torch.device):
        """Block until the next ``(result, error)`` pair from *device* is available."""
        return self._result_queues[device].get()

    @property
    def devices(self) -> list[torch.device]:
        """Devices that have a worker, in construction order."""
        return list(self._work_queues.keys())

    def shutdown(self):
        """Send the stop sentinel to every worker, then wait up to 5 s for each."""
        for jobs in self._work_queues.values():
            jobs.put(None)  # sentinel
        for worker in self._workers:
            worker.join(timeout=5.0)
+ model = model.clone() + # check if multigpu is already prepared - get the load devices from them if possible to exclude + skip_devices = set() + multigpu_models = model.get_additional_models_with_key("multigpu") + if len(multigpu_models) > 0: + for mm in multigpu_models: + skip_devices.add(mm.load_device) + skip_devices = list(skip_devices) + + # Exclude the primary model's actual device, not the global current device: + # after SelectModelDevice(gpu:N) the primary may not live on the process's + # current CUDA device, and excluding the wrong device picks bad extras. + all_devices = comfy.model_management.get_all_torch_devices(exclude_current=False) + full_extra_devices = [d for d in all_devices if d != model.load_device] + limit_extra_devices = full_extra_devices[:max_gpus-1] + extra_devices = limit_extra_devices.copy() + # exclude skipped devices + for skip in skip_devices: + if skip in extra_devices: + extra_devices.remove(skip) + # create new deepclones + if len(extra_devices) > 0: + for device in extra_devices: + device_patcher = None + if reuse_loaded: + # Only reuse a previously-loaded MultiGPU clone. A SelectModelDevice + # patcher on the same device shares clone_base_uuid but has + # is_multigpu_base_clone=False, which would later be filtered out by + # prepare_model_patcher_multigpu_clones() and silently shrink the + # work split back to one GPU. 
+ loaded_models: list[ModelPatcher] = comfy.model_management.loaded_models() + for lm in loaded_models: + if lm.model is None: + continue + if lm.load_device != device: + continue + if lm.clone_base_uuid != model.clone_base_uuid: + continue + if not getattr(lm, "is_multigpu_base_clone", False): + continue + device_patcher = lm.clone() + logging.info(f"Reusing loaded multigpu deepclone of {device_patcher.model.__class__.__name__} for {device}") + break + if device_patcher is None: + device_patcher = model.deepclone_multigpu(new_load_device=device) + # Always flag the clone; whether reused or freshly deepcloned, it must + # advertise itself as a MultiGPU base clone so the cond scheduler picks + # it up in prepare_model_patcher_multigpu_clones(). + device_patcher.is_multigpu_base_clone = True + multigpu_models = model.get_additional_models_with_key("multigpu") + multigpu_models.append(device_patcher) + model.set_additional_models("multigpu", multigpu_models) + model.match_multigpu_clones() + if gpu_options is None: + gpu_options = GPUOptionsGroup() + gpu_options.register(model) + else: + logging.info("No extra torch devices need initialization, skipping initializing MultiGPU Work Units.") + # only keep model clones that don't go 'past' the intended max_gpu count; + # this prunes any inherited multigpu clones whose load_device is no longer allowed + # when max_gpus is lowered between runs. 
LoadBalance = namedtuple('LoadBalance', ['work_per_device', 'idle_time'])


def load_balance_devices(model_options: dict[str], total_work: int, return_idle_time=False, work_normalized: int=None):
    """Split ``total_work`` units across devices in proportion to their relative speed.

    Args:
        model_options: must contain ``'multigpu_clones'`` (its keys are the
            participating devices) and ``'multigpu_options'`` (maps a device to
            a dict with a ``'relative_speed'`` entry).
        total_work: number of indivisible work units to hand out.
        return_idle_time: when true, also estimate how long devices wait on
            each other.
        work_normalized: optional common amount of work to scale the idle
            time to, so results for different ``total_work`` are comparable.

    Returns:
        ``LoadBalance(work_per_device, idle_time)`` where ``work_per_device``
        maps each device to a whole number of units summing to ``total_work``
        and ``idle_time`` is ``None`` unless ``return_idle_time`` is set.
        Idle time is relative and does not correspond to real-world units.
    """
    opts_dict = model_options['multigpu_options']
    devices = list(model_options['multigpu_clones'].keys())
    speed_per_device = [opts_dict[device]['relative_speed'] for device in devices]
    # Sum only the speeds of devices that actually receive work. Summing every
    # entry of opts_dict would under-assign work whenever the options dict
    # still lists a device that has no clone.
    total_speed = sum(speed_per_device, 0.0)
    # relative work for each device: w = (W*r)/R
    work_per_device = [(total_work * speed) / total_speed for speed in speed_per_device]
    # relative work must be expressed in whole numbers, but likely is a decimal;
    # round while keeping the total equal to the sum of the relative works
    work_per_device = round_preserved(work_per_device)
    dict_work_per_device = dict(zip(devices, work_per_device))
    if not return_idle_time:
        return LoadBalance(dict_work_per_device, None)
    # work / speed gives the estimated (relative) completion time per device
    completion_time = [w / r for w, r in zip(work_per_device, speed_per_device)]
    # time the fastest finisher spends waiting for the slowest one
    idle_time = abs(min(completion_time) - max(completion_time))
    # to compare idle times across different totals, normalize to a common total work;
    # skipped when total_work is 0 to avoid dividing by zero
    if work_normalized and total_work:
        idle_time *= (work_normalized / total_work)

    return LoadBalance(dict_work_per_device, idle_time)


def round_preserved(values: list[float]):
    """Round all values in a list to integers, preserving the rounded combined sum.

    Every value is floored, then the units still missing from
    ``round(sum(values))`` are handed out one each to the entries with the
    largest fractional parts (ties go to the earlier entry).  A true floor is
    used, so negative inputs keep the sum as well.
    """
    import math  # local import: the module's import block does not provide math

    floored = [math.floor(x) for x in values]
    # units lost by flooring that still have to be distributed
    remainder = max(round(sum(values)) - sum(floored), 0)
    # indices ordered by fractional part, largest first (stable for ties)
    by_fraction = sorted(range(len(values)), key=lambda i: values[i] - floored[i], reverse=True)
    for index in by_fraction[:remainder]:
        floored[index] += 1
    return floored
dtype, device, bias_dtype, non_blockin cast_buffer_offset += buffer_size return buffer + def get_stream_pin_buffer_offset(buffer_size): + nonlocal stream_pin_hostbuf + nonlocal stream_pin_offset + + if buffer_size == 0 or offload_stream is None: + return None + + if stream_pin_hostbuf is None: + stream_pin_hostbuf = comfy.model_management.get_pin_buffer(offload_stream) + if stream_pin_hostbuf is None: + return None + + offset = stream_pin_offset + stream_pin_offset += buffer_size + return offset + for s in comfy_modules: signature = comfy_aimdo.model_vbar.vbar_fault(s._v) resident = comfy_aimdo.model_vbar.vbar_signature_compare(signature, s._v_signature) @@ -162,23 +184,47 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin if xfer_dest is None: xfer_dest = get_cast_buffer(dest_size) - if signature is None and pin is None: - comfy.pinned_memory.pin_memory(s) - pin = comfy.pinned_memory.get_pin(s) - else: - pin = None + def cast_maybe_lowvram_patch(xfer_source, xfer_dest, stream): + if xfer_source is not None: + if getattr(xfer_source, "is_lowvram_patch", False): + xfer_source.prepare(xfer_dest, stream, copy=True, commit=False) + else: + comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=stream) - if pin is not None: - comfy.model_management.cast_to_gathered(xfer_source, pin) - xfer_source = [ pin ] - #send it over - comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=offload_stream) + def handle_pin(m, pin, source, dest, subset="weights", size=None): + if pin is not None: + cast_maybe_lowvram_patch([pin], dest, offload_stream) + return + if signature is None: + comfy.pinned_memory.pin_memory(m, subset=subset, size=size) + pin = comfy.pinned_memory.get_pin(m, subset=subset) + if pin is not None: + if isinstance(source, list): + comfy.model_management.cast_to_gathered(source, pin, non_blocking=non_blocking, stream=offload_stream, r2=dest) + 
else: + cast_maybe_lowvram_patch(source, pin, None) + cast_maybe_lowvram_patch([ pin ], dest, offload_stream) + return + if pin is None: + pin_offset = get_stream_pin_buffer_offset(size) + if pin_offset is not None: + stream_pin_queue.append((source, pin_offset, size, dest)) + return + cast_maybe_lowvram_patch(source, dest, offload_stream) + + handle_pin(s, pin, xfer_source, xfer_dest, size=dest_size) for param_key in ("weight", "bias"): - lowvram_fn = getattr(s, param_key + "_lowvram_function", None) - if lowvram_fn is not None: + lowvram_source = getattr(s, param_key + "_lowvram_function", None) + if lowvram_source is not None: ensure_offload_stream(s, cast_buffer_offset, False) - lowvram_fn.prepare(lambda size: get_cast_buffer(size), offload_stream) + lowvram_size = lowvram_source.memory_required() + lowvram_dest = get_cast_buffer(lowvram_size) + lowvram_source.prepare(lowvram_dest, None, copy=False, commit=True) + + pin = comfy.pinned_memory.get_pin(lowvram_source, subset="patches") + handle_pin(lowvram_source, pin, lowvram_source, lowvram_dest, subset="patches", size=lowvram_size) + prefetch["xfer_dest"] = xfer_dest prefetch["cast_dest"] = cast_dest @@ -186,6 +232,23 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin prefetch["needs_cast"] = needs_cast s._prefetch = prefetch + if stream_pin_offset > 0: + if stream_pin_hostbuf.size < stream_pin_offset: + if not comfy.model_management.resize_pin_buffer(stream_pin_hostbuf, stream_pin_offset + STREAM_PIN_BUFFER_HEADROOM): + for xfer_source, _, _, xfer_dest in stream_pin_queue: + cast_maybe_lowvram_patch(xfer_source, xfer_dest, offload_stream) + return offload_stream + stream_pin_tensor = comfy_aimdo.torch.hostbuf_to_tensor(stream_pin_hostbuf) + stream_pin_tensor.untyped_storage()._comfy_hostbuf = stream_pin_hostbuf + for xfer_source, pin_offset, pin_size, xfer_dest in stream_pin_queue: + pin = stream_pin_tensor[pin_offset:pin_offset + pin_size] + if isinstance(xfer_source, list): + 
comfy.model_management.cast_to_gathered(xfer_source, pin, non_blocking=non_blocking, stream=offload_stream, r2=xfer_dest) + else: + cast_maybe_lowvram_patch(xfer_source, pin, None) + comfy.model_management.cast_to_gathered([ pin ], xfer_dest, non_blocking=non_blocking, stream=offload_stream) + stream_pin_hostbuf._comfy_event = offload_stream.record_event() + return offload_stream @@ -260,7 +323,7 @@ def resolve_cast_module_with_vbar(s, dtype, device, bias_dtype, compute_dtype, w def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, offloadable=False, compute_dtype=None, want_requant=False): - # NOTE: offloadable=False is a a legacy and if you are a custom node author reading this please pass + # NOTE: offloadable=False is a legacy mode and if you are a custom node author reading this please pass # offloadable=True and call uncast_bias_weight() after your last usage of the weight/bias. This # will add async-offload support to your cast and improve performance. if input is not None: @@ -985,6 +1048,184 @@ class QuantLinearFunc(torch.autograd.Function): return grad_input, grad_weight, grad_bias, None, None, None +# Quantized-weight module helpers + +def _quantized_apply(module, fn, recurse=True): + """Re-wrap Parameters after fn so .to()/.cuda() propagate through QuantizedTensor weights.""" + if recurse: + for child in module.children(): + child._apply(fn) + for key, param in module._parameters.items(): + if param is None: + continue + p = fn(param) + if (not torch.is_inference_mode_enabled()) and p.is_inference(): + p = p.clone() + module.register_parameter(key, torch.nn.Parameter(p, requires_grad=False)) + for key, buf in module._buffers.items(): + if buf is not None: + module._buffers[key] = fn(buf) + return module + + +def _load_quantized_module(module, super_load, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs, load_extra_params=False): + """Shared _load_from_state_dict body for quantized-weight 
modules. + + Pops weight (+ scales, +/- extras), populates module.weight as a Parameter + or Parameter-wrapped QuantizedTensor, then calls super_load and strips + consumed keys from missing_keys. Reads compute_dtype from factory_kwargs + and disabled formats from module._disabled_formats. + """ + device = module.factory_kwargs["device"] + compute_dtype = module.factory_kwargs["dtype"] + disabled_formats = module._disabled_formats + layer_name = prefix.rstrip('.') + + weight = state_dict.pop(f"{prefix}weight", None) + if weight is None: + logging.warning(f"Missing weight for layer {layer_name}") + module.weight = None + return + manually_loaded_keys = [f"{prefix}weight"] + + def pop_scale(name, dtype=None): + key = f"{prefix}{name}" + v = state_dict.pop(key, None) + if v is not None: + v = v.to(device=device) + if dtype is not None: + v = v.view(dtype=dtype) + manually_loaded_keys.append(key) + return v + + layer_conf = state_dict.pop(f"{prefix}comfy_quant", None) + if layer_conf is not None: + layer_conf = json.loads(layer_conf.numpy().tobytes()) + + if layer_conf is None: + module.weight = torch.nn.Parameter(weight.to(device=device, dtype=compute_dtype), requires_grad=False) + else: + module.quant_format = layer_conf.get("format", None) + module._full_precision_mm_config = layer_conf.get("full_precision_matrix_mult", False) + if not module._full_precision_mm: + module._full_precision_mm = module._full_precision_mm_config + if module.quant_format in disabled_formats: + module._full_precision_mm = True + if module.quant_format is None: + raise ValueError(f"Unknown quantization format for layer {layer_name}") + + if module.quant_format not in QUANT_ALGOS: + raise ValueError( + f"Quantization format '{module.quant_format}' for layer {layer_name} " + f"is not available in this build (supported: {sorted(QUANT_ALGOS.keys())}). " + "Update comfy_kitchen to enable it." 
+ ) + + qconfig = QUANT_ALGOS[module.quant_format] + module.layout_type = qconfig["comfy_tensor_layout"] + layout_cls = get_layout_class(module.layout_type) + module._layout_cls = layout_cls + # W4A16-style layouts keep the activation in compute dtype; the forward + # path reads this to decide whether to quantize the input. + module._layout_quantizes_input = getattr(layout_cls, "QUANTIZES_INPUT", True) + + # Per-format scales; fp8 dtype views handle both legacy uint8-on-disk and native fp8. + if module.quant_format in ("float8_e4m3fn", "float8_e5m2"): + scales = {"scale": pop_scale("weight_scale")} + elif module.quant_format == "mxfp8": + bs = pop_scale("weight_scale", torch.float8_e8m0fnu) + if bs is None: + raise ValueError(f"Missing MXFP8 block scales for layer {layer_name}") + scales = {"scale": bs} + elif module.quant_format == "nvfp4": + ts = pop_scale("weight_scale_2") + bs = pop_scale("weight_scale", torch.float8_e4m3fn) + if ts is None or bs is None: + raise ValueError(f"Missing NVFP4 scales for layer {layer_name}") + scales = {"scale": ts, "block_scale": bs} + elif module.quant_format == "svdquant_w4a4": + # SVDQuant W4A4: per-group weight scales + low-rank correction + # (proj_down, proj_up) + activation smoothing (smooth_factor). + wscales = pop_scale("weight_scale") + proj_down = pop_scale("proj_down") + proj_up = pop_scale("proj_up") + smooth_factor = pop_scale("smooth_factor") + if any(t is None for t in (wscales, proj_down, proj_up, smooth_factor)): + raise ValueError(f"Missing SVDQuant W4A4 parameters for layer {layer_name}") + scales = { + "scale": wscales, + "proj_down": proj_down, + "proj_up": proj_up, + "smooth_factor": smooth_factor, + "act_unsigned": bool(layer_conf.get("act_unsigned", False)), + } + elif module.quant_format == "awq_w4a16": + # AWQ W4A16: int4 weight, fp16/bf16 activation. Used by + # Qwen-Image-Edit modulation linears so they stay packed instead of + # being dequantized to bf16 at load time. 
+ wscales = pop_scale("weight_scale") + wzeros = pop_scale("weight_zero") + if wscales is None or wzeros is None: + raise ValueError(f"Missing AWQ W4A16 parameters for layer {layer_name}") + scales = { + "scale": wscales, + "zeros": wzeros, + "group_size": int(layer_conf.get("group_size", qconfig.get("group_size", 64))), + } + else: + raise ValueError(f"Unsupported quantization format: {module.quant_format}") + + params = layout_cls.Params(**scales, orig_dtype=compute_dtype, orig_shape=module._orig_shape) + module.weight = torch.nn.Parameter( + QuantizedTensor(weight.to(device=device, dtype=qconfig["storage_t"]), module.layout_type, params), + requires_grad=False, + ) + + if load_extra_params: + for param_name in qconfig["parameters"]: + if param_name in {"weight_scale", "weight_scale_2"}: + continue + param_key = f"{prefix}{param_name}" + _v = state_dict.pop(param_key, None) + if _v is None: + continue + module.register_parameter(param_name, torch.nn.Parameter(_v.to(device=device), requires_grad=False)) + manually_loaded_keys.append(param_key) + + super_load(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs) + for key in manually_loaded_keys: + if key in missing_keys: + missing_keys.remove(key) + + +def _quantized_weight_state_dict(module, sd, prefix, extra_quant_conf=None, extra_quant_params=()): + """Shared state_dict body. 
extra_quant_conf merges into the comfy_quant JSON; + extra_quant_params names attributes written as additional top-level keys.""" + if not hasattr(module, 'weight'): + logging.warning(f"Warning: state dict on uninitialized op {prefix}") + return sd + bias = getattr(module, 'bias', None) + if bias is not None: + sd[f"{prefix}bias"] = bias + if module.weight is None: + return sd + if isinstance(module.weight, QuantizedTensor): + sd.update(module.weight.state_dict(f"{prefix}weight")) + quant_conf = {"format": module.quant_format} + if getattr(module, '_full_precision_mm_config', False): + quant_conf["full_precision_matrix_mult"] = True + if extra_quant_conf: + quant_conf.update(extra_quant_conf) + sd[f"{prefix}comfy_quant"] = torch.tensor(list(json.dumps(quant_conf).encode("utf-8")), dtype=torch.uint8) + for name in extra_quant_params: + value = getattr(module, name, None) + if value is not None: + sd[f"{prefix}{name}"] = value + else: + sd[f"{prefix}weight"] = module.weight + return sd + def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_precision_mm=False, disabled=[]): class MixedPrecisionOps(manual_cast): @@ -994,21 +1235,16 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec _disabled = disabled class Linear(torch.nn.Module, CastWeightBiasOp): - def __init__( - self, - in_features: int, - out_features: int, - bias: bool = True, - device=None, - dtype=None, - ) -> None: + _disabled_formats = disabled + + def __init__(self, in_features: int, out_features: int, bias: bool = True, device=None, dtype=None): super().__init__() self.factory_kwargs = {"device": device, "dtype": MixedPrecisionOps._compute_dtype} - # self.factory_kwargs = {"device": device, "dtype": dtype} self.in_features = in_features self.out_features = out_features + self._orig_shape = (out_features, in_features) if bias: self.bias = torch.nn.Parameter(torch.empty(out_features, **self.factory_kwargs)) else: @@ -1021,197 +1257,15 @@ def 
mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec def reset_parameters(self): return None - def _load_scale_param(self, state_dict, prefix, param_name, device, manually_loaded_keys, dtype=None): - key = f"{prefix}{param_name}" - value = state_dict.pop(key, None) - if value is not None: - value = value.to(device=device) - if dtype is not None: - value = value.view(dtype=dtype) - manually_loaded_keys.append(key) - return value - - def _load_from_state_dict(self, state_dict, prefix, local_metadata, - strict, missing_keys, unexpected_keys, error_msgs): - - device = self.factory_kwargs["device"] - layer_name = prefix.rstrip('.') - weight_key = f"{prefix}weight" - weight = state_dict.pop(weight_key, None) - if weight is None: - logging.warning(f"Missing weight for layer {layer_name}") - self.weight = None - return - - manually_loaded_keys = [weight_key] - - layer_conf = state_dict.pop(f"{prefix}comfy_quant", None) - if layer_conf is not None: - layer_conf = json.loads(layer_conf.numpy().tobytes()) - - if layer_conf is None: - self.weight = torch.nn.Parameter(weight.to(device=device, dtype=MixedPrecisionOps._compute_dtype), requires_grad=False) - else: - self.quant_format = layer_conf.get("format", None) - self._full_precision_mm_config = layer_conf.get("full_precision_matrix_mult", False) - if not self._full_precision_mm: - self._full_precision_mm = self._full_precision_mm_config - - if self.quant_format in MixedPrecisionOps._disabled: - self._full_precision_mm = True - - if self.quant_format is None: - raise ValueError(f"Unknown quantization format for layer {layer_name}") - - if self.quant_format not in QUANT_ALGOS: - raise ValueError( - f"Quantization format '{self.quant_format}' for layer {layer_name} " - f"is not available in this build (supported: {sorted(QUANT_ALGOS.keys())}). " - "Update comfy_kitchen to enable it." 
- ) - qconfig = QUANT_ALGOS[self.quant_format] - self.layout_type = qconfig["comfy_tensor_layout"] - self._layout_cls = get_layout_class(self.layout_type) - self._layout_quantizes_input = getattr(self._layout_cls, "QUANTIZES_INPUT", True) - layout_cls = self._layout_cls - - # Load format-specific parameters - if self.quant_format in ["float8_e4m3fn", "float8_e5m2"]: - # FP8: single tensor scale - scale = self._load_scale_param(state_dict, prefix, "weight_scale", device, manually_loaded_keys) - - params = layout_cls.Params( - scale=scale, - orig_dtype=MixedPrecisionOps._compute_dtype, - orig_shape=(self.out_features, self.in_features), - ) - - elif self.quant_format == "mxfp8": - # MXFP8: E8M0 block scales stored as uint8 in safetensors - block_scale = self._load_scale_param(state_dict, prefix, "weight_scale", device, manually_loaded_keys, - dtype=torch.uint8) - - if block_scale is None: - raise ValueError(f"Missing MXFP8 block scales for layer {layer_name}") - - block_scale = block_scale.view(torch.float8_e8m0fnu) - - params = layout_cls.Params( - scale=block_scale, - orig_dtype=MixedPrecisionOps._compute_dtype, - orig_shape=(self.out_features, self.in_features), - ) - - elif self.quant_format == "nvfp4": - # NVFP4: tensor_scale (weight_scale_2) + block_scale (weight_scale) - tensor_scale = self._load_scale_param(state_dict, prefix, "weight_scale_2", device, manually_loaded_keys) - block_scale = self._load_scale_param(state_dict, prefix, "weight_scale", device, manually_loaded_keys, - dtype=torch.float8_e4m3fn) - - if tensor_scale is None or block_scale is None: - raise ValueError(f"Missing NVFP4 scales for layer {layer_name}") - - params = layout_cls.Params( - scale=tensor_scale, - block_scale=block_scale, - orig_dtype=MixedPrecisionOps._compute_dtype, - orig_shape=(self.out_features, self.in_features), - ) - elif self.quant_format == "svdquant_w4a4": - # SVDQuant W4A4: per-group weight scales + low-rank correction - # (proj_down, proj_up) + activation smoothing 
(smooth_factor) - wscales = self._load_scale_param(state_dict, prefix, "weight_scale", device, manually_loaded_keys) - proj_down = self._load_scale_param(state_dict, prefix, "proj_down", device, manually_loaded_keys) - proj_up = self._load_scale_param(state_dict, prefix, "proj_up", device, manually_loaded_keys) - smooth_factor = self._load_scale_param(state_dict, prefix, "smooth_factor", device, manually_loaded_keys) - act_unsigned = bool(layer_conf.get("act_unsigned", False)) - - if any(t is None for t in (wscales, proj_down, proj_up, smooth_factor)): - raise ValueError(f"Missing SVDQuant W4A4 parameters for layer {layer_name}") - - params = layout_cls.Params( - scale=wscales, - orig_dtype=MixedPrecisionOps._compute_dtype, - orig_shape=(self.out_features, self.in_features), - proj_down=proj_down, - proj_up=proj_up, - smooth_factor=smooth_factor, - act_unsigned=act_unsigned, - ) - elif self.quant_format == "awq_w4a16": - # AWQ W4A16: int4 weight, fp16/bf16 activation. Used by - # Qwen-Image-Edit modulation linears so they stay packed - # instead of being dequantized to bf16 at load time. 
- wscales = self._load_scale_param(state_dict, prefix, "weight_scale", device, manually_loaded_keys) - wzeros = self._load_scale_param(state_dict, prefix, "weight_zero", device, manually_loaded_keys) - if wscales is None or wzeros is None: - raise ValueError(f"Missing AWQ W4A16 parameters for layer {layer_name}") - params = layout_cls.Params( - scale=wscales, - zeros=wzeros, - group_size=int(layer_conf.get("group_size", qconfig.get("group_size", 64))), - orig_dtype=MixedPrecisionOps._compute_dtype, - orig_shape=(self.out_features, self.in_features), - ) - else: - raise ValueError(f"Unsupported quantization format: {self.quant_format}") - - self.weight = torch.nn.Parameter( - QuantizedTensor(weight.to(device=device, dtype=qconfig["storage_t"]), self.layout_type, params), - requires_grad=False - ) - - for param_name in qconfig["parameters"]: - if param_name in {"weight_scale", "weight_scale_2"}: - continue # Already handled above - - param_key = f"{prefix}{param_name}" - _v = state_dict.pop(param_key, None) - if _v is None: - continue - self.register_parameter(param_name, torch.nn.Parameter(_v.to(device=device), requires_grad=False)) - manually_loaded_keys.append(param_key) - - super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs) - - for key in manually_loaded_keys: - if key in missing_keys: - missing_keys.remove(key) + def _load_from_state_dict(self, *args): + _load_quantized_module(self, super()._load_from_state_dict, *args, load_extra_params=True) def state_dict(self, *args, destination=None, prefix="", **kwargs): - if destination is not None: - sd = destination - else: - sd = {} - - if not hasattr(self, 'weight'): - logging.warning("Warning: state dict on uninitialized op {}".format(prefix)) - return sd - - if self.bias is not None: - sd["{}bias".format(prefix)] = self.bias - - if self.weight is None: - return sd - - if isinstance(self.weight, QuantizedTensor): - sd_out = 
self.weight.state_dict("{}weight".format(prefix)) - for k in sd_out: - sd[k] = sd_out[k] - - quant_conf = {"format": self.quant_format} - if self._full_precision_mm_config: - quant_conf["full_precision_matrix_mult"] = True - if bool(getattr(getattr(self.weight, "_params", None), "act_unsigned", False)): - quant_conf["act_unsigned"] = True - sd["{}comfy_quant".format(prefix)] = torch.tensor(list(json.dumps(quant_conf).encode('utf-8')), dtype=torch.uint8) - - input_scale = getattr(self, 'input_scale', None) - if input_scale is not None: - sd["{}input_scale".format(prefix)] = input_scale - else: - sd["{}weight".format(prefix)] = self.weight - return sd + sd = destination if destination is not None else {} + # Preserve the SVDQuant W4A4 act_unsigned flag on round-trip save. + _params = getattr(getattr(self, 'weight', None), '_params', None) + extra_quant_conf = {"act_unsigned": True} if getattr(_params, 'act_unsigned', False) else None + return _quantized_weight_state_dict(self, sd, prefix, extra_quant_conf=extra_quant_conf, extra_quant_params=("input_scale",)) def _forward(self, input, weight, bias): return torch.nn.functional.linear(input, weight, bias) @@ -1301,25 +1355,126 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec self.weight = torch.nn.Parameter(weight, requires_grad=False) def _apply(self, fn, recurse=True): # This is to get torch.compile + moving weights to another device working - if recurse: - for module in self.children(): - module._apply(fn) + return _quantized_apply(self, fn, recurse) - for key, param in self._parameters.items(): - if param is None: - continue - p = fn(param) - if (not torch.is_inference_mode_enabled()) and p.is_inference(): - p = p.clone() - self.register_parameter(key, torch.nn.Parameter(p, requires_grad=False)) - for key, buf in self._buffers.items(): - if buf is not None: - self._buffers[key] = fn(buf) - return self + class MoEExperts(torch.nn.Module, CastWeightBiasOp): + """Container for E 
quantized expert weights, indexed via expert_weight(i). + + The bank lives on self.weight as a single 3D tensor — either a + compute_dtype Parameter or a Parameter wrapping a QuantizedTensor + with leading expert dim. + + State-dict layout matches mixed_precision_ops.Linear with a leading + expert dim: + {prefix}.weight quant data (storage_t), leading dim = E + {prefix}.weight_scale block / per-tensor scale + {prefix}.weight_scale_2 [E] or scalar NVFP4 only + {prefix}.bias [E, out_features] optional, compute_dtype + {prefix}.comfy_quant json -> {{"format": "...", "num_experts": E}} + + Without comfy_quant the weight loads as a plain compute_dtype 3D Parameter [E, out, in]. + """ + + _disabled_formats = disabled + + def __init__(self, num_experts: int, in_features: int, out_features: int, bias: bool = True, device=None, dtype=None): + super().__init__() + self.num_experts = num_experts + self.in_features = in_features + self.out_features = out_features + self._orig_shape = (num_experts, out_features, in_features) + self.factory_kwargs = {"device": device, "dtype": MixedPrecisionOps._compute_dtype} + if bias: + self.bias = torch.nn.Parameter(torch.empty(num_experts, out_features, **self.factory_kwargs)) + else: + self.register_parameter("bias", None) + + # Populated by _load_from_state_dict: + self.weight = None + self.quant_format = None + self.layout_type = None + self._full_precision_mm = MixedPrecisionOps._full_precision_mm + self._full_precision_mm_config = False + self._resident_bank = None + + def reset_parameters(self): + return None + + def _apply(self, fn, recurse=True): + return _quantized_apply(self, fn, recurse) + + def _load_from_state_dict(self, *args): + _load_quantized_module(self, super()._load_from_state_dict, *args, load_extra_params=False) + + def expert_weight(self, i: int): + """Expert i's weight (Tensor or per-expert QuantizedTensor view).""" + if isinstance(self.weight, QuantizedTensor): + return self._expert_qt_from(self.weight, i) + return 
self.weight[i] + + @contextlib.contextmanager + def bank_resident(self, input): + """Cast the whole bank once; expert_linear inside reuses the cast. + Not re-entrant — do not nest calls on the same instance. + """ + weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) + self._resident_bank = (weight, bias) + try: + yield self + finally: + self._resident_bank = None + uncast_bias_weight(self, weight, bias, offload_stream) + + def expert_linear(self, input: torch.Tensor, i: int) -> torch.Tensor: + """Linear against expert i's weight (with optional bias).""" + resident = getattr(self, "_resident_bank", None) + if resident is not None: + weight, bias = resident + return self._expert_linear_impl(input, weight, bias, i) + weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) + try: + return self._expert_linear_impl(input, weight, bias, i) + finally: + uncast_bias_weight(self, weight, bias, offload_stream) + + def _expert_linear_impl(self, input, weight, bias, i): + if isinstance(weight, QuantizedTensor): + qw = self._expert_qt_from(weight, i) + else: + qw = weight[i] + b = cast_to_input(bias[i], input, copy=False) if bias is not None else None + + if isinstance(qw, QuantizedTensor): + use_fast = ( + not self._full_precision_mm + and qw.layout_cls.supports_fast_matmul() + and input.dim() == 2 + ) + if use_fast: + qin = QuantizedTensor.from_float(input, self.layout_type) + return torch.nn.functional.linear(qin, qw, b) + out = input @ qw.dequantize().t() + return out + b if b is not None else out + return torch.nn.functional.linear(input, qw, b) + + def _expert_qt_from(self, weight: QuantizedTensor, i: int) -> QuantizedTensor: + """Build a per-expert QuantizedTensor by indexing into a resident bank.""" + params = weight._params + kwargs = { + "scale": params.scale[i] if params.scale.dim() else params.scale, + "orig_dtype": params.orig_dtype, + "orig_shape": (self.out_features, self.in_features), + } + if hasattr(params, 
"block_scale"): # NVFP4 + kwargs["block_scale"] = params.block_scale[i] + return QuantizedTensor(weight._qdata[i], weight._layout_cls, type(params)(**kwargs)) + + def state_dict(self, *args, destination=None, prefix="", **kwargs): + sd = destination if destination is not None else {} + return _quantized_weight_state_dict(self, sd, prefix, extra_quant_conf={"num_experts": self.num_experts}) class Embedding(manual_cast.Embedding): - def _load_from_state_dict(self, state_dict, prefix, local_metadata, - strict, missing_keys, unexpected_keys, error_msgs): + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs): weight_key = f"{prefix}weight" layer_conf = state_dict.pop(f"{prefix}comfy_quant", None) if layer_conf is not None: @@ -1327,14 +1482,16 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec # Only fp8 makes sense for embeddings (per-row dequant via index select). # Block-scaled formats (NVFP4, MXFP8) can't do per-row lookup efficiently. 
- quant_format = layer_conf.get("format", None) if layer_conf is not None else None - if quant_format in ["float8_e4m3fn", "float8_e5m2"] and weight_key in state_dict: + quant_format = layer_conf.get("format") if layer_conf is not None else None + manually_loaded_keys = [] + + if quant_format in ("float8_e4m3fn", "float8_e5m2") and weight_key in state_dict: self.quant_format = quant_format qconfig = QUANT_ALGOS[quant_format] self.layout_type = qconfig["comfy_tensor_layout"] layout_cls = get_layout_class(self.layout_type) weight = state_dict.pop(weight_key) - manually_loaded_keys = [weight_key] + manually_loaded_keys.append(weight_key) scale_key = f"{prefix}weight_scale" scale = state_dict.pop(scale_key, None) @@ -1350,35 +1507,19 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec self.weight = torch.nn.Parameter( QuantizedTensor(weight.to(dtype=qconfig["storage_t"]), qconfig["comfy_tensor_layout"], params), requires_grad=False) + elif layer_conf is not None: + # Unsupported format — restore the marker so it round-trips; fall through to default load. 
+ state_dict[f"{prefix}comfy_quant"] = torch.tensor( + list(json.dumps(layer_conf).encode('utf-8')), dtype=torch.uint8) - super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs) - for k in manually_loaded_keys: - if k in missing_keys: - missing_keys.remove(k) - else: - if layer_conf is not None: - state_dict[f"{prefix}comfy_quant"] = torch.tensor(list(json.dumps(layer_conf).encode('utf-8')), dtype=torch.uint8) - super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs) + super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs) + for k in manually_loaded_keys: + if k in missing_keys: + missing_keys.remove(k) def state_dict(self, *args, destination=None, prefix="", **kwargs): - if destination is not None: - sd = destination - else: - sd = {} - - if not hasattr(self, 'weight') or self.weight is None: - return sd - - if isinstance(self.weight, QuantizedTensor): - sd_out = self.weight.state_dict("{}weight".format(prefix)) - for k in sd_out: - sd[k] = sd_out[k] - - quant_conf = {"format": self.quant_format} - sd["{}comfy_quant".format(prefix)] = torch.tensor(list(json.dumps(quant_conf).encode('utf-8')), dtype=torch.uint8) - else: - sd["{}weight".format(prefix)] = self.weight - return sd + sd = destination if destination is not None else {} + return _quantized_weight_state_dict(self, sd, prefix) def forward_comfy_cast_weights(self, input, out_dtype=None): weight = self.weight diff --git a/comfy/patcher_extension.py b/comfy/patcher_extension.py index 5ee4d5ee5..189ee84ca 100644 --- a/comfy/patcher_extension.py +++ b/comfy/patcher_extension.py @@ -1,8 +1,9 @@ -from __future__ import annotations from typing import Callable class CallbacksMP: ON_CLONE = "on_clone" + ON_DEEPCLONE_MULTIGPU = "on_deepclone_multigpu" + ON_MATCH_MULTIGPU_CLONES = "on_match_multigpu_clones" ON_LOAD = "on_load_after" 
ON_DETACH = "on_detach_after" ON_CLEANUP = "on_cleanup" diff --git a/comfy/pinned_memory.py b/comfy/pinned_memory.py index 6d3ba367a..0e8f573ba 100644 --- a/comfy/pinned_memory.py +++ b/comfy/pinned_memory.py @@ -2,42 +2,62 @@ import comfy.model_management import comfy.memory_management import comfy_aimdo.host_buffer import comfy_aimdo.torch +import torch from comfy.cli_args import args -def get_pin(module): - return getattr(module, "_pin", None) +def get_pin(module, subset="weights"): + pin = getattr(module, "_pin", None) + if pin is None or module._pin_registered or args.disable_pinned_memory: + return pin -def pin_memory(module): - if module.pin_failed or args.disable_pinned_memory or get_pin(module) is not None: + _, _, stack_split, pinned_size = module._pin_state[subset] + size = pin.nbytes + comfy.model_management.ensure_pin_registerable(size) + + if torch.cuda.cudart().cudaHostRegister(pin.data_ptr(), size, 1) != 0: + comfy.model_management.discard_cuda_async_error() + return pin + + module._pin_registered = True + stack_split[0] = max(stack_split[0], module._pin_stack_index) + comfy.model_management.TOTAL_PINNED_MEMORY += size + pinned_size[0] += size + return pin + +def pin_memory(module, subset="weights", size=None): + pin_state = module._pin_state + if args.disable_pinned_memory: return - size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ]) + pin = get_pin(module, subset) + if pin is not None or pin_state["failed"]: + return - if comfy.model_management.MAX_PINNED_MEMORY <= 0 or (comfy.model_management.TOTAL_PINNED_MEMORY + size) > comfy.model_management.MAX_PINNED_MEMORY: - module.pin_failed = True + hostbuf, stack, stack_split, pinned_size = pin_state[subset] + if size is None: + size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ]) + offset = hostbuf.size + registerable_size = size + max(0, hostbuf.size - pinned_size[0]) + + 
comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM) + if (not comfy.model_management.ensure_pin_budget(size) or + not comfy.model_management.ensure_pin_registerable(registerable_size)): + pin_state["failed"] = True return False try: - hostbuf = comfy_aimdo.host_buffer.HostBuffer(size) + hostbuf.extend(size=size) except RuntimeError: - module.pin_failed = True + pin_state["failed"] = True return False - module._pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf) - module._pin_hostbuf = hostbuf + module._pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf)[offset:offset + size] + module._pin.untyped_storage()._comfy_hostbuf = hostbuf + stack.append((module, offset)) + module._pin_registered = True + module._pin_stack_index = len(stack) - 1 + stack_split[0] = max(stack_split[0], module._pin_stack_index) comfy.model_management.TOTAL_PINNED_MEMORY += size + pinned_size[0] += size return True - -def unpin_memory(module): - if get_pin(module) is None: - return 0 - size = module._pin.numel() * module._pin.element_size() - - comfy.model_management.TOTAL_PINNED_MEMORY -= size - if comfy.model_management.TOTAL_PINNED_MEMORY < 0: - comfy.model_management.TOTAL_PINNED_MEMORY = 0 - - del module._pin - del module._pin_hostbuf - return size diff --git a/comfy/sampler_helpers.py b/comfy/sampler_helpers.py index 3782fd2d5..bdce2f2d8 100644 --- a/comfy/sampler_helpers.py +++ b/comfy/sampler_helpers.py @@ -1,16 +1,18 @@ from __future__ import annotations +import torch import uuid import math import collections import comfy.model_management import comfy.conds +import comfy.model_patcher import comfy.utils import comfy.hooks import comfy.patcher_extension from typing import TYPE_CHECKING if TYPE_CHECKING: - from comfy.model_patcher import ModelPatcher from comfy.model_base import BaseModel + from comfy.model_patcher import ModelPatcher from comfy.controlnet import ControlBase def prepare_mask(noise_mask, shape, device): @@ -119,6 +121,47 @@ def 
cleanup_additional_models(models): if hasattr(m, 'cleanup'): m.cleanup() +def preprocess_multigpu_conds(conds: dict[str, list[dict[str]]], model: ModelPatcher, model_options: dict[str]): + '''If multigpu acceleration required, creates deepclones of ControlNets and GLIGEN per device.''' + multigpu_models: list[ModelPatcher] = model.get_additional_models_with_key("multigpu") + if len(multigpu_models) == 0: + return + extra_devices = [x.load_device for x in multigpu_models] + # handle controlnets + controlnets: set[ControlBase] = set() + for k in conds: + for kk in conds[k]: + if 'control' in kk: + controlnets.add(kk['control']) + if len(controlnets) > 0: + # first, unload all controlnet clones + for cnet in list(controlnets): + cnet_models = cnet.get_models() + for cm in cnet_models: + comfy.model_management.unload_model_and_clones(cm, unload_additional_models=True) + + # next, make sure each controlnet has a deepclone for all relevant devices + for cnet in controlnets: + curr_cnet = cnet + while curr_cnet is not None: + for device in extra_devices: + if device not in curr_cnet.multigpu_clones: + curr_cnet.deepclone_multigpu(device, autoregister=True) + curr_cnet = curr_cnet.previous_controlnet + # since all device clones are now present, recreate the linked list for cloned cnets per device + for cnet in controlnets: + curr_cnet = cnet + while curr_cnet is not None: + prev_cnet = curr_cnet.previous_controlnet + for device in extra_devices: + device_cnet = curr_cnet.get_instance_for_device(device) + prev_device_cnet = None + if prev_cnet is not None: + prev_device_cnet = prev_cnet.get_instance_for_device(device) + device_cnet.set_previous_controlnet(prev_device_cnet) + curr_cnet = prev_cnet + # potentially handle gligen - since not widely used, ignored for now + def estimate_memory(model, noise_shape, conds): cond_shapes = collections.defaultdict(list) cond_shapes_min = {} @@ -143,7 +186,8 @@ def prepare_sampling(model: ModelPatcher, noise_shape, conds, 
model_options=None return executor.execute(model, noise_shape, conds, model_options=model_options, force_full_load=force_full_load, force_offload=force_offload) def _prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None, force_full_load=False, force_offload=False): - real_model: BaseModel = None + model.match_multigpu_clones() + preprocess_multigpu_conds(conds, model, model_options) models, inference_memory = get_additional_models(conds, model.model_dtype()) models += get_additional_models_from_model_options(model_options) models += model.get_nested_additional_models() # TODO: does this require inference_memory update? @@ -155,7 +199,7 @@ def _prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=Non memory_required += inference_memory minimum_memory_required += inference_memory comfy.model_management.load_models_gpu([model] + models, memory_required=memory_required, minimum_memory_required=minimum_memory_required, force_full_load=force_full_load) - real_model = model.model + real_model: BaseModel = model.model return real_model, conds, models @@ -201,3 +245,18 @@ def prepare_model_patcher(model: ModelPatcher, conds, model_options: dict): comfy.patcher_extension.merge_nested_dicts(to_load_options.setdefault(wc_name, {}), model_options["transformer_options"][wc_name], copy_dict1=False) return to_load_options + +def prepare_model_patcher_multigpu_clones(model_patcher: ModelPatcher, loaded_models: list[ModelPatcher], model_options: dict): + ''' + In case multigpu acceleration is enabled, prep ModelPatchers for each device. 
+ ''' + multigpu_patchers: list[ModelPatcher] = [x for x in loaded_models if x.is_multigpu_base_clone] + if len(multigpu_patchers) > 0: + multigpu_dict: dict[torch.device, ModelPatcher] = {} + multigpu_dict[model_patcher.load_device] = model_patcher + for x in multigpu_patchers: + x.hook_patches = comfy.model_patcher.create_hook_patches_clone(model_patcher.hook_patches, copy_tuples=True) + x.hook_mode = model_patcher.hook_mode # match main model's hook_mode + multigpu_dict[x.load_device] = x + model_options["multigpu_clones"] = multigpu_dict + return multigpu_patchers diff --git a/comfy/samplers.py b/comfy/samplers.py index 0a4d062db..e31277f7b 100755 --- a/comfy/samplers.py +++ b/comfy/samplers.py @@ -1,7 +1,9 @@ from __future__ import annotations + +import comfy.model_management from .k_diffusion import sampling as k_diffusion_sampling from .extra_samplers import uni_pc -from typing import TYPE_CHECKING, Callable, NamedTuple +from typing import TYPE_CHECKING, Callable, NamedTuple, Any if TYPE_CHECKING: from comfy.model_patcher import ModelPatcher from comfy.model_base import BaseModel @@ -16,6 +18,7 @@ import comfy.model_patcher import comfy.patcher_extension import comfy.hooks import comfy.context_windows +import comfy.multigpu import comfy.utils import scipy.stats import numpy @@ -141,7 +144,7 @@ def can_concat_cond(c1, c2): return cond_equal_size(c1.conditioning, c2.conditioning) -def cond_cat(c_list): +def cond_cat(c_list, device=None): temp = {} for x in c_list: for k in x: @@ -153,6 +156,8 @@ def cond_cat(c_list): for k in temp: conds = temp[k] out[k] = conds[0].concat(conds[1:]) + if device is not None and hasattr(out[k], 'to'): + out[k] = out[k].to(device) return out @@ -212,7 +217,12 @@ def _calc_cond_batch_outer(model: BaseModel, conds: list[list[dict]], x_in: torc ) return executor.execute(model, conds, x_in, timestep, model_options) -def _calc_cond_batch(model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep, model_options): +def 
_calc_cond_batch(model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]): + # NOTE: keep in sync with _calc_cond_batch_multigpu below. Shared logic + # (hooked_to_run accumulation, memory-fit batching, per-chunk output + # aggregation) is duplicated there with per-device scheduling layered on top. + if 'multigpu_clones' in model_options: + return _calc_cond_batch_multigpu(model, conds, x_in, timestep, model_options) out_conds = [] out_counts = [] # separate conds by matching hooks @@ -244,7 +254,7 @@ def _calc_cond_batch(model: BaseModel, conds: list[list[dict]], x_in: torch.Tens if has_default_conds: finalize_default_conds(model, hooked_to_run, default_conds, x_in, timestep, model_options) - model.current_patcher.prepare_state(timestep) + model.current_patcher.prepare_state(timestep, model_options) # run every hooked_to_run separately for hooks, to_run in hooked_to_run.items(): @@ -265,7 +275,6 @@ def _calc_cond_batch(model: BaseModel, conds: list[list[dict]], x_in: torch.Tens input_shape = [len(batch_amount) * first_shape[0]] + list(first_shape)[1:] cond_shapes = collections.defaultdict(list) for tt in batch_amount: - cond = {k: v.size() for k, v in to_run[tt][0].conditioning.items()} for k, v in to_run[tt][0].conditioning.items(): cond_shapes[k].append(v.size()) @@ -345,6 +354,239 @@ def _calc_cond_batch(model: BaseModel, conds: list[list[dict]], x_in: torch.Tens return out_conds +def _calc_cond_batch_multigpu(model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]): + # NOTE: keep in sync with _calc_cond_batch above. Same conds-by-hooks + # accumulation, memory-fit batching, and output aggregation, but adds a + # per-device scheduler, per-device patcher/control lookup, tensor .to(device) + # placement, and MultiGPUThreadPool dispatch around the inner loop. 
+ out_conds = [] + out_counts = [] + # separate conds by matching hooks + hooked_to_run: dict[comfy.hooks.HookGroup,list[tuple[tuple,int]]] = {} + default_conds = [] + has_default_conds = False + + output_device = x_in.device + + for i in range(len(conds)): + out_conds.append(torch.zeros_like(x_in)) + out_counts.append(torch.ones_like(x_in) * 1e-37) + + cond = conds[i] + default_c = [] + if cond is not None: + for x in cond: + if 'default' in x: + default_c.append(x) + has_default_conds = True + continue + p = get_area_and_mult(x, x_in, timestep) + if p is None: + continue + if p.hooks is not None: + model.current_patcher.prepare_hook_patches_current_keyframe(timestep, p.hooks, model_options) + hooked_to_run.setdefault(p.hooks, list()) + hooked_to_run[p.hooks] += [(p, i)] + default_conds.append(default_c) + + if has_default_conds: + finalize_default_conds(model, hooked_to_run, default_conds, x_in, timestep, model_options) + + model.current_patcher.prepare_state(timestep, model_options) + + devices = list(model_options['multigpu_clones'].keys()) + device_batched_hooked_to_run: dict[torch.device, list[tuple[comfy.hooks.HookGroup, tuple]]] = {} + # Track conds currently scheduled per device; single source of truth for capacity checks. + device_load: dict[torch.device, int] = {d: 0 for d in devices} + + total_conds = sum(len(to_run) for to_run in hooked_to_run.values()) + conds_per_device = max(1, math.ceil(total_conds / len(devices))) + + def next_available_device(start: int) -> tuple[int, torch.device]: + """Return (index, device) for the next device with remaining capacity, starting at `start`. + + Scans at most len(devices) positions, so this always terminates. Raises if no device + has remaining capacity, which would indicate a bug in conds_per_device accounting. 
+ """ + for offset in range(len(devices)): + i = (start + offset) % len(devices) + if device_load[devices[i]] < conds_per_device: + return i, devices[i] + raise RuntimeError( + f"MultiGPU scheduler: all {len(devices)} devices at capacity " + f"({conds_per_device}) but conds remain to schedule" + ) + + # run every hooked_to_run separately + index_device = 0 + for hooks, to_run in hooked_to_run.items(): + while len(to_run) > 0: + index_device, current_device = next_available_device(index_device) + remaining_capacity = conds_per_device - device_load[current_device] + + first = to_run[0] + first_shape = first[0][0].shape + # collect candidate indices that can be concatenated with `first`, up to remaining capacity + to_batch_temp = [] + for x in range(len(to_run)): + if can_concat_cond(to_run[x][0], first[0]) and len(to_batch_temp) < remaining_capacity: + to_batch_temp += [x] + + to_batch_temp.reverse() + to_batch = to_batch_temp[:1] + + free_memory = comfy.model_management.get_free_memory(current_device) + for i in range(1, len(to_batch_temp) + 1): + batch_amount = to_batch_temp[:len(to_batch_temp)//i] + input_shape = [len(batch_amount) * first_shape[0]] + list(first_shape)[1:] + cond_shapes = collections.defaultdict(list) + for tt in batch_amount: + for k, v in to_run[tt][0].conditioning.items(): + cond_shapes[k].append(v.size()) + if model.memory_required(input_shape, cond_shapes=cond_shapes) * 1.5 < free_memory: + to_batch = batch_amount + break + + conds_to_batch = [to_run.pop(x) for x in to_batch] + device_load[current_device] += len(conds_to_batch) + device_batched_hooked_to_run.setdefault(current_device, []).append((hooks, conds_to_batch)) + + if device_load[current_device] >= conds_per_device: + index_device += 1 + + class thread_result(NamedTuple): + output: Any + mult: Any + area: Any + batch_chunks: int + cond_or_uncond: Any + error: Exception = None + + def _handle_batch(device: torch.device, batch_tuple: tuple[comfy.hooks.HookGroup, tuple], results: 
list[thread_result]): + try: + # TODO: non-NVIDIA support -- guard with `if device.type == "cuda":` once + # we extend multigpu QA beyond CUDA. Unconditional call crashes on + # XPU/NPU/MPS/CPU/DirectML backends. + torch.cuda.set_device(device) + model_current: BaseModel = model_options["multigpu_clones"][device].model + # run every hooked_to_run separately + with torch.no_grad(): + for hooks, to_batch in batch_tuple: + input_x = [] + mult = [] + c = [] + cond_or_uncond = [] + uuids = [] + area = [] + control: ControlBase = None + patches = None + for x in to_batch: + o = x + p = o[0] + input_x.append(p.input_x) + mult.append(p.mult) + c.append(p.conditioning) + area.append(p.area) + cond_or_uncond.append(o[1]) + uuids.append(p.uuid) + control = p.control + patches = p.patches + + batch_chunks = len(cond_or_uncond) + input_x = torch.cat(input_x).to(device) + c = cond_cat(c, device=device) + timestep_ = torch.cat([timestep.to(device)] * batch_chunks) + + transformer_options = model_current.current_patcher.apply_hooks(hooks=hooks) + if 'transformer_options' in model_options: + transformer_options = comfy.patcher_extension.merge_nested_dicts(transformer_options, + model_options['transformer_options'], + copy_dict1=False) + + if patches is not None: + transformer_options["patches"] = comfy.patcher_extension.merge_nested_dicts( + transformer_options.get("patches", {}), + patches + ) + + transformer_options["cond_or_uncond"] = cond_or_uncond[:] + transformer_options["uuids"] = uuids[:] + transformer_options["sigmas"] = timestep.to(device) + transformer_options["sample_sigmas"] = transformer_options["sample_sigmas"].to(device) + transformer_options["multigpu_thread_device"] = device + + cast_transformer_options(transformer_options, device=device) + c['transformer_options'] = transformer_options + + if control is not None: + device_control = control.get_instance_for_device(device) + c['control'] = device_control.get_control(input_x, timestep_, c, len(cond_or_uncond), 
transformer_options) + + if 'model_function_wrapper' in model_options: + output = model_options['model_function_wrapper'](model_current.apply_model, {"input": input_x, "timestep": timestep_, "c": c, "cond_or_uncond": cond_or_uncond}).to(output_device).chunk(batch_chunks) + else: + output = model_current.apply_model(input_x, timestep_, **c).to(output_device).chunk(batch_chunks) + # TODO: non-NVIDIA support -- the `.to(output_device)` copies + # above are async on CUDA, so the main thread's aggregation + # could race with in-flight transfers. CUDA-only QA has not + # surfaced this in practice, but before extending multigpu + # beyond NVIDIA add a `torch.cuda.synchronize(output_device)` + # here (guarded by `output_device.type == "cuda"`). + results.append(thread_result(output, mult, area, batch_chunks, cond_or_uncond)) + except Exception as e: + results.append(thread_result(None, None, None, None, None, error=e)) + raise + + + def _handle_batch_pooled(device, batch_tuple): + worker_results = [] + _handle_batch(device, batch_tuple, worker_results) + return worker_results + + results: list[thread_result] = [] + thread_pool: comfy.multigpu.MultiGPUThreadPool = model_options.get("multigpu_thread_pool") + + # Submit all GPU work to pool threads + pool_devices = [] + for device, batch_tuple in device_batched_hooked_to_run.items(): + if thread_pool is not None: + thread_pool.submit(device, _handle_batch_pooled, device, batch_tuple) + pool_devices.append(device) + else: + # Fallback: no pool, run everything on main thread + _handle_batch(device, batch_tuple, results) + + # Collect results from pool workers + for device in pool_devices: + worker_results, error = thread_pool.get_result(device) + if error is not None: + raise error + results.extend(worker_results) + + for output, mult, area, batch_chunks, cond_or_uncond, error in results: + if error is not None: + raise error + for o in range(batch_chunks): + cond_index = cond_or_uncond[o] + a = area[o] + if a is None: + 
out_conds[cond_index] += output[o] * mult[o] + out_counts[cond_index] += mult[o] + else: + out_c = out_conds[cond_index] + out_cts = out_counts[cond_index] + dims = len(a) // 2 + for i in range(dims): + out_c = out_c.narrow(i + 2, a[i + dims], a[i]) + out_cts = out_cts.narrow(i + 2, a[i + dims], a[i]) + out_c += output[o] * mult[o] + out_cts += mult[o] + + for i in range(len(out_conds)): + out_conds[i] /= out_counts[i] + + return out_conds + def calc_cond_uncond_batch(model, cond, uncond, x_in, timestep, model_options): #TODO: remove logging.warning("WARNING: The comfy.samplers.calc_cond_uncond_batch function is deprecated please use the calc_cond_batch one instead.") return tuple(calc_cond_batch(model, [cond, uncond], x_in, timestep, model_options)) @@ -643,12 +885,21 @@ def calculate_start_end_timesteps(model, conds): def pre_run_control(model, conds): s = model.model_sampling + # Per-device model lookup so multigpu control clones get the matching + # diffusion_model (e.g. QwenFunControlNet stashes it into extra_args). 
+ device_models: dict = {} + patcher = getattr(model, "current_patcher", None) + if patcher is not None: + for p in patcher.get_additional_models_with_key("multigpu"): + device_models[p.load_device] = p.model for t in range(len(conds)): x = conds[t] percent_to_timestep_function = lambda a: s.percent_to_sigma(a) if 'control' in x: x['control'].pre_run(model, percent_to_timestep_function) + for device, device_cnet in x['control'].multigpu_clones.items(): + device_cnet.pre_run(device_models.get(device, model), percent_to_timestep_function) def apply_empty_x_to_equal_area(conds, uncond, name, uncond_fill_func): cond_cnets = [] @@ -891,7 +1142,9 @@ def cast_to_load_options(model_options: dict[str], device=None, dtype=None): to_load_options = model_options.get("to_load_options", None) if to_load_options is None: return + cast_transformer_options(to_load_options, device, dtype) +def cast_transformer_options(transformer_options: dict[str], device=None, dtype=None): casts = [] if device is not None: casts.append(device) @@ -900,18 +1153,17 @@ def cast_to_load_options(model_options: dict[str], device=None, dtype=None): # if nothing to apply, do nothing if len(casts) == 0: return - # try to call .to on patches - if "patches" in to_load_options: - patches = to_load_options["patches"] + if "patches" in transformer_options: + patches = transformer_options["patches"] for name in patches: patch_list = patches[name] for i in range(len(patch_list)): if hasattr(patch_list[i], "to"): for cast in casts: patch_list[i] = patch_list[i].to(cast) - if "patches_replace" in to_load_options: - patches = to_load_options["patches_replace"] + if "patches_replace" in transformer_options: + patches = transformer_options["patches_replace"] for name in patches: patch_list = patches[name] for k in patch_list: @@ -921,8 +1173,8 @@ def cast_to_load_options(model_options: dict[str], device=None, dtype=None): # try to call .to on any wrappers/callbacks wrappers_and_callbacks = ["wrappers", "callbacks"] 
for wc_name in wrappers_and_callbacks: - if wc_name in to_load_options: - wc: dict[str, list] = to_load_options[wc_name] + if wc_name in transformer_options: + wc: dict[str, list] = transformer_options[wc_name] for wc_dict in wc.values(): for wc_list in wc_dict.values(): for i in range(len(wc_list)): @@ -930,7 +1182,6 @@ def cast_to_load_options(model_options: dict[str], device=None, dtype=None): for cast in casts: wc_list[i] = wc_list[i].to(cast) - class CFGGuider: def __init__(self, model_patcher: ModelPatcher): self.model_patcher = model_patcher @@ -985,16 +1236,32 @@ class CFGGuider: self.inner_model, self.conds, self.loaded_models = comfy.sampler_helpers.prepare_sampling(self.model_patcher, noise.shape, self.conds, self.model_options) device = self.model_patcher.load_device - noise = noise.to(device=device, dtype=torch.float32) - latent_image = latent_image.to(device=device, dtype=torch.float32) - sigmas = sigmas.to(device) - cast_to_load_options(self.model_options, device=device, dtype=self.model_patcher.model_dtype()) + multigpu_patchers = comfy.sampler_helpers.prepare_model_patcher_multigpu_clones(self.model_patcher, self.loaded_models, self.model_options) - try: - self.model_patcher.pre_run() - output = self.inner_sample(noise, latent_image, device, sampler, sigmas, denoise_mask, callback, disable_pbar, seed, latent_shapes=latent_shapes) - finally: - self.model_patcher.cleanup() + # Create persistent thread pool for all GPU devices (main + extras) + if multigpu_patchers: + extra_devices = [p.load_device for p in multigpu_patchers] + all_devices = [device] + extra_devices + self.model_options["multigpu_thread_pool"] = comfy.multigpu.MultiGPUThreadPool(all_devices) + + with comfy.model_management.cuda_device_context(device): + try: + noise = noise.to(device=device, dtype=torch.float32) + latent_image = latent_image.to(device=device, dtype=torch.float32) + sigmas = sigmas.to(device) + cast_to_load_options(self.model_options, device=device, 
dtype=self.model_patcher.model_dtype()) + + self.model_patcher.pre_run() + for multigpu_patcher in multigpu_patchers: + multigpu_patcher.pre_run() + output = self.inner_sample(noise, latent_image, device, sampler, sigmas, denoise_mask, callback, disable_pbar, seed, latent_shapes=latent_shapes) + finally: + thread_pool = self.model_options.pop("multigpu_thread_pool", None) + if thread_pool is not None: + thread_pool.shutdown() + self.model_patcher.cleanup() + for multigpu_patcher in multigpu_patchers: + multigpu_patcher.cleanup() comfy.sampler_helpers.cleanup_models(self.conds, self.loaded_models) del self.inner_model diff --git a/comfy/sd.py b/comfy/sd.py index 2443353a4..30b877b85 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -1,4 +1,3 @@ -from __future__ import annotations import json import torch from enum import Enum @@ -21,6 +20,7 @@ import comfy.ldm.ace.vae.music_dcae_pipeline import comfy.ldm.cogvideo.vae import comfy.ldm.hunyuan_video.vae import comfy.ldm.mmaudio.vae.autoencoder +import comfy.ldm.audio.vae_sa3 import comfy.pixel_space_convert import comfy.weight_adapter import yaml @@ -49,6 +49,7 @@ import comfy.text_encoders.lt import comfy.text_encoders.hunyuan_video import comfy.text_encoders.cosmos import comfy.text_encoders.lumina2 +import comfy.text_encoders.pixeldit import comfy.text_encoders.wan import comfy.text_encoders.hidream import comfy.text_encoders.ace @@ -67,6 +68,8 @@ import comfy.text_encoders.qwen35 import comfy.text_encoders.ernie import comfy.text_encoders.gemma4 import comfy.text_encoders.cogvideo +import comfy.text_encoders.sa3 +import comfy.text_encoders.gpt_oss import comfy.model_patcher import comfy.lora @@ -333,41 +336,43 @@ class CLIP: self.cond_stage_model.set_clip_options({"projected_pooled": False}) self.load_model(tokens) - self.cond_stage_model.set_clip_options({"execution_device": self.patcher.load_device}) + device = self.patcher.load_device + self.cond_stage_model.set_clip_options({"execution_device": device}) 
all_hooks.reset() self.patcher.patch_hooks(None) if show_pbar: pbar = ProgressBar(len(scheduled_keyframes)) - for scheduled_opts in scheduled_keyframes: - t_range = scheduled_opts[0] - # don't bother encoding any conds outside of start_percent and end_percent bounds - if "start_percent" in add_dict: - if t_range[1] < add_dict["start_percent"]: - continue - if "end_percent" in add_dict: - if t_range[0] > add_dict["end_percent"]: - continue - hooks_keyframes = scheduled_opts[1] - for hook, keyframe in hooks_keyframes: - hook.hook_keyframe._current_keyframe = keyframe - # apply appropriate hooks with values that match new hook_keyframe - self.patcher.patch_hooks(all_hooks) - # perform encoding as normal - o = self.cond_stage_model.encode_token_weights(tokens) - cond, pooled = o[:2] - pooled_dict = {"pooled_output": pooled} - # add clip_start_percent and clip_end_percent in pooled - pooled_dict["clip_start_percent"] = t_range[0] - pooled_dict["clip_end_percent"] = t_range[1] - # add/update any keys with the provided add_dict - pooled_dict.update(add_dict) - # add hooks stored on clip - self.add_hooks_to_dict(pooled_dict) - all_cond_pooled.append([cond, pooled_dict]) - if show_pbar: - pbar.update(1) - model_management.throw_exception_if_processing_interrupted() + with model_management.cuda_device_context(device): + for scheduled_opts in scheduled_keyframes: + t_range = scheduled_opts[0] + # don't bother encoding any conds outside of start_percent and end_percent bounds + if "start_percent" in add_dict: + if t_range[1] < add_dict["start_percent"]: + continue + if "end_percent" in add_dict: + if t_range[0] > add_dict["end_percent"]: + continue + hooks_keyframes = scheduled_opts[1] + for hook, keyframe in hooks_keyframes: + hook.hook_keyframe._current_keyframe = keyframe + # apply appropriate hooks with values that match new hook_keyframe + self.patcher.patch_hooks(all_hooks) + # perform encoding as normal + o = self.cond_stage_model.encode_token_weights(tokens) + cond, 
pooled = o[:2] + pooled_dict = {"pooled_output": pooled} + # add clip_start_percent and clip_end_percent in pooled + pooled_dict["clip_start_percent"] = t_range[0] + pooled_dict["clip_end_percent"] = t_range[1] + # add/update any keys with the provided add_dict + pooled_dict.update(add_dict) + # add hooks stored on clip + self.add_hooks_to_dict(pooled_dict) + all_cond_pooled.append([cond, pooled_dict]) + if show_pbar: + pbar.update(1) + model_management.throw_exception_if_processing_interrupted() all_hooks.reset() return all_cond_pooled @@ -381,8 +386,12 @@ class CLIP: self.cond_stage_model.set_clip_options({"projected_pooled": False}) self.load_model(tokens) - self.cond_stage_model.set_clip_options({"execution_device": self.patcher.load_device}) - o = self.cond_stage_model.encode_token_weights(tokens) + device = self.patcher.load_device + self.cond_stage_model.set_clip_options({"execution_device": device}) + + with model_management.cuda_device_context(device): + o = self.cond_stage_model.encode_token_weights(tokens) + cond, pooled = o[:2] if return_dict: out = {"cond": cond, "pooled_output": pooled} @@ -444,9 +453,12 @@ class CLIP: self.cond_stage_model.reset_clip_options() self.load_model(tokens) + device = self.patcher.load_device self.cond_stage_model.set_clip_options({"layer": None}) - self.cond_stage_model.set_clip_options({"execution_device": self.patcher.load_device}) - return self.cond_stage_model.generate(tokens, do_sample=do_sample, max_length=max_length, temperature=temperature, top_k=top_k, top_p=top_p, min_p=min_p, repetition_penalty=repetition_penalty, seed=seed, presence_penalty=presence_penalty) + self.cond_stage_model.set_clip_options({"execution_device": device}) + + with model_management.cuda_device_context(device): + return self.cond_stage_model.generate(tokens, do_sample=do_sample, max_length=max_length, temperature=temperature, top_k=top_k, top_p=top_p, min_p=min_p, repetition_penalty=repetition_penalty, seed=seed, 
presence_penalty=presence_penalty) def decode(self, token_ids, skip_special_tokens=True): return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens) @@ -854,6 +866,34 @@ class VAE: self.working_dtypes = [torch.float32] self.disable_offload = True self.extra_1d_channel = 16 + elif "decoder.layers.3.transformers.0.pre_norm.alpha" in sd: # Stable Audio 3 VAE + if "decoder.layers.3.transformers.11.self_attn.to_out.weight" in sd: + config = {"channels": 256, "transformer_depths": 12, "sinusoidal_blocks": 8, + "sliding_window": [1, 1], "decoder_conv_mapping": False, + "chunk_size": 128, "chunk_midpoint_shift": False} + self.memory_used_encode = lambda shape, dtype: (1500 * shape[2]) * model_management.dtype_size(dtype) + self.memory_used_decode = lambda shape, dtype: (1500 * shape[2] * 4096) * model_management.dtype_size(dtype) + else: + config = {"channels": 128, "transformer_depths": 6, "sinusoidal_blocks": 0, + "sliding_window": None, "decoder_conv_mapping": True, + "chunk_size": 32, "chunk_midpoint_shift": True} + self.memory_used_encode = lambda shape, dtype: (72 * shape[2]) * model_management.dtype_size(dtype) + self.memory_used_decode = lambda shape, dtype: (72 * shape[2] * 4096) * model_management.dtype_size(dtype) + + self.first_stage_model = comfy.ldm.audio.vae_sa3.SA3AudioVAE(**config) + self.latent_channels = 256 + self.output_channels = 2 + self.upscale_ratio = 4096 + self.downscale_ratio = 4096 + self.latent_dim = 1 + self.audio_sample_rate = 44100 + self.process_output = lambda audio: audio + self.process_input = lambda audio: audio + self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32] + #This VAE has Parameters and Buffers the non-dynamic caster cannot handle + #Force cast it for --disable-dynamic-vram users until there is a true core fix. 
+ if not comfy.memory_management.aimdo_enabled: + self.disable_offload = True else: logging.warning("WARNING: No VAE weights detected, VAE not initalized.") self.first_stage_model = None @@ -996,50 +1036,52 @@ class VAE: do_tile = False if self.latent_dim == 2 and samples_in.ndim == 5: samples_in = samples_in[:, :, 0] - try: - memory_used = self.memory_used_decode(samples_in.shape, self.vae_dtype) - model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload) - free_memory = self.patcher.get_free_memory(self.device) - batch_number = int(free_memory / memory_used) - batch_number = max(1, batch_number) - # Pre-allocate output for VAEs that support direct buffer writes - preallocated = False - if getattr(self.first_stage_model, 'comfy_has_chunked_io', False): - pixel_samples = torch.empty(self.first_stage_model.decode_output_shape(samples_in.shape), device=self.output_device, dtype=self.vae_output_dtype()) - preallocated = True + with model_management.cuda_device_context(self.device): + try: + memory_used = self.memory_used_decode(samples_in.shape, self.vae_dtype) + model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload) + free_memory = self.patcher.get_free_memory(self.device) + batch_number = int(free_memory / memory_used) + batch_number = max(1, batch_number) - for x in range(0, samples_in.shape[0], batch_number): - samples = samples_in[x:x + batch_number].to(device=self.device, dtype=self.vae_dtype) - if preallocated: - self.first_stage_model.decode(samples, output_buffer=pixel_samples[x:x+batch_number], **vae_options) - else: - out = self.first_stage_model.decode(samples, **vae_options).to(device=self.output_device, dtype=self.vae_output_dtype(), copy=True) - if pixel_samples is None: - pixel_samples = torch.empty((samples_in.shape[0],) + tuple(out.shape[1:]), device=self.output_device, dtype=self.vae_output_dtype()) - 
pixel_samples[x:x+batch_number].copy_(out) - del out - self.process_output(pixel_samples[x:x+batch_number]) - except Exception as e: - model_management.raise_non_oom(e) - logging.warning("Warning: Ran out of memory when regular VAE decoding, retrying with tiled VAE decoding.") - #NOTE: We don't know what tensors were allocated to stack variables at the time of the - #exception and the exception itself refs them all until we get out of this except block. - #So we just set a flag for tiler fallback so that tensor gc can happen once the - #exception is fully off the books. - do_tile = True + # Pre-allocate output for VAEs that support direct buffer writes + preallocated = False + if getattr(self.first_stage_model, 'comfy_has_chunked_io', False): + pixel_samples = torch.empty(self.first_stage_model.decode_output_shape(samples_in.shape), device=self.output_device, dtype=self.vae_output_dtype()) + preallocated = True - if do_tile: - comfy.model_management.soft_empty_cache() - dims = samples_in.ndim - 2 - if dims == 1 or self.extra_1d_channel is not None: - pixel_samples = self.decode_tiled_1d(samples_in) - elif dims == 2: - pixel_samples = self.decode_tiled_(samples_in) - elif dims == 3: - tile = 256 // self.spacial_compression_decode() - overlap = tile // 4 - pixel_samples = self.decode_tiled_3d(samples_in, tile_x=tile, tile_y=tile, overlap=(1, overlap, overlap)) + for x in range(0, samples_in.shape[0], batch_number): + samples = samples_in[x:x + batch_number].to(device=self.device, dtype=self.vae_dtype) + if preallocated: + self.first_stage_model.decode(samples, output_buffer=pixel_samples[x:x+batch_number], **vae_options) + else: + out = self.first_stage_model.decode(samples, **vae_options).to(device=self.output_device, dtype=self.vae_output_dtype(), copy=True) + if pixel_samples is None: + pixel_samples = torch.empty((samples_in.shape[0],) + tuple(out.shape[1:]), device=self.output_device, dtype=self.vae_output_dtype()) + pixel_samples[x:x+batch_number].copy_(out) + 
del out + self.process_output(pixel_samples[x:x+batch_number]) + except Exception as e: + model_management.raise_non_oom(e) + logging.warning("Warning: Ran out of memory when regular VAE decoding, retrying with tiled VAE decoding.") + #NOTE: We don't know what tensors were allocated to stack variables at the time of the + #exception and the exception itself refs them all until we get out of this except block. + #So we just set a flag for tiler fallback so that tensor gc can happen once the + #exception is fully off the books. + do_tile = True + + if do_tile: + comfy.model_management.soft_empty_cache() + dims = samples_in.ndim - 2 + if dims == 1 or self.extra_1d_channel is not None: + pixel_samples = self.decode_tiled_1d(samples_in) + elif dims == 2: + pixel_samples = self.decode_tiled_(samples_in) + elif dims == 3: + tile = 256 // self.spacial_compression_decode() + overlap = tile // 4 + pixel_samples = self.decode_tiled_3d(samples_in, tile_x=tile, tile_y=tile, overlap=(1, overlap, overlap)) pixel_samples = pixel_samples.to(self.output_device).movedim(1,-1) return pixel_samples @@ -1057,20 +1099,21 @@ class VAE: if overlap is not None: args["overlap"] = overlap - if dims == 1 or self.extra_1d_channel is not None: - args.pop("tile_y") - output = self.decode_tiled_1d(samples, **args) - elif dims == 2: - output = self.decode_tiled_(samples, **args) - elif dims == 3: - if overlap_t is None: - args["overlap"] = (1, overlap, overlap) - else: - args["overlap"] = (max(1, overlap_t), overlap, overlap) - if tile_t is not None: - args["tile_t"] = max(2, tile_t) + with model_management.cuda_device_context(self.device): + if dims == 1 or self.extra_1d_channel is not None: + args.pop("tile_y") + output = self.decode_tiled_1d(samples, **args) + elif dims == 2: + output = self.decode_tiled_(samples, **args) + elif dims == 3: + if overlap_t is None: + args["overlap"] = (1, overlap, overlap) + else: + args["overlap"] = (max(1, overlap_t), overlap, overlap) + if tile_t is not None: + 
args["tile_t"] = max(2, tile_t) - output = self.decode_tiled_3d(samples, **args) + output = self.decode_tiled_3d(samples, **args) return output.movedim(1, -1) def encode(self, pixel_samples): @@ -1083,44 +1126,46 @@ class VAE: pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0) else: pixel_samples = pixel_samples.unsqueeze(2) - try: - memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype) - model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload) - free_memory = self.patcher.get_free_memory(self.device) - batch_number = int(free_memory / max(1, memory_used)) - batch_number = max(1, batch_number) - samples = None - for x in range(0, pixel_samples.shape[0], batch_number): - pixels_in = self.process_input(pixel_samples[x:x + batch_number]).to(self.vae_dtype) - if getattr(self.first_stage_model, 'comfy_has_chunked_io', False): - out = self.first_stage_model.encode(pixels_in, device=self.device) + + with model_management.cuda_device_context(self.device): + try: + memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype) + model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload) + free_memory = self.patcher.get_free_memory(self.device) + batch_number = int(free_memory / max(1, memory_used)) + batch_number = max(1, batch_number) + samples = None + for x in range(0, pixel_samples.shape[0], batch_number): + pixels_in = self.process_input(pixel_samples[x:x + batch_number]).to(self.vae_dtype) + if getattr(self.first_stage_model, 'comfy_has_chunked_io', False): + out = self.first_stage_model.encode(pixels_in, device=self.device) + else: + pixels_in = pixels_in.to(self.device) + out = self.first_stage_model.encode(pixels_in) + out = out.to(self.output_device).to(dtype=self.vae_output_dtype()) + if samples is None: + samples = torch.empty((pixel_samples.shape[0],) + tuple(out.shape[1:]), device=self.output_device, 
dtype=self.vae_output_dtype()) + samples[x:x + batch_number] = out + + except Exception as e: + model_management.raise_non_oom(e) + logging.warning("Warning: Ran out of memory when regular VAE encoding, retrying with tiled VAE encoding.") + #NOTE: We don't know what tensors were allocated to stack variables at the time of the + #exception and the exception itself refs them all until we get out of this except block. + #So we just set a flag for tiler fallback so that tensor gc can happen once the + #exception is fully off the books. + do_tile = True + + if do_tile: + comfy.model_management.soft_empty_cache() + if self.latent_dim == 3: + tile = 256 + overlap = tile // 4 + samples = self.encode_tiled_3d(pixel_samples, tile_x=tile, tile_y=tile, overlap=(1, overlap, overlap)) + elif self.latent_dim == 1 or self.extra_1d_channel is not None: + samples = self.encode_tiled_1d(pixel_samples) else: - pixels_in = pixels_in.to(self.device) - out = self.first_stage_model.encode(pixels_in) - out = out.to(self.output_device).to(dtype=self.vae_output_dtype()) - if samples is None: - samples = torch.empty((pixel_samples.shape[0],) + tuple(out.shape[1:]), device=self.output_device, dtype=self.vae_output_dtype()) - samples[x:x + batch_number] = out - - except Exception as e: - model_management.raise_non_oom(e) - logging.warning("Warning: Ran out of memory when regular VAE encoding, retrying with tiled VAE encoding.") - #NOTE: We don't know what tensors were allocated to stack variables at the time of the - #exception and the exception itself refs them all until we get out of this except block. - #So we just set a flag for tiler fallback so that tensor gc can happen once the - #exception is fully off the books. 
- do_tile = True - - if do_tile: - comfy.model_management.soft_empty_cache() - if self.latent_dim == 3: - tile = 256 - overlap = tile // 4 - samples = self.encode_tiled_3d(pixel_samples, tile_x=tile, tile_y=tile, overlap=(1, overlap, overlap)) - elif self.latent_dim == 1 or self.extra_1d_channel is not None: - samples = self.encode_tiled_1d(pixel_samples) - else: - samples = self.encode_tiled_(pixel_samples) + samples = self.encode_tiled_(pixel_samples) return samples @@ -1146,26 +1191,27 @@ class VAE: if overlap is not None: args["overlap"] = overlap - if dims == 1: - args.pop("tile_y") - samples = self.encode_tiled_1d(pixel_samples, **args) - elif dims == 2: - samples = self.encode_tiled_(pixel_samples, **args) - elif dims == 3: - if tile_t is not None: - tile_t_latent = max(2, self.downscale_ratio[0](tile_t)) - else: - tile_t_latent = 9999 - args["tile_t"] = self.upscale_ratio[0](tile_t_latent) + with model_management.cuda_device_context(self.device): + if dims == 1: + args.pop("tile_y") + samples = self.encode_tiled_1d(pixel_samples, **args) + elif dims == 2: + samples = self.encode_tiled_(pixel_samples, **args) + elif dims == 3: + if tile_t is not None: + tile_t_latent = max(2, self.downscale_ratio[0](tile_t)) + else: + tile_t_latent = 9999 + args["tile_t"] = self.upscale_ratio[0](tile_t_latent) - if overlap_t is None: - args["overlap"] = (1, overlap, overlap) - else: - args["overlap"] = (self.upscale_ratio[0](max(1, min(tile_t_latent // 2, self.downscale_ratio[0](overlap_t)))), overlap, overlap) - maximum = pixel_samples.shape[2] - maximum = self.upscale_ratio[0](self.downscale_ratio[0](maximum)) + if overlap_t is None: + args["overlap"] = (1, overlap, overlap) + else: + args["overlap"] = (self.upscale_ratio[0](max(1, min(tile_t_latent // 2, self.downscale_ratio[0](overlap_t)))), overlap, overlap) + maximum = pixel_samples.shape[2] + maximum = self.upscale_ratio[0](self.downscale_ratio[0](maximum)) - samples = self.encode_tiled_3d(pixel_samples[:,:,:maximum], 
**args) + samples = self.encode_tiled_3d(pixel_samples[:,:,:maximum], **args) return samples @@ -1239,6 +1285,8 @@ class CLIPType(Enum): FLUX2 = 25 LONGCAT_IMAGE = 26 COGVIDEOX = 27 + LENS = 28 + PIXELDIT = 29 @@ -1290,6 +1338,8 @@ class TEModel(Enum): GEMMA_4_E4B = 29 GEMMA_4_E2B = 30 GEMMA_4_31B = 31 + T5_GEMMA = 32 + GPT_OSS_20B = 33 def detect_te_model(sd): @@ -1314,6 +1364,8 @@ def detect_te_model(sd): if weight.shape[0] == 384: return TEModel.BYT5_SMALL_GLYPH return TEModel.T5_BASE + if "model.encoder.layers.0.pre_self_attn_layernorm.weight" in sd: + return TEModel.T5_GEMMA if 'model.layers.0.post_feedforward_layernorm.weight' in sd: if 'model.layers.59.self_attn.q_norm.weight' in sd: return TEModel.GEMMA_4_31B @@ -1329,6 +1381,9 @@ def detect_te_model(sd): else: return TEModel.GEMMA_3_4B return TEModel.GEMMA_2_2B + # Must precede the Qwen2.5-7B k_proj.bias=512 check (GPT-OSS also has 8*64=512). + if "layers.0.self_attn.sinks" in sd and "layers.0.mlp.experts.gate_up_proj.weight" in sd: + return TEModel.GPT_OSS_20B if 'model.layers.0.self_attn.k_proj.bias' in sd: weight = sd['model.layers.0.self_attn.k_proj.bias'] if weight.shape[0] == 256: @@ -1463,6 +1518,10 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip else: clip_target.clip = comfy.text_encoders.sa_t5.SAT5Model clip_target.tokenizer = comfy.text_encoders.sa_t5.SAT5Tokenizer + elif te_model == TEModel.T5_GEMMA: + clip_target.clip = comfy.text_encoders.sa3.SAT5GemmaModel + clip_target.tokenizer = comfy.text_encoders.sa3.SAT5GemmaTokenizer + tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None) elif te_model in (TEModel.GEMMA_4_E4B, TEModel.GEMMA_4_E2B, TEModel.GEMMA_4_31B): variant = {TEModel.GEMMA_4_E4B: comfy.text_encoders.gemma4.Gemma4_E4B, TEModel.GEMMA_4_E2B: comfy.text_encoders.gemma4.Gemma4_E2B, @@ -1471,8 +1530,12 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip clip_target.tokenizer = variant.tokenizer 
tokenizer_data["tokenizer_json"] = clip_data[0].get("tokenizer_json", None) elif te_model == TEModel.GEMMA_2_2B: - clip_target.clip = comfy.text_encoders.lumina2.te(**llama_detect(clip_data)) - clip_target.tokenizer = comfy.text_encoders.lumina2.LuminaTokenizer + if clip_type == CLIPType.PIXELDIT: + clip_target.clip = comfy.text_encoders.pixeldit.pixeldit_te(**llama_detect(clip_data)) + clip_target.tokenizer = comfy.text_encoders.pixeldit.PixelDiTGemma2Tokenizer + else: + clip_target.clip = comfy.text_encoders.lumina2.te(**llama_detect(clip_data)) + clip_target.tokenizer = comfy.text_encoders.lumina2.LuminaTokenizer tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None) elif te_model == TEModel.GEMMA_3_4B: clip_target.clip = comfy.text_encoders.lumina2.te(**llama_detect(clip_data), model_type="gemma3_4b") @@ -1507,6 +1570,10 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip clip_target.clip = comfy.text_encoders.flux.flux2_te(**llama_detect(clip_data), pruned=te_model == TEModel.MISTRAL3_24B_PRUNED_FLUX2) clip_target.tokenizer = comfy.text_encoders.flux.Flux2Tokenizer tokenizer_data["tekken_model"] = clip_data[0].get("tekken_model", None) + elif te_model == TEModel.GPT_OSS_20B: + clip_target.clip = comfy.text_encoders.gpt_oss.lens_te(**llama_detect(clip_data)) + clip_target.tokenizer = comfy.text_encoders.gpt_oss.LensTokenizer + tokenizer_data["tokenizer_json"] = clip_data[0].get("tokenizer_json", None) elif te_model == TEModel.QWEN3_4B: if clip_type == CLIPType.FLUX or clip_type == CLIPType.FLUX2: clip_target.clip = comfy.text_encoders.flux.klein_te(**llama_detect(clip_data), model_type="qwen3_4b") @@ -1673,12 +1740,52 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, o out = load_state_dict_guess_config(sd, output_vae, output_clip, output_clipvision, embedding_directory, output_model, model_options, te_model_options=te_model_options, metadata=metadata, 
disable_dynamic=disable_dynamic) if out is None: raise RuntimeError("ERROR: Could not detect model type of: {}\n{}".format(ckpt_path, model_detection_error_hint(ckpt_path, sd))) - if output_model and out[0] is not None: - out[0].cached_patcher_init = (load_checkpoint_guess_config_model_only, (ckpt_path, embedding_directory, model_options, te_model_options)) - if output_clip and out[1] is not None: - out[1].patcher.cached_patcher_init = (load_checkpoint_guess_config_clip_only, (ckpt_path, embedding_directory, model_options, te_model_options)) + if out[0] is not None: + out[0].cached_patcher_init = (load_checkpoint_guess_config, (ckpt_path, False, False, False, embedding_directory, output_model, model_options, te_model_options), 0) + # Register reload factories for the CLIP and VAE produced by the same checkpoint so + # ModelPatcher.deepclone_multigpu can spawn per-device copies (Select{CLIP,VAE}Device, + # MultiGPU work-units, etc.) without falling back to copy.deepcopy of an + # already-loaded module. + if out[1] is not None and getattr(out[1], "patcher", None) is not None: + out[1].patcher.cached_patcher_init = (load_checkpoint_clip_patcher, (ckpt_path, embedding_directory, model_options, te_model_options)) + if out[2] is not None and getattr(out[2], "patcher", None) is not None: + out[2].patcher.cached_patcher_init = (load_checkpoint_vae_patcher, (ckpt_path, embedding_directory, model_options, te_model_options)) return out + +def load_checkpoint_clip_patcher(ckpt_path, embedding_directory=None, model_options={}, te_model_options={}, disable_dynamic=False): + """Reload only the CLIP patcher from a checkpoint. 
def load_checkpoint_vae_patcher(ckpt_path, embedding_directory=None, model_options={}, te_model_options={}, disable_dynamic=False):
    """Load a checkpoint again, keeping only its VAE, and return that VAE's patcher.

    This is the factory stored in ``cached_patcher_init`` on the VAE patcher
    produced by ``load_checkpoint_guess_config``. The diffusion model, CLIP and
    CLIP-vision outputs are all switched off for the reload.
    """
    # Only the VAE output is requested; every other output flag is off.
    reload_kwargs = dict(
        output_vae=True,
        output_clip=False,
        output_clipvision=False,
        embedding_directory=embedding_directory,
        output_model=False,
        model_options=model_options,
        te_model_options=te_model_options,
        disable_dynamic=disable_dynamic,
    )
    _model, _clip, vae, _clipvision = load_checkpoint_guess_config(ckpt_path, **reload_kwargs)
    return vae.patcher
model_management.unet_inital_load_device(parameters, unet_dtype) model = model_config.get_model(sd, diffusion_model_prefix, device=inital_load_device) ModelPatcher = comfy.model_patcher.ModelPatcher if disable_dynamic else comfy.model_patcher.CoreModelPatcher - model_patcher = ModelPatcher(model, load_device=load_device, offload_device=model_management.unet_offload_device()) + offload_device = model_options.get("offload_device", model_management.unet_offload_device()) + model_patcher = ModelPatcher(model, load_device=load_device, offload_device=offload_device) model.load_model_weights(sd, diffusion_model_prefix, assign=model_patcher.is_dynamic()) if output_vae: vae_sd = comfy.utils.state_dict_prefix_replace(sd, {k: "" for k in model_config.vae_key_prefix}, filter_keys=True) vae_sd = model_config.process_vae_state_dict(vae_sd) - vae = VAE(sd=vae_sd, metadata=metadata) + vae_device = model_options.get("load_device", None) + vae = VAE(sd=vae_sd, metadata=metadata, device=vae_device) if output_clip: if te_model_options.get("custom_operations", None) is None: @@ -1835,7 +1944,7 @@ def load_diffusion_model_state_dict(sd, model_options={}, metadata=None, disable parameters = comfy.utils.calculate_parameters(sd) weight_dtype = comfy.utils.weight_dtype(sd) - load_device = model_management.get_torch_device() + load_device = model_options.get("load_device", model_management.get_torch_device()) model_config = model_detection.model_config_from_unet(sd, "", metadata=metadata) if model_config is not None: @@ -1860,7 +1969,7 @@ def load_diffusion_model_state_dict(sd, model_options={}, metadata=None, disable else: logging.warning("{} {}".format(diffusers_keys[k], k)) - offload_device = model_management.unet_offload_device() + offload_device = model_options.get("offload_device", model_management.unet_offload_device()) unet_weight_dtype = list(model_config.supported_inference_dtypes) if model_config.quant_config is not None: weight_dtype = None @@ -1902,6 +2011,26 @@ def 
def load_vae_patcher(vae_path, metadata=None, device=None, disable_dynamic=False):
    """Rebuild a VAE from the file at ``vae_path`` and hand back its patcher.

    Intended as the ``cached_patcher_init`` factory on ``VAE.patcher`` so a
    per-device clone can be created from disk instead of copying an already
    loaded module.

    ``metadata``: when ``None`` it is read from the file together with the
    state dict; otherwise the supplied value is used and only the state dict
    is loaded.
    ``device``: forwarded unchanged to the ``VAE`` constructor.
    ``disable_dynamic``: accepted but not referenced in this function body.

    Returns ``vae.patcher`` after ``throw_exception_if_invalid()`` has run.
    """
    if metadata is not None:
        # Caller already knows the metadata: only the weights are needed.
        sd = comfy.utils.load_torch_file(vae_path)
    else:
        sd, metadata = comfy.utils.load_torch_file(vae_path, return_metadata=True)

    rebuilt = VAE(sd=sd, metadata=metadata, device=device)
    rebuilt.throw_exception_if_invalid()
    return rebuilt.patcher
class Lens(supported_models_base.BASE):
    """Microsoft Lens (3.8B dual-stream MMDiT, GPT-OSS-20B text features, Flux2 VAE)."""

    # Detection key: this config is selected for unet configs whose
    # "image_model" entry is "lens".
    unet_config = {
        "image_model": "lens",
    }

    sampling_settings = {
        "shift": 1.829,  # Default mu for 1440x1440 (and any seq_len > 4300)
    }

    unet_extra_config = {}
    latent_format = latent_formats.Flux2

    memory_usage_factor = 4.0

    supported_inference_dtypes = [torch.bfloat16, torch.float32]  # fp16 causes NaNs

    # State-dict key prefixes used to split a bundled checkpoint.
    vae_key_prefix = ["vae."]
    text_encoder_key_prefix = ["text_encoders."]

    def __init__(self, unet_config):
        # Pure pass-through to the base-class constructor.
        super().__init__(unet_config)

    def get_model(self, state_dict, prefix="", device=None):
        """Build ``model_base.Lens`` with the FLUX model type on ``device``.

        ``state_dict`` and ``prefix`` are not read here.
        """
        return model_base.Lens(self, model_type=model_base.ModelType.FLUX, device=device)

    def clip_target(self, state_dict={}):
        """Return the tokenizer / text-encoder pair for this model.

        Looks for the key ``<prefix>layers.0.self_attn.sinks`` in
        ``state_dict`` under two candidate prefixes, tried in order:
        ``text_encoders.gpt_oss.transformer.`` and then ``text_encoders.``.
        On the first hit, ``llama_detect`` is run with that prefix and its
        result is forwarded to ``lens_te``. If neither key is present,
        ``lens_te()`` is called with no arguments.
        """
        pref = self.text_encoder_key_prefix[0]
        for hint in ("gpt_oss.transformer.", ""):
            full_prefix = "{}{}".format(pref, hint)
            if "{}layers.0.self_attn.sinks".format(full_prefix) in state_dict:
                detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, full_prefix)
                return supported_models_base.ClipTarget(
                    comfy.text_encoders.gpt_oss.LensTokenizer,
                    comfy.text_encoders.gpt_oss.lens_te(**detect),
                )
        # No recognisable GPT-OSS weights found: fall back to the default encoder.
        return supported_models_base.ClipTarget(
            comfy.text_encoders.gpt_oss.LensTokenizer,
            comfy.text_encoders.gpt_oss.lens_te(),
        )
class PiD(PixelDiTT2I):
    """``PixelDiTT2I`` variant selected when the unet config's "image_model" is "pid".

    Overrides the detection key, the sampling shift and the model class; all
    other attributes and methods are inherited from ``PixelDiTT2I``.
    """

    unet_config = {
        "image_model": "pid",
    }

    sampling_settings = {
        "shift": 1.5,  # close approximation of the original distill 4 steps [0.999, 0.866, 0.634, 0.342, 0]
    }

    # Same value as the parent class declares; restated explicitly here.
    memory_usage_factor = 0.04

    def get_model(self, state_dict, prefix="", device=None):
        """Build ``model_base.PiD`` on ``device``; ``state_dict`` and ``prefix`` are not read."""
        return model_base.PiD(self, device=device)
@dataclass
class GptOss20BConfig:
    """Hyper-parameters of the GPT-OSS-20B decoder used as the Lens text encoder.

    ``layer_types`` lists one attention kind per layer. When it is left as
    ``None`` it is filled in after construction with an alternating pattern
    that starts with ``"sliding_attention"`` at layer 0.
    """

    vocab_size: int = 201088
    hidden_size: int = 2880
    intermediate_size: int = 2880
    num_hidden_layers: int = 24
    num_attention_heads: int = 64
    num_key_value_heads: int = 8
    head_dim: int = 64
    num_local_experts: int = 32
    num_experts_per_tok: int = 4
    sliding_window: int = 128
    original_max_position_embeddings: int = 4096
    rope_theta: float = 150000.0
    rope_factor: float = 32.0
    rope_beta_fast: float = 32.0
    rope_beta_slow: float = 1.0
    rope_truncate: bool = False
    rms_norm_eps: float = 1e-5
    attention_bias: bool = True
    layer_types: Optional[List[str]] = None
    moe_alpha: float = 1.702
    moe_limit: float = 7.0

    def __post_init__(self):
        # An explicit per-layer list always wins over the default pattern.
        if self.layer_types is not None:
            return
        # Even layer indices use the sliding window, odd ones full attention.
        kinds = ("sliding_attention", "full_attention")
        self.layer_types = [kinds[layer % 2] for layer in range(self.num_hidden_layers)]
float: + if scale <= 1: + return 1.0 + return 0.1 * math.log(scale) + 1.0 + + attention_scaling = get_mscale(factor) + + pos_freqs = base ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim) + inv_freq_extrapolation = 1.0 / pos_freqs + inv_freq_interpolation = 1.0 / (factor * pos_freqs) + + low, high = find_correction_range() + extrap_factor = 1 - linear_ramp_factor(low, high, dim // 2) + inv_freq = inv_freq_interpolation * (1 - extrap_factor) + inv_freq_extrapolation * extrap_factor + return inv_freq, attention_scaling + + +def _build_freqs_cis(inv_freq: torch.Tensor, attention_scaling: float, position_ids: torch.Tensor, dtype: torch.dtype, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + inv_freq_e = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + pos_e = position_ids[:, None, :].float() + freqs = (inv_freq_e @ pos_e).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = (emb.cos() * attention_scaling).to(dtype).unsqueeze(1) + sin = (emb.sin() * attention_scaling).to(dtype).unsqueeze(1) + sin_split = sin.shape[-1] // 2 + return cos, sin[..., :sin_split], -sin[..., sin_split:] + + +def _attention_with_sinks(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, sinks: torch.Tensor, + attention_mask: Optional[torch.Tensor], num_heads: int, num_kv_groups: int) -> torch.Tensor: + """Attention with per-head sinks. + + Sinks add a learned term to each row's softmax denominator but contribute + nothing to the output. We fake this by appending one zero k/v position and + putting the sink logit in the mask at that column. 
+ """ + + if num_kv_groups > 1 and not TORCH_HAS_GQA: + k = k.repeat_interleave(num_kv_groups, dim=1) + v = v.repeat_interleave(num_kv_groups, dim=1) + + B, _, S_q, D = q.shape + H_kv = k.shape[1] + S_kv = k.shape[-2] + + k = torch.cat([k, k.new_zeros(B, H_kv, 1, D)], dim=-2) + v = torch.cat([v, v.new_zeros(B, H_kv, 1, D)], dim=-2) + + sinks_col = sinks.to(q.dtype).view(1, num_heads, 1, 1).expand(B, num_heads, S_q, 1) + if attention_mask is not None: + mask_left = attention_mask[..., :S_kv].expand(B, num_heads, S_q, S_kv) + else: + mask_left = q.new_zeros(B, num_heads, S_q, S_kv) + mask = torch.cat([mask_left, sinks_col], dim=-1) + + op = optimized_attention_for_device(q.device, mask=True, small_input=True) + return op(q, k, v, num_heads, mask=mask, skip_reshape=True, enable_gqa=True) + + +class GptOssAttention(nn.Module): + def __init__(self, config: GptOss20BConfig, layer_idx: int, device=None, dtype=None, ops: Any = None): + super().__init__() + self.layer_idx = layer_idx + self.layer_type = config.layer_types[layer_idx] + self.num_heads = config.num_attention_heads + self.num_kv_heads = config.num_key_value_heads + self.num_kv_groups = self.num_heads // self.num_kv_heads + self.head_dim = config.head_dim + self.hidden_size = config.hidden_size + self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None + + bias = config.attention_bias + self.q_proj = ops.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=bias, device=device, dtype=dtype) + self.k_proj = ops.Linear(config.hidden_size, self.num_kv_heads * self.head_dim, bias=bias, device=device, dtype=dtype) + self.v_proj = ops.Linear(config.hidden_size, self.num_kv_heads * self.head_dim, bias=bias, device=device, dtype=dtype) + self.o_proj = ops.Linear(self.num_heads * self.head_dim, config.hidden_size, bias=bias, device=device, dtype=dtype) + self.sinks = nn.Parameter(torch.empty(self.num_heads, device=device, dtype=dtype)) + + def forward(self, hidden_states: 
class GptOssTopKRouter(nn.Module):
    """Linear router that picks the top-k experts for every token.

    Holds a ``(num_local_experts, hidden_size)`` weight and a
    ``(num_local_experts,)`` bias as raw parameters.
    """

    def __init__(self, config: GptOss20BConfig, device=None, dtype=None):
        super().__init__()
        expert_count = config.num_local_experts
        self.top_k = config.num_experts_per_tok
        self.num_experts = expert_count
        self.weight = nn.Parameter(torch.empty(expert_count, config.hidden_size, device=device, dtype=dtype))
        self.bias = nn.Parameter(torch.empty(expert_count, device=device, dtype=dtype))

    def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """Return ``(scores, expert_indices)`` for the ``top_k`` highest router logits.

        ``scores`` is a softmax taken over the selected ``top_k`` logits only,
        not over all experts.
        """
        # Parameters are cast to match the input before the matmul.
        w = comfy.ops.cast_to_input(self.weight, hidden_states, copy=False)
        b = comfy.ops.cast_to_input(self.bias, hidden_states, copy=False)
        router_logits = F.linear(hidden_states, w, b)
        best_logits, best_experts = torch.topk(router_logits, self.top_k, dim=-1)
        return F.softmax(best_logits, dim=-1, dtype=best_logits.dtype), best_experts
bias=True, device=device, dtype=dtype) + self.down_proj = ops.MoEExperts(num_experts=E, in_features=I, out_features=H, bias=True, device=device, dtype=dtype) + + def _apply_gate(self, gate_up: torch.Tensor) -> torch.Tensor: + gate = gate_up[..., ::2] + up = gate_up[..., 1::2] + gate = gate.clamp(max=self.limit) + up = up.clamp(min=-self.limit, max=self.limit) + glu = gate * torch.sigmoid(gate * self.alpha) + return torch.addcmul(glu, up, glu) + + def forward(self, hidden_states: torch.Tensor, router_indices: torch.Tensor, routing_weights: torch.Tensor) -> torch.Tensor: + N = hidden_states.shape[0] + top_k = router_indices.shape[-1] + H = hidden_states.shape[-1] + + per_pair = torch.zeros((N * top_k, H), dtype=hidden_states.dtype, device=hidden_states.device) + + expert_mask = F.one_hot(router_indices, num_classes=self.num_experts).permute(2, 1, 0) + expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero() + + with self.gate_up_proj.bank_resident(hidden_states) as gate_up_bank, \ + self.down_proj.bank_resident(hidden_states) as down_bank: + for ei in expert_hit: + expert_idx = int(ei.item()) + top_k_pos, token_idx = torch.where(expert_mask[expert_idx]) + current = hidden_states[token_idx] + + gate_up = gate_up_bank.expert_linear(current, expert_idx) + gated = self._apply_gate(gate_up) + expert_out = down_bank.expert_linear(gated, expert_idx) + + weighted = expert_out * routing_weights[token_idx, top_k_pos, None] + + flat_idx = token_idx * top_k + top_k_pos + per_pair[flat_idx] = weighted.to(per_pair.dtype) + + return per_pair.view(N, top_k, H).sum(dim=1) + + +class GptOssMLP(nn.Module): + def __init__(self, config: GptOss20BConfig, device=None, dtype=None, ops: Any = None): + super().__init__() + self.router = GptOssTopKRouter(config, device=device, dtype=dtype) + self.experts = GptOssExperts(config, device=device, dtype=dtype, ops=ops) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + B, S, H = hidden_states.shape + flat = 
hidden_states.reshape(-1, H) + scores, idx = self.router(flat) + out = self.experts(flat, idx, scores) + return out.reshape(B, S, H) + + +# Decoder layer + model + +class GptOssDecoderLayer(nn.Module): + def __init__(self, config: GptOss20BConfig, layer_idx: int, device=None, dtype=None, ops: Any = None): + super().__init__() + self.self_attn = GptOssAttention(config, layer_idx, device=device, dtype=dtype, ops=ops) + self.mlp = GptOssMLP(config, device=device, dtype=dtype, ops=ops) + self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, device=device, dtype=dtype) + self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, device=device, dtype=dtype) + self.layer_type = config.layer_types[layer_idx] + + def forward(self, x: torch.Tensor, attention_masks: dict[str, Optional[torch.Tensor]], freqs_cis) -> torch.Tensor: + residual = x + x = self.input_layernorm(x) + x = self.self_attn(x, attention_masks[self.layer_type], freqs_cis) + x = residual + x + + residual = x + x = self.post_attention_layernorm(x) + x = self.mlp(x) + x = residual + x + return x + + +def _make_full_causal_mask(B: int, S: int, key_padding_mask: Optional[torch.Tensor], dtype, device): + neg = torch.finfo(dtype).min + mask = torch.full((S, S), neg, dtype=dtype, device=device).triu_(1) + mask = mask.unsqueeze(0).unsqueeze(0).expand(B, 1, S, S).contiguous() + if key_padding_mask is not None: + kp = key_padding_mask.to(dtype=dtype) + kp = (1.0 - kp).reshape(B, 1, 1, S) * neg + mask = mask + kp + return mask + + +def _make_sliding_causal_mask(B: int, S: int, window: int, key_padding_mask: Optional[torch.Tensor], dtype, device): + neg = torch.finfo(dtype).min + i = torch.arange(S, device=device).view(-1, 1) + j = torch.arange(S, device=device).view(1, -1) + keep = (j <= i) & (j > i - window) + mask = torch.where(keep, torch.zeros((), dtype=dtype, device=device), torch.full((), neg, dtype=dtype, device=device)) + mask = 
class GptOssModel(nn.Module):
    """GPT-OSS decoder with multi-layer hidden-state capture + early exit."""

    def __init__(self, config: GptOss20BConfig, device=None, dtype=None, ops: Any = None):
        """Create the embedding, decoder layers, final norm and rotary buffer.

        ``ops`` supplies the ``Embedding`` class used here and is forwarded to
        every decoder layer; ``device`` and ``dtype`` are forwarded as well.
        """
        super().__init__()
        self.config = config
        self.dtype = dtype
        self.embed_tokens = ops.Embedding(config.vocab_size, config.hidden_size, device=device, dtype=dtype)
        self.layers = nn.ModuleList(
            [
                GptOssDecoderLayer(config, i, device=device, dtype=dtype, ops=ops)
                for i in range(config.num_hidden_layers)
            ]
        )
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, device=device, dtype=dtype)

        # Always build on CPU so the buffer survives meta-device construction.
        inv_freq, attn_scaling = _yarn_inv_freq(
            head_dim=config.head_dim,
            base=config.rope_theta,
            factor=config.rope_factor,
            beta_fast=config.rope_beta_fast,
            beta_slow=config.rope_beta_slow,
            original_max_position_embeddings=config.original_max_position_embeddings,
            truncate=config.rope_truncate,
            device=torch.device("cpu"),
        )
        # Non-persistent: the buffer is not written to / expected in state dicts.
        self.register_buffer("rope_inv_freq", inv_freq, persistent=False)
        self.rope_attention_scaling = float(attn_scaling)

    @property
    def num_layers(self) -> int:
        """Number of decoder layers, taken from the config."""
        return self.config.num_hidden_layers

    def get_input_embeddings(self):
        """Return the token-embedding module."""
        return self.embed_tokens

    def _build_attention_masks(self, B: int, S: int, attention_mask: Optional[torch.Tensor], dtype: torch.dtype, device,
                               ) -> dict[str, torch.Tensor]:
        """Build one additive mask per attention kind, keyed by layer-type name.

        The full causal mask is always built; the sliding-window mask is only
        built when at least one layer in the config is a sliding layer.
        ``attention_mask`` is passed to both builders as the key-padding mask.
        """
        full = _make_full_causal_mask(B, S, attention_mask, dtype, device)
        masks = {"full_attention": full}
        if any(t == "sliding_attention" for t in self.config.layer_types):
            masks["sliding_attention"] = _make_sliding_causal_mask(
                B, S, self.config.sliding_window, attention_mask, dtype, device
            )
        return masks

    def forward(self, input_ids: torch.LongTensor, attention_mask: Optional[torch.Tensor] = None,
                capture_layers: Optional[Sequence[int]] = None) -> dict[str, Any]:
        """Run the decoder and return either captured layer outputs or the final state.

        With a non-empty ``capture_layers``, returns ``{"hidden_states": [...]}``
        holding the raw output of each requested layer index in the order
        requested (the final norm is not applied), and stops after the highest
        requested layer. Otherwise runs every layer and returns
        ``{"last_hidden_state": norm(hidden_states)}``.

        A layer index that appears more than once in ``capture_layers`` is
        stored only in the slot of its last occurrence; earlier slots stay
        ``None``. The same holds for indices that no executed layer matches.
        """
        B, S = input_ids.shape
        device = input_ids.device
        dtype = self.dtype

        hidden_states = self.embed_tokens(input_ids, out_dtype=dtype)

        # Positions are simply 0..S-1 for every batch row.
        position_ids = torch.arange(S, device=device).unsqueeze(0).expand(B, -1)
        freqs_cis = _build_freqs_cis(self.rope_inv_freq.to(device=device), self.rope_attention_scaling, position_ids, dtype)

        attn_masks = self._build_attention_masks(B, S, attention_mask, dtype, device)

        # An empty sequence is treated the same as None (no capture).
        capture_layers = list(capture_layers) if capture_layers else None
        if capture_layers:
            max_layer = max(capture_layers)
            # layer index -> slot in the output list
            wanted = {idx: pos for pos, idx in enumerate(capture_layers)}
            captured: List[Optional[torch.Tensor]] = [None] * len(capture_layers)
        else:
            max_layer = self.config.num_hidden_layers - 1
            wanted = None
            captured = None

        for i, layer in enumerate(self.layers):
            hidden_states = layer(hidden_states, attn_masks, freqs_cis)
            if wanted is not None and i in wanted:
                captured[wanted[i]] = hidden_states
            # Early exit: layers past the last one needed are never executed.
            if i >= max_layer:
                break

        if captured is not None:
            return {"hidden_states": captured}
        return {"last_hidden_state": self.norm(hidden_states)}
+LENS_TXT_OFFSET = 97 +LENS_SELECTED_LAYERS = (5, 11, 17, 23) +LENS_MAX_TOKENS = 512 + + +# The reference GPT-OSS Harmony template injects today's date here +_LENS_CHAT_DATE = "2026-05-23" + + +def _lens_render_chat(prompt: str) -> str: + """Render the Lens prompt in GPT-OSS Harmony format.""" + return ( + f"<|start|>system<|message|>" + f"You are ChatGPT, a large language model trained by OpenAI.\n" + f"Knowledge cutoff: 2024-06\n" + f"Current date: {_LENS_CHAT_DATE}\n\n" + f"Reasoning: medium\n\n" + f"# Valid channels: analysis, commentary, final. " + f"Channel must be included for every message.<|end|>" + f"<|start|>developer<|message|># Instructions\n\n" + f"{_LENS_CHAT_SYSTEM}\n\n<|end|>" + f"<|start|>user<|message|>{prompt}<|end|>" + f"<|start|>assistant<|channel|>analysis<|message|>" + f"{_LENS_CHAT_ASSISTANT_THINKING}<|end|>" + f"<|start|>assistant<|channel|>final<|message|>" + ) + + +# GPT-OSS-20B fixed token IDs (from the tokenizer's added-tokens table). +_LENS_PAD_TOKEN_ID = 199999 # <|endoftext|> + + +class _GptOssRawTokenizer: + """Raw ``tokenizers.Tokenizer`` wrapper. + + The tokenizer JSON ships as a byte tensor inside the encoder checkpoint + (``tokenizer_json`` key) rather than as a committed file. Extracted + it in ``sd.py`` and passes it here via ``tokenizer_data``. + """ + + def __init__(self, tokenizer_json_bytes=None, **kwargs): + from tokenizers import Tokenizer + if isinstance(tokenizer_json_bytes, torch.Tensor): + tokenizer_json_bytes = bytes(tokenizer_json_bytes.tolist()) + if tokenizer_json_bytes is None: + raise ValueError( + "Lens tokenizer requires the ``tokenizer_json`` byte tensor in the " + "encoder state dict. Re-bundle the encoder via bundle_te.py so it " + "embeds the tokenizer." 
+ ) + self.tokenizer = Tokenizer.from_str(tokenizer_json_bytes.decode("utf-8")) + + @classmethod + def from_pretrained(cls, tokenizer_data, **kwargs): + return cls(tokenizer_json_bytes=tokenizer_data, **kwargs) + + def __call__(self, text): + return {"input_ids": self.tokenizer.encode(text, add_special_tokens=False).ids} + + def get_vocab(self): + return self.tokenizer.get_vocab() + + def convert_tokens_to_ids(self, tokens): + return [self.tokenizer.token_to_id(t) for t in tokens] + + def decode(self, ids, **kwargs): + return self.tokenizer.decode(ids, skip_special_tokens=kwargs.get("skip_special_tokens", False)) + + +class LensGptOssTokenizer(sd1_clip.SDTokenizer): + tokenizer_json_data = None + + def __init__(self, embedding_directory=None, tokenizer_data={}): + tokenizer_json = tokenizer_data.get("tokenizer_json", None) + self.tokenizer_json_data = tokenizer_json + super().__init__( + tokenizer_json, + embedding_directory=embedding_directory, + pad_with_end=False, + embedding_size=2880, + embedding_key="gpt_oss", + tokenizer_class=_GptOssRawTokenizer, + has_start_token=False, + has_end_token=False, + pad_to_max_length=False, + max_length=99999999, + min_length=1, + pad_left=False, + disable_weights=True, + tokenizer_data=tokenizer_data, + ) + self.pad_token_id = _LENS_PAD_TOKEN_ID + + def tokenize_with_weights(self, text: str, return_word_ids=False, **kwargs): + # Empty prompt -> empty list; encode_token_weights returns zeros (uncond). 
+ if not text or not text.strip(): + return [[]] + rendered = _lens_render_chat(text) + ids = self.tokenizer(rendered)["input_ids"] + if len(ids) > LENS_MAX_TOKENS: + ids = ids[:LENS_MAX_TOKENS] + return [[(int(t), 1.0) for t in ids]] + + def state_dict(self): + if self.tokenizer_json_data is not None: + return {"tokenizer_json": self.tokenizer_json_data} + return {} + + +class LensTokenizer(sd1_clip.SD1Tokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + super().__init__( + embedding_directory=embedding_directory, + tokenizer_data=tokenizer_data, + name="gpt_oss", + tokenizer=LensGptOssTokenizer, + ) + + +class LensGptOssClipModel(nn.Module): + """SDClipModel-shaped Lens GPT-OSS encoder (multi-layer feature extractor).""" + + def __init__(self, device="cpu", dtype=None, model_options=None, **kwargs): + super().__init__() + model_options = dict(model_options or {}) + + operations = model_options.get("custom_operations") + if operations is None: + quant_config = model_options.get("quantization_metadata") or {} + operations = comfy.ops.mixed_precision_ops(quant_config, dtype, full_precision_mm=True) + self.operations = operations + + cfg_overrides = model_options.get("gpt_oss_config", {}) + self.config = GptOss20BConfig(**cfg_overrides) + self.selected_layers = tuple(model_options.get("selected_layers", LENS_SELECTED_LAYERS)) + self.txt_offset = int(model_options.get("txt_offset", LENS_TXT_OFFSET)) + + self.transformer = GptOssModel(self.config, device=device, dtype=dtype, ops=operations) + self.num_layers = self.config.num_hidden_layers + self.dtype = dtype + self.execution_device = None + self._pad_token_id = _LENS_PAD_TOKEN_ID + + def set_clip_options(self, options): + self.execution_device = options.get("execution_device", self.execution_device) + + def reset_clip_options(self): + self.execution_device = None + + def _gather_tokens(self, token_weight_pairs): + ids_list = [[int(t[0]) for t in batch] for batch in token_weight_pairs] + 
pad_id = self._pad_token_id + max_len = max(len(x) for x in ids_list) + device = self.execution_device + ids = torch.full((len(ids_list), max_len), pad_id, dtype=torch.long, device=device) + mask = torch.zeros((len(ids_list), max_len), dtype=torch.long, device=device) + for i, x in enumerate(ids_list): + ids[i, : len(x)] = torch.tensor(x, dtype=torch.long, device=device) + mask[i, : len(x)] = 1 + return ids, mask + + def encode_token_weights(self, token_weight_pairs): + # Empty negative: emit zero-length features + zero mask + if all(len(batch) == 0 for batch in token_weight_pairs): + device = self.execution_device + B = len(token_weight_pairs) + L = len(self.selected_layers) + H = self.config.hidden_size + flat = torch.zeros(B, 0, L * H, dtype=self.dtype, device=device) + mask = torch.zeros(B, 0, dtype=torch.long, device=device) + return flat, None, {"attention_mask": mask, "num_layers_stacked": L} + + input_ids, attn_mask = self._gather_tokens(token_weight_pairs) + out = self.transformer(input_ids, attention_mask=attn_mask, capture_layers=self.selected_layers) + layers = out["hidden_states"] # list of L × [B, S, H] + stacked = torch.stack(layers, dim=2) # [B, S, L, H] + + offset = self.txt_offset + if stacked.shape[1] > offset: + stacked = stacked[:, offset:].contiguous() + mask_trim = attn_mask[:, offset:] + else: + stacked = stacked[:, :0] + mask_trim = attn_mask[:, :0] + + B, S, L, H = stacked.shape + flat = stacked.reshape(B, S, L * H) + extra = {"attention_mask": mask_trim, "num_layers_stacked": L} + return flat, None, extra + + def load_sd(self, sd): + return self.transformer.load_state_dict(sd, strict=False, assign=True) + + +class LensTEModel(sd1_clip.SD1ClipModel): + def __init__(self, device="cpu", dtype=None, model_options=None): + super().__init__(device=device, dtype=dtype, name="gpt_oss", clip_model=LensGptOssClipModel, model_options=model_options or {}) + + +def lens_te(dtype_llama=None, llama_quantization_metadata=None): + class 
LensTEModel_(LensTEModel): + def __init__(self, device="cpu", dtype=None, model_options=None): + mo = dict(model_options or {}) + if llama_quantization_metadata is not None: + mo["quantization_metadata"] = llama_quantization_metadata + if dtype is None and dtype_llama is not None: + dtype = dtype_llama + super().__init__(device=device, dtype=dtype, model_options=mo) + + return LensTEModel_ diff --git a/comfy/text_encoders/pixeldit.py b/comfy/text_encoders/pixeldit.py new file mode 100644 index 000000000..3539711e4 --- /dev/null +++ b/comfy/text_encoders/pixeldit.py @@ -0,0 +1,104 @@ +import torch + +from comfy import sd1_clip +from .lumina2 import Gemma2BTokenizer, LuminaModel +import comfy.text_encoders.llama + + +class PixelDiTGemma2_2BModel(sd1_clip.SDClipModel): + def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=True, model_options={}): + llama_quantization_metadata = model_options.get("llama_quantization_metadata", None) + if llama_quantization_metadata is not None: + model_options = model_options.copy() + model_options["quantization_metadata"] = llama_quantization_metadata + + super().__init__( + device=device, layer=layer, layer_idx=layer_idx, + textmodel_json_config={}, dtype=dtype, + special_tokens={"start": 2, "pad": 0}, + layer_norm_hidden_state=False, + model_class=comfy.text_encoders.llama.Gemma2_2B, + enable_attention_masks=attention_mask, + return_attention_masks=attention_mask, + model_options=model_options, + ) + + +_PIXELDIT_CHI_PROMPT = ( + 'Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions ' + "suitable for image generation. 
Evaluate the level of detail in the user prompt:\n" + "- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, " + "and spatial relationships to create vivid and concrete scenes.\n" + "- If the prompt is already detailed, refine and enhance the existing details slightly without " + "overcomplicating.\n" + "Here are examples of how to transform or refine prompts:\n" + "- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, " + "sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.\n" + "- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring " + "glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus " + "passing by towering glass skyscrapers.\n" + "Please generate only the enhanced description for the prompt below and avoid including any " + "additional commentary or evaluations:\n" + "User Prompt: " +) + +_PIXELDIT_MAX_LENGTH = 300 +_PIXELDIT_CHI_PROMPT_DETECT_PREFIX = 'Given a user prompt, generate an "Enhanced prompt"' + + +class PixelDiTGemma2Tokenizer(sd1_clip.SD1Tokenizer): + def __init__(self, embedding_directory=None, tokenizer_data=None): + if tokenizer_data is None: + tokenizer_data = {} + super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, + name="gemma2_2b", tokenizer=Gemma2BTokenizer) + + def tokenize_with_weights(self, text, return_word_ids=False, **kwargs): + if not text.strip(): + return super().tokenize_with_weights("", return_word_ids=return_word_ids, disable_weights=True, min_length=_PIXELDIT_MAX_LENGTH) + + chi_token_count = len(self.gemma2_2b.tokenizer(_PIXELDIT_CHI_PROMPT)["input_ids"]) + combined = text if text.startswith(_PIXELDIT_CHI_PROMPT_DETECT_PREFIX) else _PIXELDIT_CHI_PROMPT + text + max_length_all = chi_token_count + _PIXELDIT_MAX_LENGTH - 2 + out = super().tokenize_with_weights(combined, 
return_word_ids=return_word_ids, + disable_weights=True, min_length=max_length_all) + out["gemma2_2b"] = [out["gemma2_2b"][0][:max_length_all]] + return out + + def untokenize(self, token_weight_pair): + return self.gemma2_2b.untokenize(token_weight_pair) + + def state_dict(self): + return self.gemma2_2b.state_dict() + + +class PixelDiTGemma2TE(LuminaModel): + # PixelDiT's select_index: keep BOS + last 299 embeddings of the padded sequence. + def __init__(self, device="cpu", dtype=None, model_options={}): + super().__init__(device=device, dtype=dtype, name="gemma2_2b", + clip_model=PixelDiTGemma2_2BModel, model_options=model_options) + + def encode_token_weights(self, token_weight_pairs): + result = super().encode_token_weights(token_weight_pairs) + cond, pooled = result[0], result[1] + extra = result[2] if len(result) > 2 else None + if cond.shape[1] > _PIXELDIT_MAX_LENGTH: + cond = torch.cat([cond[:, :1], cond[:, -(_PIXELDIT_MAX_LENGTH - 1):]], dim=1) + if extra is not None and "attention_mask" in extra: + am = extra["attention_mask"] + extra["attention_mask"] = torch.cat([am[..., :1], am[..., -(_PIXELDIT_MAX_LENGTH - 1):]], dim=-1) + if extra is not None: + return cond, pooled, extra + return cond, pooled + + +def pixeldit_te(dtype_llama=None, llama_quantization_metadata=None): + class PixelDiTTE_(PixelDiTGemma2TE): + def __init__(self, device="cpu", dtype=None, model_options={}): + if llama_quantization_metadata is not None: + model_options = model_options.copy() + model_options["llama_quantization_metadata"] = llama_quantization_metadata + if dtype_llama is not None: + dtype = dtype_llama + super().__init__(device=device, dtype=dtype, model_options=model_options) + return PixelDiTTE_ diff --git a/comfy/text_encoders/sa3.py b/comfy/text_encoders/sa3.py new file mode 100644 index 000000000..0a1c73ec1 --- /dev/null +++ b/comfy/text_encoders/sa3.py @@ -0,0 +1,207 @@ +import torch +import torch.nn as nn +from comfy import sd1_clip +from comfy.text_encoders.llama 
import Attention as LlamaAttention, RMSNorm, MLP, precompute_freqs_cis, apply_rope, _make_scaled_embedding +from comfy.text_encoders.spiece_tokenizer import SPieceTokenizer + + +class T5GemmaEncoderConfig: + def __init__(self): + self.vocab_size = 256000 + self.hidden_size = 768 + self.intermediate_size = 2048 + self.num_hidden_layers = 12 + self.num_attention_heads = 12 + self.num_key_value_heads = 12 + self.head_dim = 64 + self.rms_norm_eps = 1e-6 + self.rms_norm_add = False + self.rope_theta = 10000.0 + self.attn_logit_softcapping = 50.0 + self.query_pre_attn_scalar = 64 + self.sliding_window = 4096 + self.mlp_activation = "gelu_pytorch_tanh" + self.layer_types = ["sliding_attention", "full_attention"] * 6 + self.qkv_bias = False + self.q_norm = None + self.k_norm = None + self.rms_norm_add = True + + +class T5GemmaAttention(LlamaAttention): + """Reuses LlamaAttention projection setup; overrides forward for softcap attention. + + T5Gemma applies tanh(QK^T * scale / cap) * cap between the matmul and softmax. + This nonlinearity is incompatible with fused SDPA kernels, so attention is + computed manually. Everything else (projections, RoPE, GQA expansion) is identical + to LlamaAttention so __init__ is inherited unchanged. 
+ """ + + def __init__(self, config, device=None, dtype=None, ops=None): + super().__init__(config, device=device, dtype=dtype, ops=ops) + self.scale = config.query_pre_attn_scalar ** -0.5 + self.softcap = config.attn_logit_softcapping + + def forward(self, hidden_states, attention_mask=None, freqs_cis=None, **kwargs): + B, S, _ = hidden_states.shape + xq = self.q_proj(hidden_states).view(B, S, self.num_heads, self.head_dim).transpose(1, 2) + xk = self.k_proj(hidden_states).view(B, S, self.num_kv_heads, self.head_dim).transpose(1, 2) + xv = self.v_proj(hidden_states).view(B, S, self.num_kv_heads, self.head_dim).transpose(1, 2) + xq, xk = apply_rope(xq, xk, freqs_cis) + xk = xk.repeat_interleave(self.num_heads // self.num_kv_heads, dim=1) + xv = xv.repeat_interleave(self.num_heads // self.num_kv_heads, dim=1) + attn = torch.matmul(xq * self.scale, xk.transpose(-2, -1)) + attn = torch.tanh(attn / self.softcap) * self.softcap + if attention_mask is not None: + attn = attn + attention_mask + attn = torch.nn.functional.softmax(attn.float(), dim=-1).to(xq.dtype) + out = torch.matmul(attn, xv).transpose(1, 2).reshape(B, S, self.inner_size) + return self.o_proj(out), None + + +class T5GemmaBlock(nn.Module): + def __init__(self, config, layer_type, device=None, dtype=None, ops=None): + super().__init__() + self.self_attn = T5GemmaAttention(config, device=device, dtype=dtype, ops=ops) + self.mlp = MLP(config, device=device, dtype=dtype, ops=ops) + # Names match checkpoint keys: model.encoder.layers.X..weight + self.pre_self_attn_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=True, device=device, dtype=dtype) + self.post_self_attn_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=True, device=device, dtype=dtype) + self.pre_feedforward_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=True, device=device, dtype=dtype) + self.post_feedforward_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=True, 
device=device, dtype=dtype) + self.is_sliding = (layer_type == "sliding_attention") + self.sliding_window = config.sliding_window + + def forward(self, x, attention_mask=None, freqs_cis=None): + attn_mask = attention_mask + if self.is_sliding and x.shape[1] > self.sliding_window: + S = x.shape[1] + pos = torch.arange(S, device=x.device) + dist = (pos.unsqueeze(0) - pos.unsqueeze(1)).abs() + sw_mask = torch.zeros(S, S, dtype=x.dtype, device=x.device) + sw_mask.masked_fill_(dist > self.sliding_window, -torch.finfo(x.dtype).max) + sw_mask = sw_mask.unsqueeze(0).unsqueeze(0) + attn_mask = (attention_mask + sw_mask) if attention_mask is not None else sw_mask + residual = x + x = self.pre_self_attn_layernorm(x) + x, _ = self.self_attn(x, attention_mask=attn_mask, freqs_cis=freqs_cis) + x = self.post_self_attn_layernorm(x) + x = residual + x + residual = x + x = self.pre_feedforward_layernorm(x) + x = self.mlp(x) + x = self.post_feedforward_layernorm(x) + x = residual + x + return x + + +class T5GemmaEncoder(nn.Module): + """Encoder stack: embed_tokens, layers, norm. 
+ Keys: embed_tokens.*, layers.X.*, norm.*""" + + def __init__(self, config, device, dtype, ops): + super().__init__() + self.config = config + # Gemma-style scaled embedding: output *= sqrt(hidden_size) + self.embed_tokens = _make_scaled_embedding( + ops, config.vocab_size, config.hidden_size, config.hidden_size ** 0.5, device, dtype) + self.layers = nn.ModuleList([ + T5GemmaBlock(config, config.layer_types[i], device=device, dtype=dtype, ops=ops) + for i in range(config.num_hidden_layers) + ]) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=True, device=device, dtype=dtype) + + def forward(self, input_ids, attention_mask=None, embeds=None, intermediate_output=None, + final_layer_norm_intermediate=True, dtype=None, num_layers=None): + x = embeds if embeds is not None else self.embed_tokens(input_ids, out_dtype=dtype or torch.float32) + seq_len = x.shape[1] + position_ids = torch.arange(seq_len, device=x.device).unsqueeze(0) + freqs_cis = precompute_freqs_cis(self.config.head_dim, position_ids, self.config.rope_theta, device=x.device) + mask = None + if attention_mask is not None: + mask = 1.0 - attention_mask.to(x.dtype).reshape( + (attention_mask.shape[0], 1, -1, attention_mask.shape[-1]) + ).expand(attention_mask.shape[0], 1, seq_len, attention_mask.shape[-1]) + mask = mask.masked_fill(mask.to(torch.bool), -torch.finfo(x.dtype).max) + intermediate = None + for i, layer in enumerate(self.layers): + x = layer(x, attention_mask=mask, freqs_cis=freqs_cis) + if i == intermediate_output: + intermediate = x.clone() + x = self.norm(x) + if intermediate is not None and final_layer_norm_intermediate: + intermediate = self.norm(intermediate) + return x, intermediate + + +class T5GemmaBody(nn.Module): + """Provides the 'encoder' sub-module. 
+ Keys: encoder.*""" + + def __init__(self, config, device, dtype, ops): + super().__init__() + self.encoder = T5GemmaEncoder(config, device, dtype, ops) + + +class T5GemmaModel(nn.Module): + """Top-level model class passed to SDClipModel as model_class. + Module layout: self.model.encoder.* → matches checkpoint keys model.encoder.*""" + + def __init__(self, config_dict, dtype, device, operations): + super().__init__() + config = T5GemmaEncoderConfig() + self.num_layers = config.num_hidden_layers + self.dtype = dtype + self.model = T5GemmaBody(config, device, dtype, operations) + + def get_input_embeddings(self): + return self.model.encoder.embed_tokens + + def set_input_embeddings(self, embeddings): + self.model.encoder.embed_tokens = embeddings + + def forward(self, input_ids, attention_mask=None, embeds=None, num_tokens=None, + intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, **kwargs): + if intermediate_output is not None and intermediate_output < 0: + intermediate_output = self.num_layers + intermediate_output + return self.model.encoder( + input_ids, attention_mask=attention_mask, embeds=embeds, + intermediate_output=intermediate_output, + final_layer_norm_intermediate=final_layer_norm_intermediate, + dtype=dtype, num_layers=self.num_layers) + + +class T5GemmaSDClipModel(sd1_clip.SDClipModel): + def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, model_options={}): + super().__init__(device=device, layer=layer, layer_idx=layer_idx, + textmodel_json_config={}, dtype=dtype, + special_tokens={"pad": 0}, + model_class=T5GemmaModel, + enable_attention_masks=True, zero_out_masked=True, + model_options=model_options) + + +class T5GemmaSDTokenizer(sd1_clip.SDTokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + tokenizer_model = tokenizer_data.get("spiece_model", None) + super().__init__(tokenizer_model, pad_with_end=False, embedding_size=768, + embedding_key="t5gemma", 
tokenizer_class=SPieceTokenizer, + has_start_token=False, has_end_token=False, pad_to_max_length=False, + max_length=99999999, min_length=1, pad_token=0, + tokenizer_data=tokenizer_data, + tokenizer_args={"add_bos": False, "add_eos": False}) + + def state_dict(self): + return {"spiece_model": self.tokenizer.serialize_model()} + + +class SAT5GemmaTokenizer(sd1_clip.SD1Tokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + super().__init__(embedding_directory=embedding_directory, + tokenizer_data=tokenizer_data, clip_name="t5gemma", tokenizer=T5GemmaSDTokenizer) + + +class SAT5GemmaModel(sd1_clip.SD1ClipModel): + def __init__(self, device="cpu", dtype=None, model_options={}, **kwargs): + super().__init__(device=device, dtype=dtype, model_options=model_options, + name="t5gemma", clip_model=T5GemmaSDClipModel, **kwargs) diff --git a/comfy/utils.py b/comfy/utils.py index 66682690a..49ae12b06 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -86,6 +86,7 @@ def load_safetensors(ckpt): import comfy_aimdo.model_mmap f = open(ckpt, "rb", buffering=0) + file_lock = threading.Lock() model_mmap = comfy_aimdo.model_mmap.ModelMMAP(ckpt) file_size = os.path.getsize(ckpt) mv = memoryview((ctypes.c_uint8 * file_size).from_address(model_mmap.get())) @@ -111,9 +112,8 @@ def load_safetensors(ckpt): storage = tensor.untyped_storage() setattr(storage, "_comfy_tensor_file_slice", - comfy.memory_management.TensorFileSlice(f, threading.get_ident(), data_base_offset + start, end - start)) + comfy.memory_management.TensorFileSlice(f, file_lock, data_base_offset + start, end - start)) setattr(storage, "_comfy_tensor_mmap_refs", (model_mmap, mv)) - setattr(storage, "_comfy_tensor_mmap_touched", False) sd[name] = tensor return sd, header.get("__metadata__", {}), @@ -1020,10 +1020,11 @@ def bislerp(samples, width, height): def lanczos(samples, width, height): #the below API is strict and expects grayscale to be squeezed - samples = samples.squeeze(1) if 
samples.shape[1] == 1 else samples.movedim(1, -1) + if samples.ndim == 4: + samples = samples.squeeze(1) if samples.shape[1] == 1 else samples.movedim(1, -1) images = [Image.fromarray(np.clip(255. * image.cpu().numpy(), 0, 255).astype(np.uint8)) for image in samples] images = [image.resize((width, height), resample=Image.Resampling.LANCZOS) for image in images] - images = [torch.from_numpy(np.array(image).astype(np.float32) / 255.0).movedim(-1, 0) for image in images] + images = [torch.from_numpy(t).movedim(-1, 0) if (t := np.array(image).astype(np.float32) / 255.0).ndim == 3 else torch.from_numpy(t) for image in images] result = torch.stack(images) return result.to(samples.device, samples.dtype) @@ -1451,4 +1452,3 @@ def deepcopy_list_dict(obj, memo=None): memo[obj_id] = res return res - diff --git a/comfy/windows.py b/comfy/windows.py deleted file mode 100644 index 213dc481d..000000000 --- a/comfy/windows.py +++ /dev/null @@ -1,52 +0,0 @@ -import ctypes -import logging -import psutil -from ctypes import wintypes - -import comfy_aimdo.control - -psapi = ctypes.WinDLL("psapi") -kernel32 = ctypes.WinDLL("kernel32") - -class PERFORMANCE_INFORMATION(ctypes.Structure): - _fields_ = [ - ("cb", wintypes.DWORD), - ("CommitTotal", ctypes.c_size_t), - ("CommitLimit", ctypes.c_size_t), - ("CommitPeak", ctypes.c_size_t), - ("PhysicalTotal", ctypes.c_size_t), - ("PhysicalAvailable", ctypes.c_size_t), - ("SystemCache", ctypes.c_size_t), - ("KernelTotal", ctypes.c_size_t), - ("KernelPaged", ctypes.c_size_t), - ("KernelNonpaged", ctypes.c_size_t), - ("PageSize", ctypes.c_size_t), - ("HandleCount", wintypes.DWORD), - ("ProcessCount", wintypes.DWORD), - ("ThreadCount", wintypes.DWORD), - ] - -def get_free_ram(): - #Windows is way too conservative and chalks recently used uncommitted model RAM - #as "in-use". 
So, calculate free RAM for the sake of general use as the greater of: - # - #1: What psutil says - #2: Total Memory - (Committed Memory - VRAM in use) - # - #We have to subtract VRAM in use from the comitted memory as WDDM creates a naked - #commit charge for all VRAM used just incase it wants to page it all out. This just - #isn't realistic so "overcommit" on our calculations by just subtracting it off. - - pi = PERFORMANCE_INFORMATION() - pi.cb = ctypes.sizeof(pi) - - if not psapi.GetPerformanceInfo(ctypes.byref(pi), pi.cb): - logging.warning("WARNING: Failed to query windows performance info. RAM usage may be sub optimal") - return psutil.virtual_memory().available - - committed = pi.CommitTotal * pi.PageSize - total = pi.PhysicalTotal * pi.PageSize - - return max(psutil.virtual_memory().available, - total - (committed - comfy_aimdo.control.get_total_vram_usage())) - diff --git a/comfy_api/latest/__init__.py b/comfy_api/latest/__init__.py index 04973fea0..e0a585b10 100644 --- a/comfy_api/latest/__init__.py +++ b/comfy_api/latest/__init__.py @@ -1,5 +1,3 @@ -from __future__ import annotations - from abc import ABC, abstractmethod from typing import TYPE_CHECKING from comfy_api.internal import ComfyAPIBase diff --git a/comfy_api/latest/_input_impl/video_types.py b/comfy_api/latest/_input_impl/video_types.py index 942278d88..99e67d363 100644 --- a/comfy_api/latest/_input_impl/video_types.py +++ b/comfy_api/latest/_input_impl/video_types.py @@ -1,4 +1,3 @@ -from __future__ import annotations from av.container import InputContainer from av.subtitles.stream import SubtitleStream from fractions import Fraction diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py index 5ed968960..fed8dc7f0 100644 --- a/comfy_api/latest/_io.py +++ b/comfy_api/latest/_io.py @@ -762,10 +762,17 @@ class Accumulation(ComfyTypeIO): @comfytype(io_type="LOAD3D_CAMERA") class Load3DCamera(ComfyTypeIO): class CameraInfo(TypedDict): - position: dict[str, float | int] - target: dict[str, 
float | int] - zoom: int - cameraType: str + # Coordinate system: right-handed, Y-up, camera looks down -Z + position: dict[str, float | int] # scene units + target: dict[str, float | int] # scene units; OrbitControls focus point + zoom: float | int # dimensionless, 1 = 100% + cameraType: str # 'perspective' | 'orthographic' + quaternion: NotRequired[dict[str, float | int]] # normalized, dimensionless; camera world rotation + fov: NotRequired[float | int] # degrees, vertical FOV (perspective only) + aspect: NotRequired[float | int] # width / height (perspective only) + near: NotRequired[float | int] # scene units + far: NotRequired[float | int] # scene units + frustum: NotRequired[dict[str, float | int]] # orthographic only: {left, right, top, bottom} in scene units Type = CameraInfo diff --git a/comfy_api/latest/_util/video_types.py b/comfy_api/latest/_util/video_types.py index c92477f08..6c9d6a526 100644 --- a/comfy_api/latest/_util/video_types.py +++ b/comfy_api/latest/_util/video_types.py @@ -1,4 +1,3 @@ -from __future__ import annotations from dataclasses import dataclass from enum import Enum from fractions import Fraction diff --git a/comfy_api_nodes/apis/__init__.py b/comfy_api_nodes/apis/__init__.py index 46a583b5e..9c4cfb9b6 100644 --- a/comfy_api_nodes/apis/__init__.py +++ b/comfy_api_nodes/apis/__init__.py @@ -3,7 +3,6 @@ # timestamp: 2025-07-30T08:54:00+00:00 # pylint: disable -from __future__ import annotations from datetime import date, datetime from enum import Enum diff --git a/comfy_api_nodes/apis/anthropic.py b/comfy_api_nodes/apis/anthropic.py index 6cac537ea..46a5bb428 100644 --- a/comfy_api_nodes/apis/anthropic.py +++ b/comfy_api_nodes/apis/anthropic.py @@ -35,6 +35,19 @@ class AnthropicMessage(BaseModel): content: list[AnthropicTextContent | AnthropicImageContent] = Field(...) +class AnthropicThinkingConfig(BaseModel): + type: Literal["enabled", "disabled", "adaptive"] = Field(...) 
+ budget_tokens: int | None = Field( + None, ge=1024, + description="Reasoning budget in tokens. Used when type is 'enabled'. Must be less than max_tokens.", + ) + + +class AnthropicOutputConfig(BaseModel): + """Used with `thinking.type='adaptive'` on models like Opus 4.7.""" + effort: Literal["low", "medium", "high"] | None = Field(None) + + class AnthropicMessagesRequest(BaseModel): model: str = Field(...) messages: list[AnthropicMessage] = Field(...) @@ -44,6 +57,8 @@ class AnthropicMessagesRequest(BaseModel): top_p: float | None = Field(None, ge=0.0, le=1.0) top_k: int | None = Field(None, ge=0) stop_sequences: list[str] | None = Field(None) + thinking: AnthropicThinkingConfig | None = Field(None) + output_config: AnthropicOutputConfig | None = Field(None) class AnthropicResponseTextBlock(BaseModel): @@ -51,6 +66,14 @@ class AnthropicResponseTextBlock(BaseModel): text: str = Field(...) +class AnthropicResponseThinkingBlock(BaseModel): + type: Literal["thinking"] = "thinking" + thinking: str = Field(...) + + +AnthropicResponseBlock = AnthropicResponseTextBlock | AnthropicResponseThinkingBlock + + class AnthropicCacheCreationUsage(BaseModel): ephemeral_5m_input_tokens: int | None = Field(None) ephemeral_1h_input_tokens: int | None = Field(None) @@ -69,7 +92,7 @@ class AnthropicMessagesResponse(BaseModel): type: str | None = Field(None) role: str | None = Field(None) model: str | None = Field(None) - content: list[AnthropicResponseTextBlock] | None = Field(None) + content: list[AnthropicResponseBlock] | None = Field(None) stop_reason: str | None = Field(None) stop_sequence: str | None = Field(None) usage: AnthropicMessagesUsage | None = Field(None) diff --git a/comfy_api_nodes/apis/beeble.py b/comfy_api_nodes/apis/beeble.py new file mode 100644 index 000000000..90175b214 --- /dev/null +++ b/comfy_api_nodes/apis/beeble.py @@ -0,0 +1,32 @@ +from pydantic import BaseModel, Field + + +class CreateSwitchXRequest(BaseModel): + generation_type: str = Field(...) 
+ source_uri: str = Field(...) + alpha_mode: str = Field(...) + prompt: str | None = Field(None, max_length=2000) + reference_image_uri: str | None = Field(None) + alpha_uri: str | None = Field(None) + max_resolution: int = Field(1080) + callback_url: str | None = Field(None) + idempotency_key: str | None = Field(None, max_length=256, min_length=1) + + +class SwitchXOutputUrls(BaseModel): + render: str | None = Field(None) + source: str | None = Field(None) + alpha: str | None = Field(None) + + +class SwitchXStatusResponse(BaseModel): + id: str = Field(...) + status: str = Field(...) + progress: int | None = Field(None) + generation_type: str | None = Field(None) + alpha_mode: str | None = Field(None) + output: SwitchXOutputUrls | None = Field(None) + error: str | None = Field(None) + created_at: str | None = Field(None) + modified_at: str | None = Field(None) + completed_at: str | None = Field(None) diff --git a/comfy_api_nodes/apis/bfl.py b/comfy_api_nodes/apis/bfl.py index d8d3557b3..f0665fa09 100644 --- a/comfy_api_nodes/apis/bfl.py +++ b/comfy_api_nodes/apis/bfl.py @@ -1,5 +1,3 @@ -from __future__ import annotations - from enum import Enum from typing import Any, Dict, Optional diff --git a/comfy_api_nodes/apis/bytedance.py b/comfy_api_nodes/apis/bytedance.py index 03f4c445b..47f24586c 100644 --- a/comfy_api_nodes/apis/bytedance.py +++ b/comfy_api_nodes/apis/bytedance.py @@ -158,8 +158,9 @@ class SeedanceCreateAssetResponse(BaseModel): class SeedanceVirtualLibraryCreateAssetRequest(BaseModel): - url: str = Field(..., description="Publicly accessible URL of the image asset to upload.") + url: str = Field(..., description="Publicly accessible URL of the asset to upload.") hash: str = Field(..., description="Dedup key. Re-submitting the same hash returns the existing asset id.") + asset_type: str | None = Field(None, description="BytePlus asset type. Defaults to Image server-side when omitted.") # Dollars per 1K tokens, keyed by (model_id, has_video_input). 
diff --git a/comfy_api_nodes/apis/krea.py b/comfy_api_nodes/apis/krea.py new file mode 100644 index 000000000..6e294a3b7 --- /dev/null +++ b/comfy_api_nodes/apis/krea.py @@ -0,0 +1,46 @@ +"""Pydantic models for the Krea image-generation API.""" + +from pydantic import BaseModel, Field + + +class KreaMoodboard(BaseModel): + id: str = Field(...) + strength: float = Field(default=0.35, ge=-0.5, le=1.5) + + +class KreaImageStyleReference(BaseModel): + strength: float = Field(..., ge=-2.0, le=2.0) + url: str | None = Field(default=None) + + +class KreaGenerateImageRequest(BaseModel): + prompt: str = Field(...) + aspect_ratio: str = Field(...) + resolution: str = Field(...) + seed: int | None = Field(default=None) + creativity: str = Field(default="medium") + moodboards: list[KreaMoodboard] | None = Field(default=None) + image_style_references: list[KreaImageStyleReference] | None = Field(default=None) + + +class KreaJobResult(BaseModel): + urls: list[str] | None = Field(default=None) + style_id: str | None = Field(default=None) + + +class KreaJob(BaseModel): + job_id: str = Field(...) + status: str = Field(...) + created_at: str = Field(...) + completed_at: str | None = Field(default=None) + result: KreaJobResult | None = Field(default=None) + + +class KreaAssetResponse(BaseModel): + id: str = Field(...) + image_url: str = Field(...) + uploaded_at: str = Field(...) + width: float | None = Field(default=None) + height: float | None = Field(default=None) + size_bytes: float | None = Field(default=None) + mime_type: str | None = Field(default=None) diff --git a/comfy_api_nodes/apis/openrouter.py b/comfy_api_nodes/apis/openrouter.py new file mode 100644 index 000000000..e30d9bcfb --- /dev/null +++ b/comfy_api_nodes/apis/openrouter.py @@ -0,0 +1,93 @@ +"""Pydantic models for the OpenRouter chat completions API. 
+ +See: https://openrouter.ai/docs/api/api-reference/chat/send-chat-completion-request +""" + +from typing import Literal + +from pydantic import BaseModel, Field + + +class OpenRouterTextContent(BaseModel): + type: Literal["text"] = "text" + text: str = Field(...) + + +class OpenRouterImageUrl(BaseModel): + url: str = Field(...) + + +class OpenRouterImageContent(BaseModel): + type: Literal["image_url"] = "image_url" + image_url: OpenRouterImageUrl = Field(...) + + +class OpenRouterVideoUrl(BaseModel): + url: str = Field(...) + + +class OpenRouterVideoContent(BaseModel): + type: Literal["video_url"] = "video_url" + video_url: OpenRouterVideoUrl = Field(...) + + +OpenRouterContentBlock = OpenRouterTextContent | OpenRouterImageContent | OpenRouterVideoContent + + +class OpenRouterMessage(BaseModel): + role: Literal["system", "user", "assistant"] = Field(...) + content: str | list[OpenRouterContentBlock] = Field(...) + + +class OpenRouterReasoningConfig(BaseModel): + effort: str | None = Field(None) + exclude: bool | None = Field(None, description="If true, model reasons but reasoning is excluded from response.") + + +class OpenRouterWebSearchOptions(BaseModel): + search_context_size: str | None = Field(None) + + +class OpenRouterChatRequest(BaseModel): + model: str = Field(...) + messages: list[OpenRouterMessage] = Field(...) 
+ seed: int | None = Field(None) + reasoning: OpenRouterReasoningConfig | None = Field(None) + web_search_options: OpenRouterWebSearchOptions | None = Field(None) + stream: bool = Field(False) + + +class OpenRouterUsage(BaseModel): + prompt_tokens: int | None = Field(None) + completion_tokens: int | None = Field(None) + total_tokens: int | None = Field(None) + cost: float | None = Field(None, description="Server-side authoritative USD cost of the call.") + + +class OpenRouterResponseMessage(BaseModel): + role: str | None = Field(None) + content: str | None = Field(None) + reasoning: str | None = Field(None) + refusal: str | None = Field(None) + + +class OpenRouterChoice(BaseModel): + index: int | None = Field(None) + message: OpenRouterResponseMessage | None = Field(None) + finish_reason: str | None = Field(None) + + +class OpenRouterError(BaseModel): + code: int | str | None = Field(None) + message: str | None = Field(None) + metadata: dict | None = Field(None) + + +class OpenRouterChatResponse(BaseModel): + id: str | None = Field(None) + model: str | None = Field(None) + object: str | None = Field(None) + provider: str | None = Field(None) + choices: list[OpenRouterChoice] | None = Field(None) + usage: OpenRouterUsage | None = Field(None) + error: OpenRouterError | None = Field(None) diff --git a/comfy_api_nodes/apis/rodin.py b/comfy_api_nodes/apis/rodin.py index fc26a6e73..24524d642 100644 --- a/comfy_api_nodes/apis/rodin.py +++ b/comfy_api_nodes/apis/rodin.py @@ -1,7 +1,5 @@ -from __future__ import annotations - from enum import Enum -from typing import Optional, List + from pydantic import BaseModel, Field @@ -11,44 +9,76 @@ class Rodin3DGenerateRequest(BaseModel): material: str = Field(..., description="The material type.") quality_override: int = Field(..., description="The poly count of the mesh.") mesh_mode: str = Field(..., description="It controls the type of faces of generated models.") - TAPose: Optional[bool] = Field(None, description="") + TAPose: 
bool | None = Field(None, description="") + + +class Rodin3DGen25Request(BaseModel): + + tier: str = Field(..., description="Gen-2.5 tier (e.g. Gen-2.5-High).") + prompt: str | None = Field(None, description="Required for Text-to-3D; ignored otherwise.") + seed: int | None = Field(None, description="0-65535.") + material: str | None = Field(None, description="PBR | Shaded | All | None.") + geometry_file_format: str | None = Field(None, description="glb | usdz | fbx | obj | stl.") + texture_mode: str | None = Field(None, description="legacy | extreme-low | low | medium | high.") + mesh_mode: str | None = Field(None, description="Raw (triangular) | Quad.") + quality_override: int | None = Field(None, description="Mesh face count override.") + geometry_instruct_mode: str | None = Field(None, description="faithful | creative.") + bbox_condition: list[int] | None = Field(None, description="Bounding box [Width(Y), Height(Z), Length(X)] in cm.") + height: int | None = Field(None, description="Approximate model height in cm.") + TAPose: bool | None = Field(None, description="T/A pose for human-like models.") + hd_texture: bool | None = Field(None, description="Enhanced texture quality.") + texture_delight: bool | None = Field(None, description="Remove baked lighting from textures.") + is_micro: bool | None = Field(None, description="Micro detail (Extreme-High only).") + use_original_alpha: bool | None = Field(None, description="Preserve image transparency.") + preview_render: bool | None = Field(None, description="Generate high-quality preview render.") + addons: list[str] | None = Field(None, description='Optional addons, e.g. 
["HighPack"].') + class GenerateJobsData(BaseModel): - uuids: List[str] = Field(..., description="str LIST") + uuids: list[str] = Field(..., description="str LIST") subscription_key: str = Field(..., description="subscription key") + class Rodin3DGenerateResponse(BaseModel): - message: Optional[str] = Field(None, description="Return message.") - prompt: Optional[str] = Field(None, description="Generated Prompt from image.") - submit_time: Optional[str] = Field(None, description="Submit Time") - uuid: Optional[str] = Field(None, description="Task str") - jobs: Optional[GenerateJobsData] = Field(None, description="Details of jobs") + message: str | None = Field(None, description="Return message.") + prompt: str | None = Field(None, description="Generated Prompt from image.") + submit_time: str | None = Field(None, description="Submit Time") + uuid: str | None = Field(None, description="Task str") + jobs: GenerateJobsData | None = Field(None, description="Details of jobs") + class JobStatus(str, Enum): """ Status for jobs """ + Done = "Done" Failed = "Failed" Generating = "Generating" Waiting = "Waiting" + class Rodin3DCheckStatusRequest(BaseModel): subscription_key: str = Field(..., description="subscription from generate endpoint") + class JobItem(BaseModel): uuid: str = Field(..., description="uuid") - status: JobStatus = Field(...,description="Status Currently") + status: JobStatus = Field(..., description="Status Currently") + class Rodin3DCheckStatusResponse(BaseModel): - jobs: List[JobItem] = Field(..., description="Job status List") + jobs: list[JobItem] = Field(..., description="Job status List") + class Rodin3DDownloadRequest(BaseModel): task_uuid: str = Field(..., description="Task str") + class RodinResourceItem(BaseModel): url: str = Field(..., description="Download Url") name: str = Field(..., description="File name with ext") + class Rodin3DDownloadResponse(BaseModel): - list: List[RodinResourceItem] = Field(..., description="Source List") + items: 
list[RodinResourceItem] = Field(..., alias="list", description="Source List") diff --git a/comfy_api_nodes/apis/stability.py b/comfy_api_nodes/apis/stability.py index 718360187..5b9b5ac7d 100644 --- a/comfy_api_nodes/apis/stability.py +++ b/comfy_api_nodes/apis/stability.py @@ -1,5 +1,3 @@ -from __future__ import annotations - from enum import Enum from typing import Optional diff --git a/comfy_api_nodes/nodes_anthropic.py b/comfy_api_nodes/nodes_anthropic.py index 28dd70d4e..7805c96ce 100644 --- a/comfy_api_nodes/nodes_anthropic.py +++ b/comfy_api_nodes/nodes_anthropic.py @@ -9,8 +9,11 @@ from comfy_api_nodes.apis.anthropic import ( AnthropicMessage, AnthropicMessagesRequest, AnthropicMessagesResponse, + AnthropicOutputConfig, + AnthropicResponseTextBlock, AnthropicRole, AnthropicTextContent, + AnthropicThinkingConfig, ) from comfy_api_nodes.util import ( ApiEndpoint, @@ -32,15 +35,29 @@ CLAUDE_MODELS: dict[str, str] = { "Haiku 4.5": "claude-haiku-4-5-20251001", } +_THINKING_UNSUPPORTED = {"Haiku 4.5"} +# Models that use the newer "adaptive" thinking mode (Opus 4.7 requires it; older models keep the explicit budget API). +# Anthropic decides the actual budget when adaptive is used, based on the `output_config.effort` hint. +_ADAPTIVE_THINKING_MODELS = {"Opus 4.7", "Opus 4.6", "Sonnet 4.6"} -def _claude_model_inputs(): - return [ +# Budget mode (Sonnet 4.5): effort -> reasoning budget in tokens. Must be < max_tokens. +# Sized so even the "high" budget fits comfortably under the default max_tokens=32768. 
+_REASONING_BUDGET: dict[str, int] = { + "low": 2048, + "medium": 8192, + "high": 16384, +} +_REASONING_EFFORTS = ["off", "low", "medium", "high"] + + +def _claude_model_inputs(model_label: str): + inputs: list = [ IO.Int.Input( "max_tokens", - default=16000, - min=32, - max=32000, - tooltip="Maximum number of tokens to generate before stopping.", + default=32768, + min=4096, + max=64000, + tooltip="Maximum number of tokens to generate (includes reasoning tokens when enabled).", advanced=True, ), IO.Float.Input( @@ -49,10 +66,24 @@ def _claude_model_inputs(): min=0.0, max=1.0, step=0.01, - tooltip="Controls randomness. 0.0 is deterministic, 1.0 is most random. Ignored for Opus 4.7.", + tooltip=( + "Controls randomness. 0.0 is deterministic, 1.0 is most random. " + "Ignored for Opus 4.7 and any model when reasoning_effort is set." + ), advanced=True, ), ] + if model_label not in _THINKING_UNSUPPORTED: + inputs.append( + IO.Combo.Input( + "reasoning_effort", + options=_REASONING_EFFORTS, + default="off", + tooltip="Extended thinking effort. 'off' disables reasoning.", + advanced=True, + ) + ) + return inputs def _model_price_per_million(model: str) -> tuple[float, float] | None: @@ -95,7 +126,11 @@ def calculate_tokens_price(response: AnthropicMessagesResponse) -> float | None: def _get_text_from_response(response: AnthropicMessagesResponse) -> str: if not response.content: return "" - return "\n".join(block.text for block in response.content if block.text) + # Thinking blocks are silently dropped — we never want reasoning in the output. 
+ return "\n".join( + block.text for block in response.content + if isinstance(block, AnthropicResponseTextBlock) and block.text + ) async def _build_image_content_blocks( @@ -120,7 +155,7 @@ class ClaudeNode(IO.ComfyNode): return IO.Schema( node_id="ClaudeNode", display_name="Anthropic Claude", - category="api node/text/Anthropic", + category="text/partner/Anthropic", essentials_category="Text Generation", description="Generate text responses with Anthropic's Claude models. " "Provide a text prompt and optionally one or more images for multimodal context.", @@ -133,7 +168,10 @@ class ClaudeNode(IO.ComfyNode): ), IO.DynamicCombo.Input( "model", - options=[IO.DynamicCombo.Option(label, _claude_model_inputs()) for label in CLAUDE_MODELS], + options=[ + IO.DynamicCombo.Option(label, _claude_model_inputs(label)) + for label in CLAUDE_MODELS + ], tooltip="The Claude model used to generate the response.", ), IO.Int.Input( @@ -207,8 +245,29 @@ class ClaudeNode(IO.ComfyNode): ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=True, min_length=1) model_label = model["model"] - max_tokens = model["max_tokens"] - temperature = None if model_label == "Opus 4.7" else model["temperature"] + max_tokens = model.get("max_tokens", 32768) + reasoning_effort = model.get("reasoning_effort", "off") + thinking_enabled = reasoning_effort not in ("off", None) and model_label not in _THINKING_UNSUPPORTED + + # Anthropic requires temperature to be unset (defaults to 1.0) when thinking is enabled. + # Opus 4.7 also rejects user-supplied temperature. 
+ if thinking_enabled or model_label == "Opus 4.7": + temperature = None + else: + temperature = model.get("temperature", 1.0) + + thinking_cfg: AnthropicThinkingConfig | None = None + output_cfg: AnthropicOutputConfig | None = None + if thinking_enabled: + if model_label in _ADAPTIVE_THINKING_MODELS: + # Adaptive mode - Anthropic chooses the budget based on effort hint + thinking_cfg = AnthropicThinkingConfig(type="adaptive") + output_cfg = AnthropicOutputConfig(effort=reasoning_effort) + else: + # Budget mode (Sonnet 4.5). Leave at least 1024 tokens for the actual response + budget = _REASONING_BUDGET[reasoning_effort] + budget = min(budget, max(1024, max_tokens - 1024)) + thinking_cfg = AnthropicThinkingConfig(type="enabled", budget_tokens=budget) image_tensors: list[Input.Image] = [t for t in (images or {}).values() if t is not None] if sum(get_number_of_images(t) for t in image_tensors) > CLAUDE_MAX_IMAGES: @@ -229,6 +288,8 @@ class ClaudeNode(IO.ComfyNode): messages=[AnthropicMessage(role=AnthropicRole.user, content=content)], system=system_prompt or None, temperature=temperature, + thinking=thinking_cfg, + output_config=output_cfg, ), price_extractor=calculate_tokens_price, ) diff --git a/comfy_api_nodes/nodes_beeble.py b/comfy_api_nodes/nodes_beeble.py new file mode 100644 index 000000000..f1082884c --- /dev/null +++ b/comfy_api_nodes/nodes_beeble.py @@ -0,0 +1,404 @@ +from fractions import Fraction + +from typing_extensions import override + +from comfy_api.latest import IO, ComfyExtension, Input, InputImpl, Types +from comfy_api_nodes.apis.beeble import ( + CreateSwitchXRequest, + SwitchXStatusResponse, +) +from comfy_api_nodes.util import ( + ApiEndpoint, + bytesio_to_image_tensor, + convert_mask_to_image, + download_url_as_bytesio, + download_url_to_image_tensor, + download_url_to_video_output, + downscale_image_tensor, + downscale_video_to_max_pixels, + poll_op, + sync_op, + upload_image_to_comfyapi, + upload_video_to_comfyapi, + validate_string, + 
validate_video_frame_count, +) + +_MAX_PIXELS = 2_770_000 +_MAX_FRAMES = 240 +_MAX_PROMPT_LEN = 2000 + + +def _validate_inputs(prompt: str | None, reference_image: Input.Image | None) -> str | None: + """Beeble requires at least one of prompt or reference_image. Returns the cleaned prompt.""" + cleaned = prompt.strip() if prompt else "" + if not cleaned and reference_image is None: + raise ValueError("At least one of 'prompt' or 'reference_image' must be provided.") + if cleaned: + validate_string(cleaned, strip_whitespace=False, max_length=_MAX_PROMPT_LEN) + return cleaned or None + + +async def _upload_mask_as_image( + cls: type[IO.ComfyNode], + mask: Input.Image, + *, + wait_label: str, +) -> str: + """Encode a single-frame MASK (H, W) or (1, H, W) as a PNG and upload.""" + if mask.dim() == 2: + mask = mask.unsqueeze(0) + image = convert_mask_to_image(mask[:1]) + return await upload_image_to_comfyapi( + cls, + image, + mime_type="image/png", + wait_label=wait_label, + total_pixels=_MAX_PIXELS, + ) + + +async def _upload_mask_batch_as_video( + cls: type[IO.ComfyNode], + mask: Input.Image, + *, + frame_rate: Fraction, + source_frame_count: int, + wait_label: str, +) -> str: + """Encode a MASK batch (N, H, W) as a grayscale H.264 MP4 at frame_rate and upload. + + The matte is always downscaled to the pixel budget so it stays within Beeble's limit and + keeps the same dimensions as the (similarly downscaled) source — both use the same algorithm + from the same starting dimensions, and downscaling is a no-op when already within budget. + """ + if mask.dim() == 2: + mask = mask.unsqueeze(0) + if mask.shape[0] != source_frame_count: + raise ValueError( + f"Custom alpha video frame count ({mask.shape[0]}) does not match the " + f"source video frame count ({source_frame_count}). The Beeble API requires " + "one mask per source frame." 
+ ) + images = downscale_image_tensor(convert_mask_to_image(mask), _MAX_PIXELS) + alpha_video = InputImpl.VideoFromComponents(Types.VideoComponents(images=images, audio=None, frame_rate=frame_rate)) + return await upload_video_to_comfyapi(cls, alpha_video, wait_label=wait_label) + + +def _alpha_mode_input(*, video: bool) -> IO.DynamicCombo.Input: + """Build the alpha_mode DynamicCombo with mode-specific extra inputs.""" + select_keyframe_tooltip = ( + "First-frame keyframe mask. Beeble propagates this across the video." if video else "Grayscale keyframe mask." + ) + custom_tooltip = ( + "Per-frame grayscale mask covering the entire video. " + "Must have the same frame count as the source. " + "Connect a MASK output from SAM3_TrackToMask or similar." + if video + else "Grayscale mask to apply." + ) + return IO.DynamicCombo.Input( + "alpha_mode", + tooltip=( + "Controls how SwitchX decides what to keep vs. regenerate. " + "'auto' isolates the main subject automatically. " + "'fill' regenerates the entire frame while preserving geometry. " + "'select' propagates a first-frame keyframe across the clip. " + "'custom' uses a per-frame alpha matte you provide." + ), + options=[ + IO.DynamicCombo.Option("auto", []), + IO.DynamicCombo.Option("fill", []), + IO.DynamicCombo.Option( + "select", + [IO.Mask.Input("alpha_keyframe", tooltip=select_keyframe_tooltip)], + ), + IO.DynamicCombo.Option( + "custom", + [IO.Mask.Input("alpha_mask", tooltip=custom_tooltip)], + ), + ], + ) + + +def _common_inputs(*, source: IO.Input, video: bool) -> list[IO.Input]: + return [ + source, + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip=( + "Text description of the desired output (max 2000 chars). " + "At least one of 'prompt' or 'reference_image' is required." + ), + ), + IO.Image.Input( + "reference_image", + optional=True, + tooltip=( + "Reference image whose look (background, lighting, costume) the result " + "should adopt. 
At least one of 'reference_image' or 'prompt' is required." + ), + ), + _alpha_mode_input(video=video), + IO.Combo.Input( + "max_resolution", + options=["1080p", "720p"], + default="1080p", + tooltip="Maximum output resolution.", + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + control_after_generate=True, + tooltip=( + "Seed controls whether the node should re-run; " "results are non-deterministic regardless of seed." + ), + ), + ] + + +async def _submit_and_poll( + cls: type[IO.ComfyNode], + request: CreateSwitchXRequest, +) -> SwitchXStatusResponse: + initial = await sync_op( + cls, + ApiEndpoint(path="/proxy/beeble/v1/switchx/generations", method="POST"), + response_model=SwitchXStatusResponse, + data=request, + ) + return await poll_op( + cls, + ApiEndpoint(path=f"/proxy/beeble/v1/switchx/generations/{initial.id}"), + response_model=SwitchXStatusResponse, + status_extractor=lambda r: r.status, + progress_extractor=lambda r: r.progress, + ) + + +def _require_output_url(response: SwitchXStatusResponse, name: str) -> str: + if response.output is None or getattr(response.output, name) is None: + raise RuntimeError(f"Beeble job {response.id} completed without a {name!r} output URL.") + return getattr(response.output, name) + + +def _alpha_url(response: SwitchXStatusResponse, mode: str) -> str | None: + """URL of the alpha matte, or None when the mode produces no separate matte. + + 'fill' selects the whole frame, so Beeble writes no alpha asset even though the status + response still returns a (dangling) signed URL for it — fetching it 403s with S3 + AccessDenied. The other three modes ('auto', 'custom', 'select') all produce a real, + downloadable matte. 
+ """ + if mode == "fill" or response.output is None: + return None + return response.output.alpha + + +class BeebleSwitchXVideoEdit(IO.ComfyNode): + + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="BeebleSwitchXVideoEdit", + display_name="Beeble SwitchX Video Edit", + category="video/partner/Beeble", + description=( + "Edit a video with Beeble SwitchX. Switches anything in the scene (background, " + "lighting, costume) while preserving the original subject's pixels and motion. " + "Provide a reference image and/or text prompt to describe the new look. " + "Max 240 frames, max ~2.77MP per frame." + ), + inputs=_common_inputs(source=IO.Video.Input("video"), video=True), + outputs=[ + IO.Video.Output(display_name="video"), + IO.Video.Output( + display_name="alpha", + tooltip="The alpha matte Beeble used. Empty for 'fill' mode, which has no separate matte.", + ), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends(widgets=["max_resolution"]), + expr=""" + ( + $rate := widgets.max_resolution = "1080p" ? 
0.429 : 0.143; + {"type":"usd","usd": $rate, "format":{"suffix":"/30 frames"}} + ) + """, + ), + ) + + @classmethod + async def execute( + cls, + video: Input.Video, + prompt: str, + alpha_mode: dict, + max_resolution: str, + seed: int, + reference_image: Input.Image | None = None, + ) -> IO.NodeOutput: + cleaned_prompt = _validate_inputs(prompt, reference_image) + + validate_video_frame_count(video, max_frame_count=_MAX_FRAMES) + video = downscale_video_to_max_pixels(video, _MAX_PIXELS) + + mode = alpha_mode["alpha_mode"] + alpha_uri: str | None = None + if mode == "select": + alpha_uri = await _upload_mask_as_image(cls, alpha_mode["alpha_keyframe"], wait_label="Uploading keyframe") + elif mode == "custom": + alpha_uri = await _upload_mask_batch_as_video( + cls, + alpha_mode["alpha_mask"], + frame_rate=video.get_frame_rate(), + source_frame_count=video.get_frame_count(), + wait_label="Uploading alpha video", + ) + + source_uri = await upload_video_to_comfyapi(cls, video, wait_label="Uploading source") + reference_uri: str | None = None + if reference_image is not None: + reference_uri = await upload_image_to_comfyapi( + cls, + reference_image, + mime_type="image/png", + wait_label="Uploading reference", + total_pixels=_MAX_PIXELS, + ) + + request = CreateSwitchXRequest( + generation_type="video", + source_uri=source_uri, + alpha_mode=mode, + prompt=cleaned_prompt, + reference_image_uri=reference_uri, + alpha_uri=alpha_uri, + max_resolution=1080 if max_resolution == "1080p" else 720, + ) + response = await _submit_and_poll(cls, request) + + render = await download_url_to_video_output(_require_output_url(response, "render")) + alpha = None + if (alpha_url := _alpha_url(response, mode)) is not None: + alpha = await download_url_to_video_output(alpha_url) + return IO.NodeOutput(render, alpha) + + +class BeebleSwitchXImageEdit(IO.ComfyNode): + + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="BeebleSwitchXImageEdit", + 
display_name="Beeble SwitchX Image Edit", + category="image/partner/Beeble", + description=( + "Edit a single image with Beeble SwitchX. Switches anything in the scene " + "(background, lighting, costume) while preserving the original subject's pixels. " + "Provide a reference image and/or text prompt to describe the new look. " + "Max ~2.77MP." + ), + inputs=_common_inputs(source=IO.Image.Input("image"), video=False), + outputs=[ + IO.Image.Output(display_name="image"), + IO.Mask.Output( + display_name="alpha", + tooltip="The alpha matte Beeble used. Empty for 'fill' mode, which has no separate matte.", + ), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends(widgets=["max_resolution"]), + expr=""" + ( + $rate := widgets.max_resolution = "1080p" ? 0.429 : 0.143; + {"type":"usd","usd": $rate} + ) + """, + ), + ) + + @classmethod + async def execute( + cls, + image: Input.Image, + prompt: str, + alpha_mode: dict, + max_resolution: str, + seed: int, + reference_image: Input.Image | None = None, + ) -> IO.NodeOutput: + cleaned_prompt = _validate_inputs(prompt, reference_image) + + image = downscale_image_tensor(image, _MAX_PIXELS) + + mode = alpha_mode["alpha_mode"] + alpha_uri: str | None = None + if mode == "select": + alpha_uri = await _upload_mask_as_image(cls, alpha_mode["alpha_keyframe"], wait_label="Uploading keyframe") + elif mode == "custom": + alpha_uri = await _upload_mask_as_image(cls, alpha_mode["alpha_mask"], wait_label="Uploading alpha") + + source_uri = await upload_image_to_comfyapi( + cls, + image, + mime_type="image/png", + wait_label="Uploading source", + total_pixels=None, + ) + reference_uri: str | None = None + if reference_image is not None: + reference_uri = await upload_image_to_comfyapi( + cls, + reference_image, + mime_type="image/png", + wait_label="Uploading reference", + total_pixels=_MAX_PIXELS, + ) 
+ + request = CreateSwitchXRequest( + generation_type="image", + source_uri=source_uri, + alpha_mode=mode, + prompt=cleaned_prompt, + reference_image_uri=reference_uri, + alpha_uri=alpha_uri, + max_resolution=1080 if max_resolution == "1080p" else 720, + ) + response = await _submit_and_poll(cls, request) + + render = await download_url_to_image_tensor(_require_output_url(response, "render")) + alpha_mask = None + if (alpha_url := _alpha_url(response, mode)) is not None: + alpha_image = bytesio_to_image_tensor(await download_url_as_bytesio(alpha_url), mode="L") + alpha_mask = alpha_image.squeeze(-1) if alpha_image.dim() == 4 else alpha_image + return IO.NodeOutput(render, alpha_mask) + + +class BeebleExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + BeebleSwitchXVideoEdit, + BeebleSwitchXImageEdit, + ] + + +async def comfy_entrypoint() -> BeebleExtension: + return BeebleExtension() diff --git a/comfy_api_nodes/nodes_bfl.py b/comfy_api_nodes/nodes_bfl.py index 3f0ce29d8..f1a5dc5f0 100644 --- a/comfy_api_nodes/nodes_bfl.py +++ b/comfy_api_nodes/nodes_bfl.py @@ -42,7 +42,7 @@ class FluxProUltraImageNode(IO.ComfyNode): return IO.Schema( node_id="FluxProUltraImageNode", display_name="Flux 1.1 [pro] Ultra Image", - category="api node/image/BFL", + category="image/partner/BFL", description="Generates images using Flux Pro 1.1 Ultra via api based on prompt and resolution.", inputs=[ IO.String.Input( @@ -160,7 +160,7 @@ class FluxKontextProImageNode(IO.ComfyNode): return IO.Schema( node_id=cls.NODE_ID, display_name=cls.DISPLAY_NAME, - category="api node/image/BFL", + category="image/partner/BFL", description="Edits images using Flux.1 Kontext [pro] via api based on prompt and aspect ratio.", inputs=[ IO.String.Input( @@ -282,7 +282,7 @@ class FluxProExpandNode(IO.ComfyNode): return IO.Schema( node_id="FluxProExpandNode", display_name="Flux.1 Expand Image", - category="api node/image/BFL", + 
category="image/partner/BFL", description="Outpaints image based on prompt.", inputs=[ IO.Image.Input("image"), @@ -419,7 +419,7 @@ class FluxProFillNode(IO.ComfyNode): return IO.Schema( node_id="FluxProFillNode", display_name="Flux.1 Fill Image", - category="api node/image/BFL", + category="image/partner/BFL", description="Inpaints image based on mask and prompt.", inputs=[ IO.Image.Input("image"), @@ -545,7 +545,7 @@ class Flux2ProImageNode(IO.ComfyNode): return IO.Schema( node_id=cls.NODE_ID, display_name=cls.DISPLAY_NAME, - category="api node/image/BFL", + category="image/partner/BFL", description="Generates images synchronously based on prompt and resolution.", inputs=[ IO.String.Input( @@ -716,7 +716,7 @@ class Flux2ImageNode(IO.ComfyNode): return IO.Schema( node_id="Flux2ImageNode", display_name="Flux.2 Image", - category="api node/image/BFL", + category="image/partner/BFL", description="Generate images via Flux.2 [pro] or Flux.2 [max] from a prompt and optional reference images.", inputs=[ IO.String.Input( diff --git a/comfy_api_nodes/nodes_bria.py b/comfy_api_nodes/nodes_bria.py index 4044ee3ea..53e763210 100644 --- a/comfy_api_nodes/nodes_bria.py +++ b/comfy_api_nodes/nodes_bria.py @@ -31,7 +31,7 @@ class BriaImageEditNode(IO.ComfyNode): return IO.Schema( node_id="BriaImageEditNode", display_name="Bria FIBO Image Edit", - category="api node/image/Bria", + category="image/partner/Bria", description="Edit images using Bria latest model", inputs=[ IO.Combo.Input("model", options=["FIBO"]), @@ -169,7 +169,7 @@ class BriaRemoveImageBackground(IO.ComfyNode): return IO.Schema( node_id="BriaRemoveImageBackground", display_name="Bria Remove Image Background", - category="api node/image/Bria", + category="image/partner/Bria", description="Remove the background from an image using Bria RMBG 2.0.", inputs=[ IO.Image.Input("image"), @@ -245,7 +245,7 @@ class BriaRemoveVideoBackground(IO.ComfyNode): return IO.Schema( node_id="BriaRemoveVideoBackground", 
display_name="Bria Remove Video Background", - category="api node/video/Bria", + category="video/partner/Bria", description="Remove the background from a video using Bria. ", inputs=[ IO.Video.Input("video"), diff --git a/comfy_api_nodes/nodes_bytedance.py b/comfy_api_nodes/nodes_bytedance.py index d6b479336..3711bac1d 100644 --- a/comfy_api_nodes/nodes_bytedance.py +++ b/comfy_api_nodes/nodes_bytedance.py @@ -2,11 +2,12 @@ import hashlib import logging import math import re +from io import BytesIO import torch from typing_extensions import override -from comfy_api.latest import IO, ComfyExtension, Input +from comfy_api.latest import IO, ComfyExtension, Input, Types from comfy_api_nodes.apis.bytedance import ( RECOMMENDED_PRESETS, RECOMMENDED_PRESETS_SEEDREAM_4, @@ -43,15 +44,17 @@ from comfy_api_nodes.util import ( ApiEndpoint, download_url_to_image_tensor, download_url_to_video_output, + downscale_image_tensor_by_max_side, + downscale_video_to_max_pixels, get_number_of_images, image_tensor_pair_to_batch, poll_op, - resize_video_to_pixel_budget, sync_op, upload_audio_to_comfyapi, upload_image_to_comfyapi, upload_images_to_comfyapi, upload_video_to_comfyapi, + upscale_video_to_min_pixels, validate_image_aspect_ratio, validate_image_dimensions, validate_string, @@ -110,15 +113,24 @@ def _validate_ref_video_pixels(video: Input.Video, model_id: str, resolution: st max_px = limits.get("max") if min_px and pixels < min_px: raise ValueError( - f"Reference video {index} is too small: {w}x{h} = {pixels:,}px. " f"Minimum is {min_px:,}px for this model." + f"Reference video {index} is too small: {w}x{h} = {pixels:,} total pixels. " + f"Minimum for this model is {min_px:,} total pixels." ) if max_px and pixels > max_px: raise ValueError( - f"Reference video {index} is too large: {w}x{h} = {pixels:,}px. " - f"Maximum is {max_px:,}px for this model. Try downscaling the video." + f"Reference video {index} is too large: {w}x{h} = {pixels:,} total pixels. 
" + f"Maximum for this model is {max_px:,} total pixels. Try downscaling the video." ) +def _prepare_seedance_image(image: Input.Image) -> Input.Image: + """Auto-downscale a Seedance image input to the per-side limits, then validate it.""" + validate_image_aspect_ratio(image, (2, 5), (5, 2), strict=False) # 0.4 to 2.5 + image = downscale_image_tensor_by_max_side(image, max_side=6000) + validate_image_dimensions(image, min_width=300, min_height=300, max_width=6000, max_height=6000) + return image + + async def _resolve_reference_assets( cls: type[IO.ComfyNode], asset_ids: list[str], @@ -306,6 +318,26 @@ async def _seedance_virtual_library_upload_image_asset( return f"asset://{create_resp.asset_id}" +async def _seedance_virtual_library_upload_video_asset( + cls: type[IO.ComfyNode], + video: Input.Video, + *, + wait_label: str = "Uploading video", +) -> str: + buf = BytesIO() + video.save_to(buf, format=Types.VideoContainer.MP4, codec=Types.VideoCodec.H264) + video_hash = hashlib.sha256(buf.getbuffer()).hexdigest() + public_url = await upload_video_to_comfyapi(cls, video, wait_label=wait_label) + create_resp = await sync_op( + cls, + ApiEndpoint(path="/proxy/seedance/virtual-library/assets", method="POST"), + response_model=SeedanceCreateAssetResponse, + data=SeedanceVirtualLibraryCreateAssetRequest(url=public_url, hash=video_hash, asset_type="Video"), + ) + await _wait_for_asset_active(cls, create_resp.asset_id, group_id="virtual-library") + return f"asset://{create_resp.asset_id}" + + def _seedance2_price_extractor(model_id: str, has_video_input: bool): """Returns a price_extractor closure for Seedance 2.0 poll_op.""" rate = SEEDANCE2_PRICE_PER_1K_TOKENS.get((model_id, has_video_input)) @@ -336,7 +368,7 @@ class ByteDanceImageNode(IO.ComfyNode): return IO.Schema( node_id="ByteDanceImageNode", display_name="ByteDance Image", - category="api node/image/ByteDance", + category="image/partner/ByteDance", description="Generate images using ByteDance models via api based 
on prompt", inputs=[ IO.Combo.Input("model", options=["seedream-3-0-t2i-250415"]), @@ -460,7 +492,7 @@ class ByteDanceSeedreamNode(IO.ComfyNode): return IO.Schema( node_id="ByteDanceSeedreamNode", display_name="ByteDance Seedream 4.5 & 5.0", - category="api node/image/ByteDance", + category="image/partner/ByteDance", description="Unified text-to-image generation and precise single-sentence editing at up to 4K resolution.", inputs=[ IO.Combo.Input( @@ -722,7 +754,7 @@ class ByteDanceSeedreamNodeV2(IO.ComfyNode): return IO.Schema( node_id="ByteDanceSeedreamNodeV2", display_name="ByteDance Seedream 4.5 & 5.0", - category="api node/image/ByteDance", + category="image/partner/ByteDance", description="Unified text-to-image generation and precise single-sentence editing at up to 4K resolution.", inputs=[ IO.String.Input( @@ -888,7 +920,7 @@ class ByteDanceTextToVideoNode(IO.ComfyNode): return IO.Schema( node_id="ByteDanceTextToVideoNode", display_name="ByteDance Text to Video", - category="api node/video/ByteDance", + category="video/partner/ByteDance", description="Generate video using ByteDance models via api based on prompt", inputs=[ IO.Combo.Input( @@ -1016,7 +1048,7 @@ class ByteDanceImageToVideoNode(IO.ComfyNode): return IO.Schema( node_id="ByteDanceImageToVideoNode", display_name="ByteDance Image to Video", - category="api node/video/ByteDance", + category="video/partner/ByteDance", description="Generate video using ByteDance models via api based on image and prompt", inputs=[ IO.Combo.Input( @@ -1153,7 +1185,7 @@ class ByteDanceFirstLastFrameNode(IO.ComfyNode): return IO.Schema( node_id="ByteDanceFirstLastFrameNode", display_name="ByteDance First-Last-Frame to Video", - category="api node/video/ByteDance", + category="video/partner/ByteDance", description="Generate video using prompt and first and last frames.", inputs=[ IO.Combo.Input( @@ -1301,7 +1333,7 @@ class ByteDanceImageReferenceNode(IO.ComfyNode): return IO.Schema( node_id="ByteDanceImageReferenceNode", 
display_name="ByteDance Reference Images to Video", - category="api node/video/ByteDance", + category="video/partner/ByteDance", description="Generate video using prompt and reference images.", inputs=[ IO.Combo.Input( @@ -1544,7 +1576,7 @@ class ByteDance2TextToVideoNode(IO.ComfyNode): return IO.Schema( node_id="ByteDance2TextToVideoNode", display_name="ByteDance Seedance 2.0 Text to Video", - category="api node/video/ByteDance", + category="video/partner/ByteDance", description="Generate video using Seedance 2.0 models based on a text prompt.", inputs=[ IO.DynamicCombo.Input( @@ -1645,7 +1677,7 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode): return IO.Schema( node_id="ByteDance2FirstLastFrameNode", display_name="ByteDance Seedance 2.0 First-Last-Frame to Video", - category="api node/video/ByteDance", + category="video/partner/ByteDance", description="Generate video using Seedance 2.0 from a first frame image and optional last frame image.", inputs=[ IO.DynamicCombo.Input( @@ -1676,14 +1708,14 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode): "first_frame_asset_id", default="", tooltip="Seedance asset_id to use as the first frame. " - "Mutually exclusive with the first_frame image input.", + "Mutually exclusive with the first_frame image input.", optional=True, ), IO.String.Input( "last_frame_asset_id", default="", tooltip="Seedance asset_id to use as the last frame. 
" - "Mutually exclusive with the last_frame image input.", + "Mutually exclusive with the last_frame image input.", optional=True, ), IO.Int.Input( @@ -1758,6 +1790,11 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode): if last_frame is not None and last_frame_asset_id: raise ValueError("Provide only one of last_frame or last_frame_asset_id, not both.") + if first_frame is not None: + first_frame = _prepare_seedance_image(first_frame) + if last_frame is not None: + last_frame = _prepare_seedance_image(last_frame) + asset_ids_to_resolve = [a for a in (first_frame_asset_id, last_frame_asset_id) if a] image_assets: dict[str, str] = {} if asset_ids_to_resolve: @@ -1864,12 +1901,21 @@ def _seedance2_reference_inputs(resolutions: list[str], default_ratio: str = "16 ), IO.Boolean.Input( "auto_downscale", - default=False, - advanced=True, + default=True, optional=True, tooltip="Automatically downscale reference videos that exceed the model's pixel budget " "for the selected resolution. Aspect ratio is preserved; videos already within limits are untouched.", ), + IO.Boolean.Input( + "auto_upscale", + default=False, + advanced=True, + optional=True, + tooltip="Automatically upscale reference videos that are below the model's minimum pixel count " + "for the selected resolution. Aspect ratio is preserved; videos already meeting the minimum are " + "untouched. Note: upscaling a low-resolution source does not add real detail and may produce " + "lower-quality generations.", + ), IO.Autogrow.Input( "reference_assets", template=IO.Autogrow.TemplateNames( @@ -1898,7 +1944,7 @@ class ByteDance2ReferenceNode(IO.ComfyNode): return IO.Schema( node_id="ByteDance2ReferenceNode", display_name="ByteDance Seedance 2.0 Reference to Video", - category="api node/video/ByteDance", + category="video/partner/ByteDance", description="Generate, edit, or extend video using Seedance 2.0 with reference images, " "videos, and audio. 
Supports multimodal reference, video editing, and video extension.", inputs=[ @@ -2023,6 +2069,9 @@ class ByteDance2ReferenceNode(IO.ComfyNode): f"(audios={len(reference_audios)}, audio assets={len(reference_audio_assets)}). Maximum is 3." ) + for key in reference_images: + reference_images[key] = _prepare_seedance_image(reference_images[key]) + model_id = SEEDANCE_MODELS[model["model"]] has_video_input = total_videos > 0 @@ -2030,7 +2079,13 @@ class ByteDance2ReferenceNode(IO.ComfyNode): max_px = SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id, {}).get(model["resolution"], {}).get("max") if max_px: for key in reference_videos: - reference_videos[key] = resize_video_to_pixel_budget(reference_videos[key], max_px) + reference_videos[key] = downscale_video_to_max_pixels(reference_videos[key], max_px) + + if model.get("auto_upscale") and reference_videos: + min_px = SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id, {}).get(model["resolution"], {}).get("min") + if min_px: + for key in reference_videos: + reference_videos[key] = upscale_video_to_min_pixels(reference_videos[key], min_px) total_video_duration = 0.0 for i, key in enumerate(reference_videos, 1): @@ -2089,7 +2144,7 @@ class ByteDance2ReferenceNode(IO.ComfyNode): content.append( TaskVideoContent( video_url=TaskVideoContentUrl( - url=await upload_video_to_comfyapi( + url=await _seedance_virtual_library_upload_video_asset( cls, reference_videos[key], wait_label=f"Uploading video {i}", @@ -2186,7 +2241,7 @@ class ByteDanceCreateImageAsset(IO.ComfyNode): return IO.Schema( node_id="ByteDanceCreateImageAsset", display_name="ByteDance Create Image Asset", - category="api node/image/ByteDance", + category="image/partner/ByteDance", description=( "Create a Seedance 2.0 personal image asset. Uploads the input image and " "registers it in the given asset group. 
If group_id is empty, runs a real-person " @@ -2253,7 +2308,7 @@ class ByteDanceCreateVideoAsset(IO.ComfyNode): return IO.Schema( node_id="ByteDanceCreateVideoAsset", display_name="ByteDance Create Video Asset", - category="api node/video/ByteDance", + category="video/partner/ByteDance", description=( "Create a Seedance 2.0 personal video asset. Uploads the input video and " "registers it in the given asset group. If group_id is empty, runs a real-person " diff --git a/comfy_api_nodes/nodes_bytedance_llm.py b/comfy_api_nodes/nodes_bytedance_llm.py index fa7fe370a..007cac45f 100644 --- a/comfy_api_nodes/nodes_bytedance_llm.py +++ b/comfy_api_nodes/nodes_bytedance_llm.py @@ -144,7 +144,7 @@ class ByteDanceSeedNode(IO.ComfyNode): return IO.Schema( node_id="ByteDanceSeedNode", display_name="ByteDance Seed", - category="api node/text/ByteDance", + category="text/partner/ByteDance", essentials_category="Text Generation", description="Generate text responses with ByteDance's Seed 2.0 models. " "Provide a text prompt and optionally one or more images or videos for multimodal context.", diff --git a/comfy_api_nodes/nodes_elevenlabs.py b/comfy_api_nodes/nodes_elevenlabs.py index e452daf77..37eeb2601 100644 --- a/comfy_api_nodes/nodes_elevenlabs.py +++ b/comfy_api_nodes/nodes_elevenlabs.py @@ -69,7 +69,7 @@ class ElevenLabsSpeechToText(IO.ComfyNode): return IO.Schema( node_id="ElevenLabsSpeechToText", display_name="ElevenLabs Speech to Text", - category="api node/audio/ElevenLabs", + category="audio/partner/ElevenLabs", description="Transcribe audio to text. 
" "Supports automatic language detection, speaker diarization, and audio event tagging.", inputs=[ @@ -210,7 +210,7 @@ class ElevenLabsVoiceSelector(IO.ComfyNode): return IO.Schema( node_id="ElevenLabsVoiceSelector", display_name="ElevenLabs Voice Selector", - category="api node/audio/ElevenLabs", + category="audio/partner/ElevenLabs", description="Select a predefined ElevenLabs voice for text-to-speech generation.", inputs=[ IO.Combo.Input( @@ -239,7 +239,7 @@ class ElevenLabsTextToSpeech(IO.ComfyNode): return IO.Schema( node_id="ElevenLabsTextToSpeech", display_name="ElevenLabs Text to Speech", - category="api node/audio/ElevenLabs", + category="audio/partner/ElevenLabs", description="Convert text to speech.", inputs=[ IO.Custom(ELEVENLABS_VOICE).Input( @@ -414,7 +414,7 @@ class ElevenLabsAudioIsolation(IO.ComfyNode): return IO.Schema( node_id="ElevenLabsAudioIsolation", display_name="ElevenLabs Voice Isolation", - category="api node/audio/ElevenLabs", + category="audio/partner/ElevenLabs", description="Remove background noise from audio, isolating vocals or speech.", inputs=[ IO.Audio.Input( @@ -459,7 +459,7 @@ class ElevenLabsTextToSoundEffects(IO.ComfyNode): return IO.Schema( node_id="ElevenLabsTextToSoundEffects", display_name="ElevenLabs Text to Sound Effects", - category="api node/audio/ElevenLabs", + category="audio/partner/ElevenLabs", description="Generate sound effects from text descriptions.", inputs=[ IO.String.Input( @@ -555,7 +555,7 @@ class ElevenLabsInstantVoiceClone(IO.ComfyNode): return IO.Schema( node_id="ElevenLabsInstantVoiceClone", display_name="ElevenLabs Instant Voice Clone", - category="api node/audio/ElevenLabs", + category="audio/partner/ElevenLabs", description="Create a cloned voice from audio samples. 
" "Provide 1-8 audio recordings of the voice to clone.", inputs=[ @@ -658,7 +658,7 @@ class ElevenLabsSpeechToSpeech(IO.ComfyNode): return IO.Schema( node_id="ElevenLabsSpeechToSpeech", display_name="ElevenLabs Speech to Speech", - category="api node/audio/ElevenLabs", + category="audio/partner/ElevenLabs", description="Transform speech from one voice to another while preserving the original content and emotion.", inputs=[ IO.Custom(ELEVENLABS_VOICE).Input( @@ -793,7 +793,7 @@ class ElevenLabsTextToDialogue(IO.ComfyNode): return IO.Schema( node_id="ElevenLabsTextToDialogue", display_name="ElevenLabs Text to Dialogue", - category="api node/audio/ElevenLabs", + category="audio/partner/ElevenLabs", description="Generate multi-speaker dialogue from text. Each dialogue entry has its own text and voice.", inputs=[ IO.Float.Input( diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index d18c958a8..3cfd541b2 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -300,7 +300,7 @@ class GeminiNode(IO.ComfyNode): return IO.Schema( node_id="GeminiNode", display_name="Google Gemini", - category="api node/text/Gemini", + category="text/partner/Gemini", description="Generate text responses with Google's Gemini AI model. " "You can provide multiple types of inputs (text, images, audio, video) " "as context for generating more relevant and meaningful responses.", @@ -541,7 +541,7 @@ class GeminiInputFiles(IO.ComfyNode): return IO.Schema( node_id="GeminiInputFiles", display_name="Gemini Input Files", - category="api node/text/Gemini", + category="text/partner/Gemini", description="Loads and prepares input files to include as inputs for Gemini LLM nodes. " "The files will be read by the Gemini model when generating a response. " "The contents of the text file count toward the token limit. 
" @@ -598,7 +598,7 @@ class GeminiImage(IO.ComfyNode): return IO.Schema( node_id="GeminiImageNode", display_name="Nano Banana (Google Gemini Image)", - category="api node/image/Gemini", + category="image/partner/Gemini", description="Edit images synchronously via Google API.", inputs=[ IO.String.Input( @@ -731,7 +731,7 @@ class GeminiImage2(IO.ComfyNode): return IO.Schema( node_id="GeminiImage2Node", display_name="Nano Banana Pro (Google Gemini Image)", - category="api node/image/Gemini", + category="image/partner/Gemini", description="Generate or edit images synchronously via Google Vertex API.", inputs=[ IO.String.Input( @@ -869,7 +869,7 @@ class GeminiNanoBanana2(IO.ComfyNode): return IO.Schema( node_id="GeminiNanoBanana2", display_name="Nano Banana 2", - category="api node/image/Gemini", + category="image/partner/Gemini", description="Generate or edit images synchronously via Google Vertex API.", inputs=[ IO.String.Input( @@ -1085,7 +1085,7 @@ class GeminiNanoBanana2V2(IO.ComfyNode): return IO.Schema( node_id="GeminiNanoBanana2V2", display_name="Nano Banana 2", - category="api node/image/Gemini", + category="image/partner/Gemini", description="Generate or edit images synchronously via Google Vertex API.", inputs=[ IO.String.Input( diff --git a/comfy_api_nodes/nodes_grok.py b/comfy_api_nodes/nodes_grok.py index a103f24ee..43e3cdc26 100644 --- a/comfy_api_nodes/nodes_grok.py +++ b/comfy_api_nodes/nodes_grok.py @@ -49,7 +49,7 @@ class GrokImageNode(IO.ComfyNode): return IO.Schema( node_id="GrokImageNode", display_name="Grok Image", - category="api node/image/Grok", + category="image/partner/Grok", description="Generate images using Grok based on a text prompt", inputs=[ IO.Combo.Input( @@ -224,7 +224,7 @@ class GrokImageEditNode(IO.ComfyNode): return IO.Schema( node_id="GrokImageEditNode", display_name="Grok Image Edit", - category="api node/image/Grok", + category="image/partner/Grok", description="Modify an existing image based on a text prompt", inputs=[ 
IO.Combo.Input( @@ -366,7 +366,7 @@ class GrokImageEditNodeV2(IO.ComfyNode): return IO.Schema( node_id="GrokImageEditNodeV2", display_name="Grok Image Edit", - category="api node/image/Grok", + category="image/partner/Grok", description="Modify an existing image based on a text prompt", inputs=[ IO.String.Input( @@ -503,7 +503,7 @@ class GrokVideoNode(IO.ComfyNode): return IO.Schema( node_id="GrokVideoNode", display_name="Grok Video", - category="api node/video/Grok", + category="video/partner/Grok", description="Generate video from a prompt or an image", inputs=[ IO.Combo.Input("model", options=["grok-imagine-video", "grok-imagine-video-beta"]), @@ -615,7 +615,7 @@ class GrokVideoEditNode(IO.ComfyNode): return IO.Schema( node_id="GrokVideoEditNode", display_name="Grok Video Edit", - category="api node/video/Grok", + category="video/partner/Grok", description="Edit an existing video based on a text prompt.", inputs=[ IO.Combo.Input("model", options=["grok-imagine-video", "grok-imagine-video-beta"]), @@ -693,7 +693,7 @@ class GrokVideoReferenceNode(IO.ComfyNode): return IO.Schema( node_id="GrokVideoReferenceNode", display_name="Grok Reference-to-Video", - category="api node/video/Grok", + category="video/partner/Grok", description="Generate video guided by reference images as style and content references.", inputs=[ IO.String.Input( @@ -826,7 +826,7 @@ class GrokVideoExtendNode(IO.ComfyNode): return IO.Schema( node_id="GrokVideoExtendNode", display_name="Grok Video Extend", - category="api node/video/Grok", + category="video/partner/Grok", description="Extend an existing video with a seamless continuation based on a text prompt.", inputs=[ IO.String.Input( diff --git a/comfy_api_nodes/nodes_hitpaw.py b/comfy_api_nodes/nodes_hitpaw.py index bca5170e4..22e679c29 100644 --- a/comfy_api_nodes/nodes_hitpaw.py +++ b/comfy_api_nodes/nodes_hitpaw.py @@ -71,7 +71,7 @@ class HitPawGeneralImageEnhance(IO.ComfyNode): return IO.Schema( node_id="HitPawGeneralImageEnhance", 
display_name="HitPaw General Image Enhance", - category="api node/image/HitPaw", + category="image/partner/HitPaw", description="Upscale low-resolution images to super-resolution, eliminate artifacts and noise. " f"Maximum output: {MAX_MP_GENERATIVE} megapixels.", inputs=[ @@ -201,7 +201,7 @@ class HitPawVideoEnhance(IO.ComfyNode): return IO.Schema( node_id="HitPawVideoEnhance", display_name="HitPaw Video Enhance", - category="api node/video/HitPaw", + category="video/partner/HitPaw", description="Upscale low-resolution videos to high resolution, eliminate artifacts and noise. " "Prices shown are per second of video.", inputs=[ diff --git a/comfy_api_nodes/nodes_hunyuan3d.py b/comfy_api_nodes/nodes_hunyuan3d.py index 5fc31bccd..826a3bd2d 100644 --- a/comfy_api_nodes/nodes_hunyuan3d.py +++ b/comfy_api_nodes/nodes_hunyuan3d.py @@ -123,7 +123,7 @@ class TencentTextToModelNode(IO.ComfyNode): return IO.Schema( node_id="TencentTextToModelNode", display_name="Hunyuan3D: Text to Model", - category="api node/3d/Tencent", + category="3d/partner/Tencent", essentials_category="3D", inputs=[ IO.Combo.Input( @@ -242,7 +242,7 @@ class TencentImageToModelNode(IO.ComfyNode): return IO.Schema( node_id="TencentImageToModelNode", display_name="Hunyuan3D: Image(s) to Model", - category="api node/3d/Tencent", + category="3d/partner/Tencent", essentials_category="3D", inputs=[ IO.Combo.Input( @@ -415,7 +415,7 @@ class TencentModelTo3DUVNode(IO.ComfyNode): return IO.Schema( node_id="TencentModelTo3DUVNode", display_name="Hunyuan3D: Model to UV", - category="api node/3d/Tencent", + category="3d/partner/Tencent", description="Perform UV unfolding on a 3D model to generate UV texture. 
" "Input model must have less than 30000 faces.", inputs=[ @@ -505,7 +505,7 @@ class Tencent3DTextureEditNode(IO.ComfyNode): return IO.Schema( node_id="Tencent3DTextureEditNode", display_name="Hunyuan3D: 3D Texture Edit", - category="api node/3d/Tencent", + category="3d/partner/Tencent", description="After inputting the 3D model, perform 3D model texture redrawing.", inputs=[ IO.MultiType.Input( @@ -594,7 +594,7 @@ class Tencent3DPartNode(IO.ComfyNode): return IO.Schema( node_id="Tencent3DPartNode", display_name="Hunyuan3D: 3D Part", - category="api node/3d/Tencent", + category="3d/partner/Tencent", description="Automatically perform component identification and generation based on the model structure.", inputs=[ IO.MultiType.Input( @@ -666,7 +666,7 @@ class TencentSmartTopologyNode(IO.ComfyNode): return IO.Schema( node_id="TencentSmartTopologyNode", display_name="Hunyuan3D: Smart Topology", - category="api node/3d/Tencent", + category="3d/partner/Tencent", description="Perform smart retopology on a 3D model. 
" "Supports GLB/OBJ formats; max 200MB; recommended for high-poly models.", inputs=[ diff --git a/comfy_api_nodes/nodes_ideogram.py b/comfy_api_nodes/nodes_ideogram.py index 97c3609bd..edd9b9435 100644 --- a/comfy_api_nodes/nodes_ideogram.py +++ b/comfy_api_nodes/nodes_ideogram.py @@ -234,7 +234,7 @@ class IdeogramV1(IO.ComfyNode): return IO.Schema( node_id="IdeogramV1", display_name="Ideogram V1", - category="api node/image/Ideogram", + category="image/partner/Ideogram", description="Generates images using the Ideogram V1 model.", inputs=[ IO.String.Input( @@ -360,7 +360,7 @@ class IdeogramV2(IO.ComfyNode): return IO.Schema( node_id="IdeogramV2", display_name="Ideogram V2", - category="api node/image/Ideogram", + category="image/partner/Ideogram", description="Generates images using the Ideogram V2 model.", inputs=[ IO.String.Input( @@ -526,7 +526,7 @@ class IdeogramV3(IO.ComfyNode): return IO.Schema( node_id="IdeogramV3", display_name="Ideogram V3", - category="api node/image/Ideogram", + category="image/partner/Ideogram", description="Generates images using the Ideogram V3 model. 
" "Supports both regular image generation from text prompts and image editing with mask.", inputs=[ diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index 7586f1816..9925ec548 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -642,7 +642,7 @@ class KlingCameraControls(IO.ComfyNode): return IO.Schema( node_id="KlingCameraControls", display_name="Kling Camera Controls", - category="api node/video/Kling", + category="video/partner/Kling", description="Allows specifying configuration options for Kling Camera Controls and motion control effects.", inputs=[ IO.Combo.Input("camera_control_type", options=KlingCameraControlType), @@ -762,7 +762,7 @@ class KlingTextToVideoNode(IO.ComfyNode): return IO.Schema( node_id="KlingTextToVideoNode", display_name="Kling Text to Video", - category="api node/video/Kling", + category="video/partner/Kling", description="Kling Text to Video Node", inputs=[ IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), @@ -849,7 +849,7 @@ class OmniProTextToVideoNode(IO.ComfyNode): return IO.Schema( node_id="KlingOmniProTextToVideoNode", display_name="Kling 3.0 Omni Text to Video", - category="api node/video/Kling", + category="video/partner/Kling", description="Use text prompts to generate videos with the latest Kling model.", inputs=[ IO.Combo.Input("model_name", options=["kling-v3-omni", "kling-video-o1"]), @@ -998,7 +998,7 @@ class OmniProFirstLastFrameNode(IO.ComfyNode): return IO.Schema( node_id="KlingOmniProFirstLastFrameNode", display_name="Kling 3.0 Omni First-Last-Frame to Video", - category="api node/video/Kling", + category="video/partner/Kling", description="Use a start frame, an optional end frame, or reference images with the latest Kling model.", inputs=[ IO.Combo.Input("model_name", options=["kling-v3-omni", "kling-video-o1"]), @@ -1205,7 +1205,7 @@ class OmniProImageToVideoNode(IO.ComfyNode): return IO.Schema( node_id="KlingOmniProImageToVideoNode", 
display_name="Kling 3.0 Omni Image to Video", - category="api node/video/Kling", + category="video/partner/Kling", description="Use up to 7 reference images to generate a video with the latest Kling model.", inputs=[ IO.Combo.Input("model_name", options=["kling-v3-omni", "kling-video-o1"]), @@ -1374,7 +1374,7 @@ class OmniProVideoToVideoNode(IO.ComfyNode): return IO.Schema( node_id="KlingOmniProVideoToVideoNode", display_name="Kling 3.0 Omni Video to Video", - category="api node/video/Kling", + category="video/partner/Kling", description="Use a video and up to 4 reference images to generate a video with the latest Kling model.", inputs=[ IO.Combo.Input("model_name", options=["kling-v3-omni", "kling-video-o1"]), @@ -1485,7 +1485,7 @@ class OmniProEditVideoNode(IO.ComfyNode): return IO.Schema( node_id="KlingOmniProEditVideoNode", display_name="Kling 3.0 Omni Edit Video", - category="api node/video/Kling", + category="video/partner/Kling", essentials_category="Video Generation", description="Edit an existing video with the latest model from Kling.", inputs=[ @@ -1593,7 +1593,7 @@ class OmniProImageNode(IO.ComfyNode): return IO.Schema( node_id="KlingOmniProImageNode", display_name="Kling 3.0 Omni Image", - category="api node/image/Kling", + category="image/partner/Kling", description="Create or edit images with the latest model from Kling.", inputs=[ IO.Combo.Input("model_name", options=["kling-v3-omni", "kling-image-o1"]), @@ -1721,7 +1721,7 @@ class KlingCameraControlT2VNode(IO.ComfyNode): return IO.Schema( node_id="KlingCameraControlT2VNode", display_name="Kling Text to Video (Camera Control)", - category="api node/video/Kling", + category="video/partner/Kling", description="Transform text into cinematic videos with professional camera movements that simulate real-world cinematography. 
Control virtual camera actions including zoom, rotation, pan, tilt, and first-person view, while maintaining focus on your original text.", inputs=[ IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), @@ -1783,7 +1783,7 @@ class KlingImage2VideoNode(IO.ComfyNode): return IO.Schema( node_id="KlingImage2VideoNode", display_name="Kling Image(First Frame) to Video", - category="api node/video/Kling", + category="video/partner/Kling", inputs=[ IO.Image.Input("start_frame", tooltip="The reference image used to generate the video."), IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), @@ -1882,7 +1882,7 @@ class KlingCameraControlI2VNode(IO.ComfyNode): return IO.Schema( node_id="KlingCameraControlI2VNode", display_name="Kling Image to Video (Camera Control)", - category="api node/video/Kling", + category="video/partner/Kling", description="Transform still images into cinematic videos with professional camera movements that simulate real-world cinematography. Control virtual camera actions including zoom, rotation, pan, tilt, and first-person view, while maintaining focus on your original image.", inputs=[ IO.Image.Input( @@ -1953,7 +1953,7 @@ class KlingStartEndFrameNode(IO.ComfyNode): return IO.Schema( node_id="KlingStartEndFrameNode", display_name="Kling Start-End Frame to Video", - category="api node/video/Kling", + category="video/partner/Kling", description="Generate a video sequence that transitions between your provided start and end images. The node creates all frames in between, producing a smooth transformation from the first frame to the last.", inputs=[ IO.Image.Input( @@ -2047,7 +2047,7 @@ class KlingVideoExtendNode(IO.ComfyNode): return IO.Schema( node_id="KlingVideoExtendNode", display_name="Kling Video Extend", - category="api node/video/Kling", + category="video/partner/Kling", description="Kling Video Extend Node. Extend videos made by other Kling nodes. 
The video_id is created by using other Kling Nodes.", inputs=[ IO.String.Input( @@ -2128,7 +2128,7 @@ class KlingDualCharacterVideoEffectNode(IO.ComfyNode): return IO.Schema( node_id="KlingDualCharacterVideoEffectNode", display_name="Kling Dual Character Video Effects", - category="api node/video/Kling", + category="video/partner/Kling", description="Achieve different special effects when generating a video based on the effect_scene. First image will be positioned on left side, second on right side of the composite.", inputs=[ IO.Image.Input("image_left", tooltip="Left side image"), @@ -2218,7 +2218,7 @@ class KlingSingleImageVideoEffectNode(IO.ComfyNode): return IO.Schema( node_id="KlingSingleImageVideoEffectNode", display_name="Kling Video Effects", - category="api node/video/Kling", + category="video/partner/Kling", description="Achieve different special effects when generating a video based on the effect_scene.", inputs=[ IO.Image.Input( @@ -2291,7 +2291,7 @@ class KlingLipSyncAudioToVideoNode(IO.ComfyNode): return IO.Schema( node_id="KlingLipSyncAudioToVideoNode", display_name="Kling Lip Sync Video with Audio", - category="api node/video/Kling", + category="video/partner/Kling", essentials_category="Video Generation", description="Kling Lip Sync Audio to Video Node. Syncs mouth movements in a video file to the audio content of an audio file. When using, ensure that the audio contains clearly distinguishable vocals and that the video contains a distinct face. The audio file should not be larger than 5MB. The video file should not be larger than 100MB, should have height/width between 720px and 1920px, and should be between 2s and 10s in length.", inputs=[ @@ -2343,7 +2343,7 @@ class KlingLipSyncTextToVideoNode(IO.ComfyNode): return IO.Schema( node_id="KlingLipSyncTextToVideoNode", display_name="Kling Lip Sync Video with Text", - category="api node/video/Kling", + category="video/partner/Kling", description="Kling Lip Sync Text to Video Node. 
Syncs mouth movements in a video file to a text prompt. The video file should not be larger than 100MB, should have height/width between 720px and 1920px, and should be between 2s and 10s in length.", inputs=[ IO.Video.Input("video"), @@ -2411,7 +2411,7 @@ class KlingVirtualTryOnNode(IO.ComfyNode): return IO.Schema( node_id="KlingVirtualTryOnNode", display_name="Kling Virtual Try On", - category="api node/image/Kling", + category="image/partner/Kling", description="Kling Virtual Try On Node. Input a human image and a cloth image to try on the cloth on the human. You can merge multiple clothing item pictures into one image with a white background.", inputs=[ IO.Image.Input("human_image"), @@ -2478,7 +2478,7 @@ class KlingImageGenerationNode(IO.ComfyNode): return IO.Schema( node_id="KlingImageGenerationNode", display_name="Kling 3.0 Image", - category="api node/image/Kling", + category="image/partner/Kling", description="Kling Image Generation Node. Generate an image from a text prompt with an optional reference image.", inputs=[ IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), @@ -2615,7 +2615,7 @@ class TextToVideoWithAudio(IO.ComfyNode): return IO.Schema( node_id="KlingTextToVideoWithAudio", display_name="Kling 2.6 Text to Video with Audio", - category="api node/video/Kling", + category="video/partner/Kling", inputs=[ IO.Combo.Input("model_name", options=["kling-v2-6"]), IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt."), @@ -2683,7 +2683,7 @@ class ImageToVideoWithAudio(IO.ComfyNode): return IO.Schema( node_id="KlingImageToVideoWithAudio", display_name="Kling 2.6 Image(First Frame) to Video with Audio", - category="api node/video/Kling", + category="video/partner/Kling", inputs=[ IO.Combo.Input("model_name", options=["kling-v2-6"]), IO.Image.Input("start_frame"), @@ -2753,7 +2753,7 @@ class MotionControl(IO.ComfyNode): return IO.Schema( node_id="KlingMotionControl", display_name="Kling Motion Control", - 
category="api node/video/Kling", + category="video/partner/Kling", inputs=[ IO.String.Input("prompt", multiline=True), IO.Image.Input("reference_image"), @@ -2854,7 +2854,7 @@ class KlingVideoNode(IO.ComfyNode): return IO.Schema( node_id="KlingVideoNode", display_name="Kling 3.0 Video", - category="api node/video/Kling", + category="video/partner/Kling", description="Generate videos with Kling V3. " "Supports text-to-video and image-to-video with optional storyboard multi-prompt and audio generation.", inputs=[ @@ -3077,7 +3077,7 @@ class KlingFirstLastFrameNode(IO.ComfyNode): return IO.Schema( node_id="KlingFirstLastFrameNode", display_name="Kling 3.0 First-Last-Frame to Video", - category="api node/video/Kling", + category="video/partner/Kling", description="Generate videos with Kling V3 using first and last frames.", inputs=[ IO.String.Input("prompt", multiline=True, default=""), @@ -3202,7 +3202,7 @@ class KlingAvatarNode(IO.ComfyNode): return IO.Schema( node_id="KlingAvatarNode", display_name="Kling Avatar 2.0", - category="api node/video/Kling", + category="video/partner/Kling", description="Generate broadcast-style digital human videos from a single photo and an audio file.", inputs=[ IO.Image.Input( diff --git a/comfy_api_nodes/nodes_krea.py b/comfy_api_nodes/nodes_krea.py new file mode 100644 index 000000000..be04a272b --- /dev/null +++ b/comfy_api_nodes/nodes_krea.py @@ -0,0 +1,290 @@ +"""Krea image-generation nodes.""" + +import re + +from typing_extensions import override + +from comfy_api.latest import IO, ComfyExtension, Input +from comfy_api_nodes.apis.krea import ( + KreaAssetResponse, + KreaGenerateImageRequest, + KreaImageStyleReference, + KreaJob, + KreaMoodboard, +) +from comfy_api_nodes.util import ( + ApiEndpoint, + download_url_to_image_tensor, + poll_op, + sync_op, + tensor_to_bytesio, + validate_string, +) + + +class KreaIO: + STYLE_REF = "KREA_STYLE_REF" + + +async def _upload_image_to_krea_assets(cls: type[IO.ComfyNode], image: 
Input.Image) -> str: + """Upload an image to Krea's /assets endpoint and return the Krea-hosted image URL.""" + img_io = tensor_to_bytesio(image, total_pixels=2048 * 2048, mime_type="image/png") + response = await sync_op( + cls, + endpoint=ApiEndpoint(path="/proxy/krea/assets", method="POST"), + response_model=KreaAssetResponse, + files=[("file", (img_io.name, img_io, "image/png"))], + content_type="multipart/form-data", + max_retries=1, + wait_label="Uploading reference", + ) + return response.image_url + + +_MODEL_MEDIUM = "Krea 2 Medium" +_MODEL_LARGE = "Krea 2 Large" +_MODEL_ENDPOINTS: dict[str, str] = { + _MODEL_MEDIUM: "/proxy/krea/generate/image/krea/krea-2/medium", + _MODEL_LARGE: "/proxy/krea/generate/image/krea/krea-2/large", +} + +_ASPECT_RATIOS = ["1:1", "4:3", "3:2", "16:9", "2.35:1", "4:5", "2:3", "9:16"] +_RESOLUTIONS = ["1K"] +_CREATIVITY_LEVELS = ["raw", "low", "medium", "high"] +_KREA_QUEUED_STATUSES = ["backlogged", "queued", "scheduled"] + +_UUID_RE = re.compile(r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$") + + +def _krea_model_inputs() -> list: + """Nested inputs shared by both Krea 2 Medium and Large under the DynamicCombo.""" + return [ + IO.Combo.Input( + "aspect_ratio", + options=_ASPECT_RATIOS, + tooltip="Output aspect ratio.", + ), + IO.Combo.Input( + "resolution", + options=_RESOLUTIONS, + tooltip="Resolution scale.", + ), + IO.Combo.Input( + "creativity", + options=_CREATIVITY_LEVELS, + default="medium", + tooltip="Prompt interpretation strength: raw stays closest to the prompt; high is most creative.", + ), + IO.String.Input( + "moodboard_id", + default="", + tooltip="Optional Krea moodboard UUID (e.g. from the Krea website). " + "Leave empty to disable. 
Only one moodboard is supported per request.", + optional=True, + ), + IO.Float.Input( + "moodboard_strength", + default=0.35, + min=-0.5, + max=1.5, + step=0.05, + tooltip="Moodboard influence; ignored when moodboard_id is empty.", + optional=True, + ), + IO.Custom(KreaIO.STYLE_REF).Input( + "style_reference", + optional=True, + tooltip="Optional chain of style references (max 10) from Krea 2 Style Reference nodes.", + ), + ] + + +class Krea2ImageNode(IO.ComfyNode): + + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="Krea2ImageNode", + display_name="Krea 2 Image", + category="image/partner/Krea", + description=( + "Generate images via Krea 2 — pick Medium (expressive illustrations) or " + "Large (expressive photorealism). Supports an optional moodboard and up " + "to 10 chained image style references." + ), + inputs=[ + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Text prompt for the image.", + ), + IO.DynamicCombo.Input( + "model", + options=[ + IO.DynamicCombo.Option(_MODEL_MEDIUM, _krea_model_inputs()), + IO.DynamicCombo.Option(_MODEL_LARGE, _krea_model_inputs()), + ], + tooltip="Krea 2 Medium is best for expressive illustrations; " + "Krea 2 Large is best for expressive photorealism.", + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + control_after_generate=True, + tooltip="Random seed for reproducibility.", + ), + ], + outputs=[IO.Image.Output()], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends( + widgets=["model", "model.moodboard_id"], + inputs=["model.style_reference"], + ), + expr=""" + ( + $isLarge := widgets.model = "krea 2 large"; + $hasMoodboard := $length($lookup(widgets, "model.moodboard_id")) > 0; + $hasStyle := $lookup(inputs, "model.style_reference").connected; + $usd := $hasMoodboard + ? ($isLarge ? 
0.07 : 0.04) + : ($hasStyle + ? ($isLarge ? 0.065 : 0.035) + : ($isLarge ? 0.06 : 0.03)); + {"type":"usd","usd": $usd} + ) + """, + ), + ) + + @classmethod + async def execute( + cls, + prompt: str, + model: dict, + seed: int, + ) -> IO.NodeOutput: + validate_string(prompt, strip_whitespace=False, min_length=1) + + model_choice = model["model"] + endpoint_path = _MODEL_ENDPOINTS.get(model_choice) + if endpoint_path is None: + raise ValueError(f"Unknown Krea 2 model: {model_choice!r}") + + moodboards: list[KreaMoodboard] | None = None + mb_id = (model.get("moodboard_id") or "").strip() + if mb_id: + if not _UUID_RE.match(mb_id): + raise ValueError(f"moodboard_id must be a UUID (received {mb_id!r}); copy it from the Krea website.") + mb_strength = model.get("moodboard_strength") + moodboards = [KreaMoodboard(id=mb_id, strength=0.35 if mb_strength is None else float(mb_strength))] + + style_reference = model.get("style_reference") + image_style_references: list[KreaImageStyleReference] | None = None + if style_reference: + if len(style_reference) > 10: + raise ValueError(f"Krea 2 accepts at most 10 image_style_references; received {len(style_reference)}.") + image_style_references = [ + KreaImageStyleReference(url=ref["url"], strength=float(ref["strength"])) for ref in style_reference + ] + initial = await sync_op( + cls, + ApiEndpoint(path=endpoint_path, method="POST"), + response_model=KreaJob, + data=KreaGenerateImageRequest( + prompt=prompt, + aspect_ratio=model["aspect_ratio"], + resolution=model["resolution"], + seed=seed, + creativity=model["creativity"], + moodboards=moodboards, + image_style_references=image_style_references, + ), + ) + job = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/krea/jobs/{initial.job_id}", method="GET"), + response_model=KreaJob, + status_extractor=lambda r: r.status, + queued_statuses=_KREA_QUEUED_STATUSES, + ) + if not job.result or not job.result.urls: + raise RuntimeError(f"Krea 2 job {job.job_id} completed without any image 
URLs.") + image = await download_url_to_image_tensor(job.result.urls[0]) + return IO.NodeOutput(image) + + +class Krea2StyleReferenceNode(IO.ComfyNode): + + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="Krea2StyleReferenceNode", + display_name="Krea 2 Style Reference", + category="image/partner/Krea", + description=( + "Add an image style reference to a Krea 2 generation. Chain multiple Krea 2 " + "Style Reference nodes (max 10) and feed the final `style_reference` output " + "into Krea 2 Image. Each image is uploaded to ComfyAPI storage and passed as URL." + ), + inputs=[ + IO.Image.Input( + "image", + tooltip="Reference image whose style influences the generation.", + ), + IO.Float.Input( + "strength", + default=1.0, + min=-2.0, + max=2.0, + step=0.05, + tooltip="Reference strength; negative values invert the style influence.", + ), + IO.Custom(KreaIO.STYLE_REF).Input( + "style_reference", + optional=True, + tooltip="Optional incoming chain of style references; this node appends one more.", + ), + ], + outputs=[IO.Custom(KreaIO.STYLE_REF).Output(display_name="style_reference")], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + ) + + @classmethod + async def execute( + cls, + image: Input.Image, + strength: float, + style_reference: list[dict] | None = None, + ) -> IO.NodeOutput: + chain: list[dict] = list(style_reference) if style_reference else [] + if len(chain) >= 10: + raise ValueError("Krea 2 accepts at most 10 image_style_references in one generation.") + url = await _upload_image_to_krea_assets(cls, image) + chain.append({"url": url, "strength": float(strength)}) + return IO.NodeOutput(chain) + + +class KreaExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + Krea2ImageNode, + Krea2StyleReferenceNode, + ] + + +async def comfy_entrypoint() -> KreaExtension: + return KreaExtension() diff --git 
a/comfy_api_nodes/nodes_ltxv.py b/comfy_api_nodes/nodes_ltxv.py index 0a219af96..01791d354 100644 --- a/comfy_api_nodes/nodes_ltxv.py +++ b/comfy_api_nodes/nodes_ltxv.py @@ -50,7 +50,7 @@ class TextToVideoNode(IO.ComfyNode): return IO.Schema( node_id="LtxvApiTextToVideo", display_name="LTXV Text To Video", - category="api node/video/LTXV", + category="video/partner/LTXV", description="Professional-quality videos with customizable duration and resolution.", inputs=[ IO.Combo.Input("model", options=list(MODELS_MAP.keys())), @@ -127,7 +127,7 @@ class ImageToVideoNode(IO.ComfyNode): return IO.Schema( node_id="LtxvApiImageToVideo", display_name="LTXV Image To Video", - category="api node/video/LTXV", + category="video/partner/LTXV", description="Professional-quality videos with customizable duration and resolution based on start image.", inputs=[ IO.Image.Input("image", tooltip="First frame to be used for the video."), diff --git a/comfy_api_nodes/nodes_luma.py b/comfy_api_nodes/nodes_luma.py index d92a7c382..08ae9904c 100644 --- a/comfy_api_nodes/nodes_luma.py +++ b/comfy_api_nodes/nodes_luma.py @@ -46,7 +46,7 @@ class LumaReferenceNode(IO.ComfyNode): return IO.Schema( node_id="LumaReferenceNode", display_name="Luma Reference", - category="api node/image/Luma", + category="image/partner/Luma", description="Holds an image and weight for use with Luma Generate Image node.", inputs=[ IO.Image.Input( @@ -85,7 +85,7 @@ class LumaConceptsNode(IO.ComfyNode): return IO.Schema( node_id="LumaConceptsNode", display_name="Luma Concepts", - category="api node/video/Luma", + category="video/partner/Luma", description="Camera Concepts for use with Luma Text to Video and Luma Image to Video nodes.", inputs=[ IO.Combo.Input( @@ -134,7 +134,7 @@ class LumaImageGenerationNode(IO.ComfyNode): return IO.Schema( node_id="LumaImageNode", display_name="Luma Text to Image", - category="api node/image/Luma", + category="image/partner/Luma", description="Generates images synchronously based on 
prompt and aspect ratio.", inputs=[ IO.String.Input( @@ -278,7 +278,7 @@ class LumaImageModifyNode(IO.ComfyNode): return IO.Schema( node_id="LumaImageModifyNode", display_name="Luma Image to Image", - category="api node/image/Luma", + category="image/partner/Luma", description="Modifies images synchronously based on prompt and aspect ratio.", inputs=[ IO.Image.Input( @@ -371,7 +371,7 @@ class LumaTextToVideoGenerationNode(IO.ComfyNode): return IO.Schema( node_id="LumaVideoNode", display_name="Luma Text to Video", - category="api node/video/Luma", + category="video/partner/Luma", description="Generates videos synchronously based on prompt and output_size.", inputs=[ IO.String.Input( @@ -472,7 +472,7 @@ class LumaImageToVideoGenerationNode(IO.ComfyNode): return IO.Schema( node_id="LumaImageToVideoNode", display_name="Luma Image to Video", - category="api node/video/Luma", + category="video/partner/Luma", description="Generates videos synchronously based on prompt, input images, and output_size.", inputs=[ IO.String.Input( @@ -724,7 +724,7 @@ class LumaImageNode(IO.ComfyNode): return IO.Schema( node_id="LumaImageNode2", display_name="Luma UNI-1 Image", - category="api node/image/Luma", + category="image/partner/Luma", description="Generate images from text using the Luma UNI-1 model.", inputs=[ IO.String.Input( @@ -853,7 +853,7 @@ class LumaImageEditNode(IO.ComfyNode): return IO.Schema( node_id="LumaImageEditNode2", display_name="Luma UNI-1 Image Edit", - category="api node/image/Luma", + category="image/partner/Luma", description="Edit an existing image with a text prompt using the Luma UNI-1 model.", inputs=[ IO.Image.Input( diff --git a/comfy_api_nodes/nodes_magnific.py b/comfy_api_nodes/nodes_magnific.py index 38b881fea..a6aeb194a 100644 --- a/comfy_api_nodes/nodes_magnific.py +++ b/comfy_api_nodes/nodes_magnific.py @@ -61,7 +61,7 @@ class MagnificImageUpscalerCreativeNode(IO.ComfyNode): return IO.Schema( node_id="MagnificImageUpscalerCreativeNode", 
display_name="Magnific Image Upscale (Creative)", - category="api node/image/Magnific", + category="image/partner/Magnific", description="Prompt‑guided enhancement, stylization, and 2x/4x/8x/16x upscaling. " "Maximum output: 25.3 megapixels.", inputs=[ @@ -240,7 +240,7 @@ class MagnificImageUpscalerPreciseV2Node(IO.ComfyNode): return IO.Schema( node_id="MagnificImageUpscalerPreciseV2Node", display_name="Magnific Image Upscale (Precise V2)", - category="api node/image/Magnific", + category="image/partner/Magnific", description="High-fidelity upscaling with fine control over sharpness, grain, and detail. " "Maximum output: 10060×10060 pixels.", inputs=[ @@ -400,7 +400,7 @@ class MagnificImageStyleTransferNode(IO.ComfyNode): return IO.Schema( node_id="MagnificImageStyleTransferNode", display_name="Magnific Image Style Transfer", - category="api node/image/Magnific", + category="image/partner/Magnific", description="Transfer the style from a reference image to your input image.", inputs=[ IO.Image.Input("image", tooltip="The image to apply style transfer to."), @@ -549,7 +549,7 @@ class MagnificImageRelightNode(IO.ComfyNode): return IO.Schema( node_id="MagnificImageRelightNode", display_name="Magnific Image Relight", - category="api node/image/Magnific", + category="image/partner/Magnific", description="Relight an image with lighting adjustments and optional reference-based light transfer.", inputs=[ IO.Image.Input("image", tooltip="The image to relight."), @@ -789,7 +789,7 @@ class MagnificImageSkinEnhancerNode(IO.ComfyNode): return IO.Schema( node_id="MagnificImageSkinEnhancerNode", display_name="Magnific Image Skin Enhancer", - category="api node/image/Magnific", + category="image/partner/Magnific", description="Skin enhancement for portraits with multiple processing modes.", inputs=[ IO.Image.Input("image", tooltip="The portrait image to enhance."), diff --git a/comfy_api_nodes/nodes_meshy.py b/comfy_api_nodes/nodes_meshy.py index 3cf577f4a..4fb670404 100644 --- 
a/comfy_api_nodes/nodes_meshy.py +++ b/comfy_api_nodes/nodes_meshy.py @@ -33,7 +33,7 @@ class MeshyTextToModelNode(IO.ComfyNode): return IO.Schema( node_id="MeshyTextToModelNode", display_name="Meshy: Text to Model", - category="api node/3d/Meshy", + category="3d/partner/Meshy", inputs=[ IO.Combo.Input("model", options=["latest"]), IO.String.Input("prompt", multiline=True, default=""), @@ -145,7 +145,7 @@ class MeshyRefineNode(IO.ComfyNode): return IO.Schema( node_id="MeshyRefineNode", display_name="Meshy: Refine Draft Model", - category="api node/3d/Meshy", + category="3d/partner/Meshy", description="Refine a previously created draft model.", inputs=[ IO.Combo.Input("model", options=["latest"]), @@ -240,7 +240,7 @@ class MeshyImageToModelNode(IO.ComfyNode): return IO.Schema( node_id="MeshyImageToModelNode", display_name="Meshy: Image to Model", - category="api node/3d/Meshy", + category="3d/partner/Meshy", inputs=[ IO.Combo.Input("model", options=["latest"]), IO.Image.Input("image"), @@ -405,7 +405,7 @@ class MeshyMultiImageToModelNode(IO.ComfyNode): return IO.Schema( node_id="MeshyMultiImageToModelNode", display_name="Meshy: Multi-Image to Model", - category="api node/3d/Meshy", + category="3d/partner/Meshy", inputs=[ IO.Combo.Input("model", options=["latest"]), IO.Autogrow.Input( @@ -575,7 +575,7 @@ class MeshyRigModelNode(IO.ComfyNode): return IO.Schema( node_id="MeshyRigModelNode", display_name="Meshy: Rig Model", - category="api node/3d/Meshy", + category="3d/partner/Meshy", description="Provides a rigged character in standard formats. 
" "Auto-rigging is currently not suitable for untextured meshes, non-humanoid assets, " "or humanoid assets with unclear limb and body structure.", @@ -656,7 +656,7 @@ class MeshyAnimateModelNode(IO.ComfyNode): return IO.Schema( node_id="MeshyAnimateModelNode", display_name="Meshy: Animate Model", - category="api node/3d/Meshy", + category="3d/partner/Meshy", description="Apply a specific animation action to a previously rigged character.", inputs=[ IO.Custom("MESHY_RIGGED_TASK_ID").Input("rig_task_id"), @@ -722,7 +722,7 @@ class MeshyTextureNode(IO.ComfyNode): return IO.Schema( node_id="MeshyTextureNode", display_name="Meshy: Texture Model", - category="api node/3d/Meshy", + category="3d/partner/Meshy", inputs=[ IO.Combo.Input("model", options=["latest"]), IO.Custom("MESHY_TASK_ID").Input("meshy_task_id"), diff --git a/comfy_api_nodes/nodes_minimax.py b/comfy_api_nodes/nodes_minimax.py index b5d0b461f..338584148 100644 --- a/comfy_api_nodes/nodes_minimax.py +++ b/comfy_api_nodes/nodes_minimax.py @@ -101,7 +101,7 @@ class MinimaxTextToVideoNode(IO.ComfyNode): return IO.Schema( node_id="MinimaxTextToVideoNode", display_name="MiniMax Text to Video", - category="api node/video/MiniMax", + category="video/partner/MiniMax", description="Generates videos synchronously based on a prompt, and optional parameters.", inputs=[ IO.String.Input( @@ -163,7 +163,7 @@ class MinimaxImageToVideoNode(IO.ComfyNode): return IO.Schema( node_id="MinimaxImageToVideoNode", display_name="MiniMax Image to Video", - category="api node/video/MiniMax", + category="video/partner/MiniMax", description="Generates videos synchronously based on an image and prompt, and optional parameters.", inputs=[ IO.Image.Input( @@ -230,7 +230,7 @@ class MinimaxSubjectToVideoNode(IO.ComfyNode): return IO.Schema( node_id="MinimaxSubjectToVideoNode", display_name="MiniMax Subject to Video", - category="api node/video/MiniMax", + category="video/partner/MiniMax", description="Generates videos synchronously based on 
an image and prompt, and optional parameters.", inputs=[ IO.Image.Input( @@ -294,7 +294,7 @@ class MinimaxHailuoVideoNode(IO.ComfyNode): return IO.Schema( node_id="MinimaxHailuoVideoNode", display_name="MiniMax Hailuo Video", - category="api node/video/MiniMax", + category="video/partner/MiniMax", description="Generates videos from prompt, with optional start frame using the new MiniMax Hailuo-02 model.", inputs=[ IO.String.Input( diff --git a/comfy_api_nodes/nodes_openai.py b/comfy_api_nodes/nodes_openai.py index a5a188634..48c739dfe 100644 --- a/comfy_api_nodes/nodes_openai.py +++ b/comfy_api_nodes/nodes_openai.py @@ -99,7 +99,7 @@ class OpenAIDalle2(IO.ComfyNode): return IO.Schema( node_id="OpenAIDalle2", display_name="OpenAI DALL·E 2", - category="api node/image/OpenAI", + category="image/partner/OpenAI", description="Generates images synchronously via OpenAI's DALL·E 2 endpoint.", inputs=[ IO.String.Input( @@ -249,7 +249,7 @@ class OpenAIDalle3(IO.ComfyNode): return IO.Schema( node_id="OpenAIDalle3", display_name="OpenAI DALL·E 3", - category="api node/image/OpenAI", + category="image/partner/OpenAI", description="Generates images synchronously via OpenAI's DALL·E 3 endpoint.", inputs=[ IO.String.Input( @@ -371,7 +371,7 @@ class OpenAIGPTImage1(IO.ComfyNode): return IO.Schema( node_id="OpenAIGPTImage1", display_name="OpenAI GPT Image 2", - category="api node/image/OpenAI", + category="image/partner/OpenAI", description="Generates images synchronously via OpenAI's GPT Image endpoint.", is_deprecated=True, inputs=[ @@ -695,7 +695,7 @@ class OpenAIGPTImageNodeV2(IO.ComfyNode): return IO.Schema( node_id="OpenAIGPTImageNodeV2", display_name="OpenAI GPT Image 2", - category="api node/image/OpenAI", + category="image/partner/OpenAI", description="Generates images via OpenAI's GPT Image endpoint.", inputs=[ IO.String.Input( @@ -962,7 +962,7 @@ class OpenAIChatNode(IO.ComfyNode): return IO.Schema( node_id="OpenAIChatNode", display_name="OpenAI ChatGPT", - category="api 
node/text/OpenAI", + category="text/partner/OpenAI", essentials_category="Text Generation", description="Generate text responses from an OpenAI model.", inputs=[ @@ -1201,7 +1201,7 @@ class OpenAIInputFiles(IO.ComfyNode): return IO.Schema( node_id="OpenAIInputFiles", display_name="OpenAI ChatGPT Input Files", - category="api node/text/OpenAI", + category="text/partner/OpenAI", description="Loads and prepares input files (text, pdf, etc.) to include as inputs for the OpenAI Chat Node. The files will be read by the OpenAI model when generating a response. 🛈 TIP: Can be chained together with other OpenAI Input File nodes.", inputs=[ IO.Combo.Input( @@ -1248,7 +1248,7 @@ class OpenAIChatConfig(IO.ComfyNode): return IO.Schema( node_id="OpenAIChatConfig", display_name="OpenAI ChatGPT Advanced Options", - category="api node/text/OpenAI", + category="text/partner/OpenAI", description="Allows specifying advanced configuration options for the OpenAI Chat Nodes.", inputs=[ IO.Combo.Input( diff --git a/comfy_api_nodes/nodes_openrouter.py b/comfy_api_nodes/nodes_openrouter.py new file mode 100644 index 000000000..d2ebbef0d --- /dev/null +++ b/comfy_api_nodes/nodes_openrouter.py @@ -0,0 +1,374 @@ +"""API Nodes for OpenRouter LLM chat completions.""" + +from dataclasses import dataclass +from typing import Literal + +from typing_extensions import override + +from comfy_api.latest import IO, ComfyExtension, Input +from comfy_api_nodes.apis.openrouter import ( + OpenRouterChatRequest, + OpenRouterChatResponse, + OpenRouterContentBlock, + OpenRouterImageContent, + OpenRouterImageUrl, + OpenRouterMessage, + OpenRouterReasoningConfig, + OpenRouterTextContent, + OpenRouterVideoContent, + OpenRouterVideoUrl, + OpenRouterWebSearchOptions, +) +from comfy_api_nodes.util import ( + ApiEndpoint, + get_number_of_images, + sync_op, + upload_images_to_comfyapi, + upload_video_to_comfyapi, + validate_string, +) + +OPENROUTER_CHAT_ENDPOINT = "/proxy/openrouter/api/v1/chat/completions" + + 
+Profile = Literal["standard", "reasoning", "frontier_reasoning", "perplexity", "perplexity_reasoning"] + + +@dataclass(frozen=True) +class _ModelSpec: + slug: str # exact OpenRouter model id + profile: Profile + price_in: float # USD per token (prompt) + price_out: float # USD per token (completion) + max_images: int = 0 # 0 = no image input; otherwise max URL-passed images supported + max_videos: int = 0 # 0 = no video input; otherwise max URL-passed videos supported + + +MODELS: list[_ModelSpec] = [ + _ModelSpec("anthropic/claude-opus-4.7", "frontier_reasoning", 0.000005, 0.000025, max_images=20), + _ModelSpec("openai/gpt-5.5-pro", "frontier_reasoning", 0.00003, 0.00018, max_images=20), + _ModelSpec("openai/gpt-5.5", "frontier_reasoning", 0.000005, 0.00003, max_images=20), + _ModelSpec("google/gemini-3.5-flash", "reasoning", 0.0000015, 0.000009, max_images=20, max_videos=4), + _ModelSpec("x-ai/grok-4.20", "reasoning", 0.00000125, 0.0000025, max_images=20), + _ModelSpec("x-ai/grok-4.3", "reasoning", 0.00000125, 0.0000025, max_images=20), + _ModelSpec("deepseek/deepseek-v4-pro", "reasoning", 0.000000435, 0.00000087), + _ModelSpec("deepseek/deepseek-v4-flash", "reasoning", 0.000000112, 0.000000224), + _ModelSpec("deepseek/deepseek-v3.2", "reasoning", 0.000000252, 0.000000378), + _ModelSpec("qwen/qwen3.6-max-preview", "reasoning", 0.00000104, 0.00000624), + _ModelSpec("qwen/qwen3.6-plus", "reasoning", 0.000000325, 0.00000195, max_images=10, max_videos=4), + _ModelSpec("qwen/qwen3.6-flash", "reasoning", 0.0000001875, 0.000001125, max_images=10, max_videos=4), + _ModelSpec("mistralai/mistral-large-2512", "standard", 0.0000005, 0.0000015, max_images=8), + _ModelSpec("mistralai/mistral-medium-3-5", "reasoning", 0.0000015, 0.0000075, max_images=8), + _ModelSpec("z-ai/glm-4.6", "reasoning", 0.00000043, 0.00000174), + _ModelSpec("z-ai/glm-5", "reasoning", 0.0000006, 0.00000192), + _ModelSpec("moonshotai/kimi-k2.6", "reasoning", 0.00000073, 0.00000349, max_images=10), + 
_ModelSpec("moonshotai/kimi-k2-thinking", "reasoning", 0.0000006, 0.0000025), + _ModelSpec("perplexity/sonar-pro", "perplexity", 0.000003, 0.000015), + _ModelSpec("perplexity/sonar-reasoning-pro", "perplexity_reasoning", 0.000002, 0.000008), + _ModelSpec("perplexity/sonar-deep-research", "perplexity_reasoning", 0.000002, 0.000008), +] + +_MODELS_BY_SLUG: dict[str, _ModelSpec] = {m.slug: m for m in MODELS} +_REASONING_EFFORTS = ["off", "low", "medium", "high"] +_SEARCH_CONTEXT_SIZES = ["low", "medium", "high"] + + +def _reasoning_extra_inputs() -> list: + return [ + IO.Combo.Input( + "reasoning_effort", + options=_REASONING_EFFORTS, + default="off", + tooltip="Reasoning effort. 'off' disables reasoning entirely.", + advanced=True, + ), + ] + + +def _perplexity_extra_inputs() -> list: + return [ + IO.Combo.Input( + "search_context_size", + options=_SEARCH_CONTEXT_SIZES, + default="medium", + tooltip="How much web search context to retrieve. Larger = more grounded but slower/pricier.", + advanced=True, + ), + ] + + +def _profile_inputs(profile: Profile) -> list: + if profile == "standard": + return [] + if profile in ("reasoning", "frontier_reasoning"): + return _reasoning_extra_inputs() + if profile == "perplexity": + return _perplexity_extra_inputs() + if profile == "perplexity_reasoning": + return _perplexity_extra_inputs() + _reasoning_extra_inputs() + raise ValueError(f"Unknown profile: {profile}") + + +def _media_inputs(spec: _ModelSpec) -> list: + extras: list = [] + if spec.max_images > 0: + extras.append( + IO.Autogrow.Input( + "images", + template=IO.Autogrow.TemplateNames( + IO.Image.Input("image"), + names=[f"image_{i}" for i in range(1, spec.max_images + 1)], + min=0, + ), + tooltip=f"Optional reference image(s) — up to {spec.max_images}. 
Sent as URLs.", + ) + ) + if spec.max_videos > 0: + extras.append( + IO.Autogrow.Input( + "videos", + template=IO.Autogrow.TemplateNames( + IO.Video.Input("video"), + names=[f"video_{i}" for i in range(1, spec.max_videos + 1)], + min=0, + ), + tooltip=f"Optional reference video(s) — up to {spec.max_videos}. Sent as URLs.", + ) + ) + return extras + + +def _inputs_for_model(spec: _ModelSpec) -> list: + return _profile_inputs(spec.profile) + _media_inputs(spec) + + +def _build_model_options() -> list[IO.DynamicCombo.Option]: + return [IO.DynamicCombo.Option(spec.slug, _inputs_for_model(spec)) for spec in MODELS] + + +def _calculate_price(response: OpenRouterChatResponse) -> float | None: + if response.usage and response.usage.cost is not None: + return float(response.usage.cost) + return None + + +def _price_badge_jsonata() -> str: + rates_pairs = [] + for spec in MODELS: + prompt_per_1k = spec.price_in * 1000 + completion_per_1k = spec.price_out * 1000 + rates_pairs.append(f' "{spec.slug}": [{prompt_per_1k:.8g}, {completion_per_1k:.8g}]') + rates_block = ",\n".join(rates_pairs) + return ( + "(\n" + " $rates := {\n" + f"{rates_block}\n" + " };\n" + " $r := $lookup($rates, widgets.model);\n" + " $r ? 
{\n" + ' "type": "list_usd",\n' + ' "usd": $r,\n' + ' "format": { "approximate": true, "separator": "-", "suffix": " per 1K tokens" }\n' + ' } : {"type": "text", "text": "Token-based"}\n' + ")" + ) + + +async def _build_image_blocks( + cls: type[IO.ComfyNode], spec: _ModelSpec, images: list[Input.Image] +) -> list[OpenRouterImageContent]: + urls = await upload_images_to_comfyapi( + cls, + images, + max_images=spec.max_images, + total_pixels=2048 * 2048, + mime_type="image/png", + wait_label="Uploading reference images", + ) + return [OpenRouterImageContent(image_url=OpenRouterImageUrl(url=url)) for url in urls] + + +async def _build_video_blocks(cls: type[IO.ComfyNode], videos: list[Input.Video]) -> list[OpenRouterVideoContent]: + blocks: list[OpenRouterVideoContent] = [] + total = len(videos) + for idx, video in enumerate(videos): + label = "Uploading reference video" + if total > 1: + label = f"{label} ({idx + 1}/{total})" + url = await upload_video_to_comfyapi(cls, video, wait_label=label) + blocks.append(OpenRouterVideoContent(video_url=OpenRouterVideoUrl(url=url))) + return blocks + + +def _user_message(prompt: str, media_blocks: list[OpenRouterContentBlock]) -> OpenRouterMessage: + if not media_blocks: + return OpenRouterMessage(role="user", content=prompt) + blocks: list[OpenRouterContentBlock] = list(media_blocks) + blocks.append(OpenRouterTextContent(text=prompt)) + return OpenRouterMessage(role="user", content=blocks) + + +def _build_messages( + system_prompt: str, prompt: str, media_blocks: list[OpenRouterContentBlock] +) -> list[OpenRouterMessage]: + messages: list[OpenRouterMessage] = [] + if system_prompt: + messages.append(OpenRouterMessage(role="system", content=system_prompt)) + messages.append(_user_message(prompt, media_blocks)) + return messages + + +def _build_request( + slug: str, + system_prompt: str, + prompt: str, + media_blocks: list[OpenRouterContentBlock], + *, + seed: int, + reasoning_effort: str | None, + search_context_size: str | 
None, +) -> OpenRouterChatRequest: + reasoning_cfg: OpenRouterReasoningConfig | None = None + if reasoning_effort and reasoning_effort != "off": + # exclude=True asks providers to reason internally but not return the trace + reasoning_cfg = OpenRouterReasoningConfig(effort=reasoning_effort, exclude=True) + web_search_cfg: OpenRouterWebSearchOptions | None = None + if search_context_size: + web_search_cfg = OpenRouterWebSearchOptions(search_context_size=search_context_size) + return OpenRouterChatRequest( + model=slug, + messages=_build_messages(system_prompt, prompt, media_blocks), + seed=seed if seed > 0 else None, + reasoning=reasoning_cfg, + web_search_options=web_search_cfg, + ) + + +def _extract_text(response: OpenRouterChatResponse) -> str: + if response.error: + code = response.error.code if response.error.code is not None else "unknown" + raise ValueError(f"OpenRouter error ({code}): {response.error.message or 'no message'}") + if not response.choices: + raise ValueError("Empty response from OpenRouter (no choices).") + message = response.choices[0].message + if not message: + raise ValueError("Empty response from OpenRouter (no message).") + if message.refusal: + raise ValueError(f"Model refused to respond: {message.refusal}") + return message.content or "" + + +class OpenRouterLLMNode(IO.ComfyNode): + + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="OpenRouterLLMNode", + display_name="OpenRouter LLM", + category="text/partner/OpenRouter", + essentials_category="Text Generation", + description=( + "Generate text responses through OpenRouter. Routes to a curated set of popular " + "models from xAI, DeepSeek, Qwen, Mistral, Z.AI (GLM), Moonshot (Kimi), and " + "Perplexity Sonar." 
+ ), + inputs=[ + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Text input to the model.", + ), + IO.DynamicCombo.Input( + "model", + options=_build_model_options(), + tooltip="The OpenRouter model used to generate the response.", + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + control_after_generate=True, + tooltip="Seed for sampling. Set to 0 to omit. Most models treat this as a hint only.", + ), + IO.String.Input( + "system_prompt", + multiline=True, + default="", + optional=True, + advanced=True, + tooltip="Foundational instructions that dictate the model's behavior.", + ), + ], + outputs=[IO.String.Output()], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends(widgets=["model"]), + expr=_price_badge_jsonata(), + ), + ) + + @classmethod + async def execute( + cls, + prompt: str, + model: dict, + seed: int, + system_prompt: str = "", + ) -> IO.NodeOutput: + validate_string(prompt, strip_whitespace=True, min_length=1) + slug: str = model["model"] + spec = _MODELS_BY_SLUG.get(slug) + if spec is None: + raise ValueError(f"Unknown OpenRouter model: {slug}") + + reasoning_effort: str | None = model.get("reasoning_effort") + search_context_size: str | None = model.get("search_context_size") + + image_tensors: list[Input.Image] = [t for t in (model.get("images") or {}).values() if t is not None] + if image_tensors and sum(get_number_of_images(t) for t in image_tensors) > spec.max_images: + raise ValueError(f"Up to {spec.max_images} images are supported for {slug}.") + video_inputs: list[Input.Video] = [v for v in (model.get("videos") or {}).values() if v is not None] + if video_inputs and len(video_inputs) > spec.max_videos: + raise ValueError(f"Up to {spec.max_videos} videos are supported for {slug}.") + + media_blocks: list[OpenRouterContentBlock] = [] + if image_tensors: + 
media_blocks.extend(await _build_image_blocks(cls, spec, image_tensors)) + if video_inputs: + media_blocks.extend(await _build_video_blocks(cls, video_inputs)) + + request = _build_request( + slug, + system_prompt, + prompt, + media_blocks, + seed=seed, + reasoning_effort=reasoning_effort, + search_context_size=search_context_size, + ) + + response = await sync_op( + cls, + ApiEndpoint(path=OPENROUTER_CHAT_ENDPOINT, method="POST"), + response_model=OpenRouterChatResponse, + data=request, + price_extractor=_calculate_price, + ) + return IO.NodeOutput(_extract_text(response)) + + +class OpenRouterExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [OpenRouterLLMNode] + + +async def comfy_entrypoint() -> OpenRouterExtension: + return OpenRouterExtension() diff --git a/comfy_api_nodes/nodes_pixverse.py b/comfy_api_nodes/nodes_pixverse.py index e17a24ae7..3861cfedd 100644 --- a/comfy_api_nodes/nodes_pixverse.py +++ b/comfy_api_nodes/nodes_pixverse.py @@ -53,7 +53,7 @@ class PixverseTemplateNode(IO.ComfyNode): return IO.Schema( node_id="PixverseTemplateNode", display_name="PixVerse Template", - category="api node/video/PixVerse", + category="video/partner/PixVerse", inputs=[ IO.Combo.Input("template", options=list(pixverse_templates.keys())), ], @@ -74,7 +74,7 @@ class PixverseTextToVideoNode(IO.ComfyNode): return IO.Schema( node_id="PixverseTextToVideoNode", display_name="PixVerse Text to Video", - category="api node/video/PixVerse", + category="video/partner/PixVerse", description="Generates videos based on prompt and output_size.", inputs=[ IO.String.Input( @@ -192,7 +192,7 @@ class PixverseImageToVideoNode(IO.ComfyNode): return IO.Schema( node_id="PixverseImageToVideoNode", display_name="PixVerse Image to Video", - category="api node/video/PixVerse", + category="video/partner/PixVerse", description="Generates videos based on prompt and output_size.", inputs=[ IO.Image.Input("image"), @@ -310,7 +310,7 @@ class 
PixverseTransitionVideoNode(IO.ComfyNode): return IO.Schema( node_id="PixverseTransitionVideoNode", display_name="PixVerse Transition Video", - category="api node/video/PixVerse", + category="video/partner/PixVerse", description="Generates videos based on prompt and output_size.", inputs=[ IO.Image.Input("first_frame"), diff --git a/comfy_api_nodes/nodes_quiver.py b/comfy_api_nodes/nodes_quiver.py index 3269c0afe..ad045a7ef 100644 --- a/comfy_api_nodes/nodes_quiver.py +++ b/comfy_api_nodes/nodes_quiver.py @@ -62,7 +62,7 @@ class QuiverTextToSVGNode(IO.ComfyNode): return IO.Schema( node_id="QuiverTextToSVGNode", display_name="Quiver Text to SVG", - category="api node/image/Quiver", + category="image/partner/Quiver", description="Generate an SVG from a text prompt using Quiver AI.", inputs=[ IO.String.Input( @@ -177,7 +177,7 @@ class QuiverImageToSVGNode(IO.ComfyNode): return IO.Schema( node_id="QuiverImageToSVGNode", display_name="Quiver Image to SVG", - category="api node/image/Quiver", + category="image/partner/Quiver", description="Vectorize a raster image into SVG using Quiver AI.", inputs=[ IO.Image.Input( diff --git a/comfy_api_nodes/nodes_recraft.py b/comfy_api_nodes/nodes_recraft.py index c60cfbc4a..07387821d 100644 --- a/comfy_api_nodes/nodes_recraft.py +++ b/comfy_api_nodes/nodes_recraft.py @@ -178,7 +178,7 @@ class RecraftColorRGBNode(IO.ComfyNode): return IO.Schema( node_id="RecraftColorRGB", display_name="Recraft Color RGB", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Create Recraft Color by choosing specific RGB values.", inputs=[ IO.Int.Input("r", default=0, min=0, max=255, tooltip="Red value of color."), @@ -204,7 +204,7 @@ class RecraftControlsNode(IO.ComfyNode): return IO.Schema( node_id="RecraftControls", display_name="Recraft Controls", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Create Recraft Controls for customizing Recraft generation.", inputs=[ 
IO.Custom(RecraftIO.COLOR).Input("colors", optional=True), @@ -228,7 +228,7 @@ class RecraftStyleV3RealisticImageNode(IO.ComfyNode): return IO.Schema( node_id="RecraftStyleV3RealisticImage", display_name="Recraft Style - Realistic Image", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Select realistic_image style and optional substyle.", inputs=[ IO.Combo.Input("substyle", options=get_v3_substyles(cls.RECRAFT_STYLE)), @@ -253,7 +253,7 @@ class RecraftStyleV3DigitalIllustrationNode(RecraftStyleV3RealisticImageNode): return IO.Schema( node_id="RecraftStyleV3DigitalIllustration", display_name="Recraft Style - Digital Illustration", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Select realistic_image style and optional substyle.", inputs=[ IO.Combo.Input("substyle", options=get_v3_substyles(cls.RECRAFT_STYLE)), @@ -272,7 +272,7 @@ class RecraftStyleV3VectorIllustrationNode(RecraftStyleV3RealisticImageNode): return IO.Schema( node_id="RecraftStyleV3VectorIllustrationNode", display_name="Recraft Style - Realistic Image", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Select realistic_image style and optional substyle.", inputs=[ IO.Combo.Input("substyle", options=get_v3_substyles(cls.RECRAFT_STYLE)), @@ -291,7 +291,7 @@ class RecraftStyleV3LogoRasterNode(RecraftStyleV3RealisticImageNode): return IO.Schema( node_id="RecraftStyleV3LogoRaster", display_name="Recraft Style - Logo Raster", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Select realistic_image style and optional substyle.", inputs=[ IO.Combo.Input("substyle", options=get_v3_substyles(cls.RECRAFT_STYLE, include_none=False)), @@ -308,7 +308,7 @@ class RecraftStyleInfiniteStyleLibrary(IO.ComfyNode): return IO.Schema( node_id="RecraftStyleV3InfiniteStyleLibrary", display_name="Recraft Style - Infinite Style Library", - category="api node/image/Recraft", + 
category="image/partner/Recraft", description="Choose style based on preexisting UUID from Recraft's Infinite Style Library.", inputs=[ IO.String.Input("style_id", default="", tooltip="UUID of style from Infinite Style Library."), @@ -331,7 +331,7 @@ class RecraftCreateStyleNode(IO.ComfyNode): return IO.Schema( node_id="RecraftCreateStyleNode", display_name="Recraft Create Style", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Create a custom style from reference images. " "Upload 1-5 images to use as style references. " "Total size of all images is limited to 5 MB.", @@ -400,7 +400,7 @@ class RecraftTextToImageNode(IO.ComfyNode): return IO.Schema( node_id="RecraftTextToImageNode", display_name="Recraft Text to Image", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Generates images synchronously based on prompt and resolution.", inputs=[ IO.String.Input("prompt", multiline=True, default="", tooltip="Prompt for the image generation."), @@ -512,7 +512,7 @@ class RecraftImageToImageNode(IO.ComfyNode): return IO.Schema( node_id="RecraftImageToImageNode", display_name="Recraft Image to Image", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Modify image based on prompt and strength.", inputs=[ IO.Image.Input("image"), @@ -630,7 +630,7 @@ class RecraftImageInpaintingNode(IO.ComfyNode): return IO.Schema( node_id="RecraftImageInpaintingNode", display_name="Recraft Image Inpainting", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Modify image based on prompt and mask.", inputs=[ IO.Image.Input("image"), @@ -732,7 +732,7 @@ class RecraftTextToVectorNode(IO.ComfyNode): return IO.Schema( node_id="RecraftTextToVectorNode", display_name="Recraft Text to Vector", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Generates SVG synchronously based on prompt and resolution.", inputs=[ 
IO.String.Input("prompt", default="", tooltip="Prompt for the image generation.", multiline=True), @@ -832,7 +832,7 @@ class RecraftVectorizeImageNode(IO.ComfyNode): return IO.Schema( node_id="RecraftVectorizeImageNode", display_name="Recraft Vectorize Image", - category="api node/image/Recraft", + category="image/partner/Recraft", essentials_category="Image Tools", description="Generates SVG synchronously from an input image.", inputs=[ @@ -876,7 +876,7 @@ class RecraftReplaceBackgroundNode(IO.ComfyNode): return IO.Schema( node_id="RecraftReplaceBackgroundNode", display_name="Recraft Replace Background", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Replace background on image, based on provided prompt.", inputs=[ IO.Image.Input("image"), @@ -963,7 +963,7 @@ class RecraftRemoveBackgroundNode(IO.ComfyNode): return IO.Schema( node_id="RecraftRemoveBackgroundNode", display_name="Recraft Remove Background", - category="api node/image/Recraft", + category="image/partner/Recraft", essentials_category="Image Tools", description="Remove background from image, and return processed image and mask.", inputs=[ @@ -1012,7 +1012,7 @@ class RecraftCrispUpscaleNode(IO.ComfyNode): return IO.Schema( node_id="RecraftCrispUpscaleNode", display_name="Recraft Crisp Upscale Image", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Upscale image synchronously.\n" "Enhances a given raster image using ‘crisp upscale’ tool, " "increasing image resolution, making the image sharper and cleaner.", @@ -1058,7 +1058,7 @@ class RecraftCreativeUpscaleNode(RecraftCrispUpscaleNode): return IO.Schema( node_id="RecraftCreativeUpscaleNode", display_name="Recraft Creative Upscale Image", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Upscale image synchronously.\n" "Enhances a given raster image using ‘creative upscale’ tool, " "boosting resolution with a focus on refining small details and 
faces.", @@ -1086,7 +1086,7 @@ class RecraftV4TextToImageNode(IO.ComfyNode): return IO.Schema( node_id="RecraftV4TextToImageNode", display_name="Recraft V4 Text to Image", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Generates images using Recraft V4 or V4 Pro models.", inputs=[ IO.String.Input( @@ -1210,7 +1210,7 @@ class RecraftV4TextToVectorNode(IO.ComfyNode): return IO.Schema( node_id="RecraftV4TextToVectorNode", display_name="Recraft V4 Text to Vector", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Generates SVG using Recraft V4 or V4 Pro models.", inputs=[ IO.String.Input( diff --git a/comfy_api_nodes/nodes_reve.py b/comfy_api_nodes/nodes_reve.py index a87395394..2b15eadd7 100644 --- a/comfy_api_nodes/nodes_reve.py +++ b/comfy_api_nodes/nodes_reve.py @@ -109,7 +109,7 @@ class ReveImageCreateNode(IO.ComfyNode): return IO.Schema( node_id="ReveImageCreateNode", display_name="Reve Image Create", - category="api node/image/Reve", + category="image/partner/Reve", description="Generate images from text descriptions using Reve.", inputs=[ IO.String.Input( @@ -200,7 +200,7 @@ class ReveImageEditNode(IO.ComfyNode): return IO.Schema( node_id="ReveImageEditNode", display_name="Reve Image Edit", - category="api node/image/Reve", + category="image/partner/Reve", description="Edit images using natural language instructions with Reve.", inputs=[ IO.Image.Input("image", tooltip="The image to edit."), @@ -300,7 +300,7 @@ class ReveImageRemixNode(IO.ComfyNode): return IO.Schema( node_id="ReveImageRemixNode", display_name="Reve Image Remix", - category="api node/image/Reve", + category="image/partner/Reve", description="Combine reference images with text prompts to create new images using Reve.", inputs=[ IO.Autogrow.Input( diff --git a/comfy_api_nodes/nodes_rodin.py b/comfy_api_nodes/nodes_rodin.py index 2b829b8db..e14955661 100644 --- a/comfy_api_nodes/nodes_rodin.py +++ 
# Polygon-count preset label -> (mesh_mode, face count).
# "Triangle" presets use the "Raw" mesh mode; "Quad" presets use "Quad".
_QUALITY_MESH_OPTIONS: dict[str, tuple[str, int]] = {
    "4K-Quad": ("Quad", 4000),
    "8K-Quad": ("Quad", 8000),
    "18K-Quad": ("Quad", 18000),
    "50K-Quad": ("Quad", 50000),
    "200K-Quad": ("Quad", 200000),
    "2K-Triangle": ("Raw", 2000),
    "20K-Triangle": ("Raw", 20000),
    "150K-Triangle": ("Raw", 150000),
    "200K-Triangle": ("Raw", 200000),
    "500K-Triangle": ("Raw", 500000),
    "1M-Triangle": ("Raw", 1000000),
}


def get_quality_mode(poly_count: str) -> tuple[str, int]:
    """Map a polygon-count preset like '18K-Quad' to (mesh_mode, quality_override).

    Labels listed in ``_QUALITY_MESH_OPTIONS`` are returned directly. Any other
    label is resolved the way the previous if/elif implementation did it: the
    mesh mode and the face count are looked up independently.

    * mesh mode: ``"Raw"`` when the part after the first ``-`` is ``"Triangle"``,
      otherwise ``"Quad"``;
    * face count: the count belonging to the part before the first ``-`` when
      that count label is known, otherwise ``18000``.

    Unlike the previous implementation, a label without ``-`` (or a non-string
    value) does not raise; it yields ``("Quad", 18000)``.
    """
    preset = _QUALITY_MESH_OPTIONS.get(poly_count)
    if preset is not None:
        return preset
    if not isinstance(poly_count, str):
        return "Quad", 18000

    parts = poly_count.split("-")
    count_label = parts[0]
    mesh_label = parts[1] if len(parts) > 1 else ""

    mesh_mode = "Raw" if mesh_label == "Triangle" else "Quad"
    # Reuse the face counts already declared in the preset table ("4K" -> 4000, ...).
    faces = next(
        (count for label, (_, count) in _QUALITY_MESH_OPTIONS.items() if label.split("-")[0] == count_label),
        18000,
    )
    return mesh_mode, faces
return "DONE" return "Generating" + def extract_progress(response: Rodin3DCheckStatusResponse) -> int | None: if not response.jobs: return None @@ -214,7 +208,7 @@ async def download_files(url_list, task_uuid: str) -> tuple[str | None, Types.Fi model_file_path = None file_3d = None - for i in url_list.list: + for i in url_list.items: file_path = os.path.join(save_path, i.name) if i.name.lower().endswith(".glb"): model_file_path = os.path.join(result_folder_name, i.name) @@ -236,7 +230,7 @@ class Rodin3D_Regular(IO.ComfyNode): return IO.Schema( node_id="Rodin3D_Regular", display_name="Rodin 3D Generate - Regular Generate", - category="api node/3d/Rodin", + category="3d/partner/Rodin", description=cleandoc(cls.__doc__ or ""), inputs=[ IO.Image.Input("Images"), @@ -295,7 +289,7 @@ class Rodin3D_Detail(IO.ComfyNode): return IO.Schema( node_id="Rodin3D_Detail", display_name="Rodin 3D Generate - Detail Generate", - category="api node/3d/Rodin", + category="3d/partner/Rodin", description=cleandoc(cls.__doc__ or ""), inputs=[ IO.Image.Input("Images"), @@ -354,7 +348,7 @@ class Rodin3D_Smooth(IO.ComfyNode): return IO.Schema( node_id="Rodin3D_Smooth", display_name="Rodin 3D Generate - Smooth Generate", - category="api node/3d/Rodin", + category="3d/partner/Rodin", description=cleandoc(cls.__doc__ or ""), inputs=[ IO.Image.Input("Images"), @@ -412,7 +406,7 @@ class Rodin3D_Sketch(IO.ComfyNode): return IO.Schema( node_id="Rodin3D_Sketch", display_name="Rodin 3D Generate - Sketch Generate", - category="api node/3d/Rodin", + category="3d/partner/Rodin", description=cleandoc(cls.__doc__ or ""), inputs=[ IO.Image.Input("Images"), @@ -474,7 +468,7 @@ class Rodin3D_Gen2(IO.ComfyNode): return IO.Schema( node_id="Rodin3D_Gen2", display_name="Rodin 3D Generate - Gen-2 Generate", - category="api node/3d/Rodin", + category="3d/partner/Rodin", description=cleandoc(cls.__doc__ or ""), inputs=[ IO.Image.Input("Images"), @@ -489,7 +483,16 @@ class Rodin3D_Gen2(IO.ComfyNode): 
def _rodin_multipart_parser(data: dict[str, Any]) -> aiohttp.FormData:
    """Convert a Rodin request dict to an aiohttp form, fixing bool/list serialization.

    * ``None`` values (top-level or inside a list) are omitted.
    * Booleans become ``"true"`` / ``"false"``, including booleans inside lists.
    * Lists become one form field per element, all under the same key.
    * ``bytes`` / ``bytearray`` values are passed through unchanged.
    * Everything else is sent as ``str(value)``.
    """

    def _as_text(value: Any) -> str:
        # bool must be tested explicitly: str(True) would give "True".
        if isinstance(value, bool):
            return "true" if value else "false"
        return str(value)

    form = aiohttp.FormData(default_to_multipart=True)
    for key, value in data.items():
        if value is None:
            continue
        if isinstance(value, list):
            for item in value:
                if item is not None:
                    form.add_field(key, _as_text(item))
        elif isinstance(value, (bytes, bytearray)):
            form.add_field(key, value)
        else:
            form.add_field(key, _as_text(value))
    return form


async def _create_gen25_task(
    cls: type[IO.ComfyNode],
    request: Rodin3DGen25Request,
    images: list | None,
) -> tuple[str, str]:
    """Submit a Gen-2.5 generate job; returns (task_uuid, subscription_key).

    Each entry of ``images`` is either a filesystem path (``str``), which is
    opened for reading, or any other object, which is passed to
    ``tensor_to_filelike``. ``None`` entries are ignored.

    Raises:
        ValueError: if more than 5 non-``None`` images are supplied.
        RuntimeError: if the response lacks a uuid or a subscription key.
    """
    provided = [image for image in images if image is not None] if images else []
    if len(provided) > 5:
        raise ValueError("Rodin Gen-2.5 supports at most 5 input images.")

    # Handles opened here are closed once the request has finished (or failed),
    # so path inputs do not leak file descriptors.
    opened_files: list = []
    try:
        files = None
        if images:
            files = []
            for image in provided:
                if isinstance(image, str):
                    handle = open(image, "rb")
                    opened_files.append(handle)
                    files.append(("images", handle))
                else:
                    files.append(("images", tensor_to_filelike(image)))

        response = await sync_op(
            cls,
            ApiEndpoint(path="/proxy/rodin/api/v2/rodin", method="POST"),
            response_model=Rodin3DGenerateResponse,
            data=request,
            files=files,
            content_type="multipart/form-data",
            multipart_parser=_rodin_multipart_parser,
        )
    finally:
        for handle in opened_files:
            handle.close()

    if not response.uuid or not response.jobs or not response.jobs.subscription_key:
        raise RuntimeError(f"Rodin Gen-2.5 submit failed: message={response.message!r}")
    return response.uuid, response.jobs.subscription_key
def _build_mode_input(name: str = "mode") -> IO.DynamicCombo.Input:
    """Build the DynamicCombo that selects the generation mode.

    The combo has three options (Regular, Fast, Extreme-High); each option
    carries its own set of nested widgets.
    """
    # Regular: tier + preset polygon count + optional creative flag.
    regular_option = IO.DynamicCombo.Option(
        _MODE_REGULAR,
        [
            IO.Combo.Input(
                "tier",
                options=["Gen-2.5-Low", "Gen-2.5-Medium", "Gen-2.5-High"],
                default="Gen-2.5-High",
                tooltip="Quality tier. Higher tiers produce higher-fidelity geometry.",
            ),
            IO.Combo.Input(
                "polygon_count",
                options=_REGULAR_POLY_OPTIONS,
                default="Default",
                tooltip="Preset face count. 'Default' uses the server's default for the selected tier.",
            ),
            IO.Boolean.Input(
                "creative",
                default=False,
                tooltip="Creative mode (Medium/High only). Enhances generative robustness.",
            ),
        ],
    )

    # Fast: tier + explicit face count capped at 20K.
    fast_tiers = ["Gen-2.5-Extreme-Low", "Gen-2.5-Low", "Gen-2.5-Medium", "Gen-2.5-High"]
    fast_option = IO.DynamicCombo.Option(
        _MODE_FAST,
        [
            IO.Combo.Input("tier", options=fast_tiers, default="Gen-2.5-Low"),
            IO.Int.Input(
                "mesh_faces",
                default=20000,
                min=1000,
                max=20000,
                display_mode=IO.NumberDisplay.number,
                tooltip="Mesh face count (1K-20K in Fast mode).",
            ),
        ],
    )

    # Extreme-High: mesh mode + large face count + micro-detail and creative flags.
    extreme_faces_tooltip = (
        "Mesh face count. Raw mode: 20K-2M. "
        "Quad mode: keep under 200K (upstream may reject higher values)."
    )
    extreme_option = IO.DynamicCombo.Option(
        _MODE_EXTREME_HIGH,
        [
            IO.Combo.Input("mesh_mode", options=["Raw", "Quad"], default="Raw"),
            IO.Int.Input(
                "mesh_faces",
                default=1000000,
                min=20000,
                max=2000000,
                display_mode=IO.NumberDisplay.number,
                tooltip=extreme_faces_tooltip,
            ),
            IO.Boolean.Input(
                "is_micro",
                default=False,
                tooltip="Enable micro detail (Extreme-High only).",
            ),
            IO.Boolean.Input(
                "creative",
                default=False,
                tooltip="Creative mode. Enhances generative robustness.",
            ),
        ],
    )

    mode_tooltip = (
        "Generation mode. Regular = balanced. Fast = 1K-20K faces for rapid prototyping. "
        "Extreme-High = 20K-2M faces with optional micro details."
    )
    return IO.DynamicCombo.Input(
        name,
        options=[regular_option, fast_option, extreme_option],
        tooltip=mode_tooltip,
    )
def _resolve_mode_params(mode_input: dict) -> dict:
    """Translate the DynamicCombo `mode` payload into Gen-2.5 request fields.

    The result always contains ``tier``. It may also contain ``mesh_mode``,
    ``quality_override``, ``geometry_instruct_mode`` and ``is_micro``.
    Missing keys mean "do not send" (so we don't override server defaults).

    Raises:
        KeyError: if ``mode_input`` lacks ``"mode"`` or a field required by the
            selected mode (``tier``, ``mesh_faces``, ``mesh_mode``).
        ValueError: if the selected mode is not Regular, Fast or Extreme-High.
            Without this check the caller would fail later with a bare
            ``KeyError: 'tier'``.
    """
    selected = mode_input["mode"]

    if selected == _MODE_REGULAR:
        out: dict = {"tier": mode_input["tier"]}
        polygon = mode_input.get("polygon_count", "Default")
        # "Default" leaves mesh_mode / quality_override unset so the server decides.
        if polygon != "Default":
            mesh_mode, faces = get_quality_mode(polygon)
            out["mesh_mode"] = mesh_mode
            out["quality_override"] = faces
        if mode_input.get("creative"):
            out["geometry_instruct_mode"] = "creative"
        return out

    if selected == _MODE_FAST:
        # Fast mode always sends "Raw" together with an explicit face count.
        return {
            "tier": mode_input["tier"],
            "mesh_mode": "Raw",
            "quality_override": int(mode_input["mesh_faces"]),
        }

    if selected == _MODE_EXTREME_HIGH:
        out = {
            "tier": "Gen-2.5-Extreme-High",
            "mesh_mode": mode_input["mesh_mode"],
            "quality_override": int(mode_input["mesh_faces"]),
        }
        if mode_input.get("is_micro"):
            out["is_micro"] = True
        if mode_input.get("creative"):
            out["geometry_instruct_mode"] = "creative"
        return out

    raise ValueError(f"Unsupported Rodin Gen-2.5 mode: {selected!r}")
class Rodin3D_Gen25_Image(IO.ComfyNode):
    # Image-to-3D node for Rodin Gen-2.5: submits 1-5 reference images, waits for
    # the job to finish, and returns the downloaded model file.

    @classmethod
    def define_schema(cls) -> IO.Schema:
        """Declare the node's inputs, output, hidden auth fields and price badge."""
        image_slots = IO.Autogrow.Input(
            "images",
            template=IO.Autogrow.TemplatePrefix(IO.Image.Input("image"), prefix="image", min=1, max=5),
            tooltip="1-5 images. The first image is used for materials when multi-view.",
        )
        node_inputs = [
            image_slots,
            _build_mode_input(),
            *_build_common_inputs(include_image_only=True),
        ]
        hidden_fields = [
            IO.Hidden.auth_token_comfy_org,
            IO.Hidden.api_key_comfy_org,
            IO.Hidden.unique_id,
        ]
        badge = IO.PriceBadge(
            depends_on=IO.PriceBadgeDepends(widgets=["mode", "addon_highpack"]),
            expr=_PRICE_EXPR,
        )
        return IO.Schema(
            node_id="Rodin3D_Gen25_Image",
            display_name="Rodin 3D Gen-2.5 - Image to 3D",
            category="3d/partner/Rodin",
            description=(
                "Generate a 3D model from 1-5 reference images via Rodin Gen-2.5. "
                "Pick a mode (Fast / Regular / Extreme-High) to tune quality vs. cost."
            ),
            inputs=node_inputs,
            outputs=[IO.File3DAny.Output(display_name="model_file")],
            hidden=hidden_fields,
            is_api_node=True,
            price_badge=badge,
        )

    @classmethod
    async def execute(
        cls,
        images: IO.Autogrow.Type,
        mode: dict,
        material: str,
        geometry_file_format: str,
        texture_mode: str,
        seed: int,
        TAPose: bool,
        hd_texture: bool,
        texture_delight: bool,
        use_original_alpha: bool,
        addon_highpack: bool,
        bbox_width: int,
        bbox_height: int,
        bbox_length: int,
        height_cm: int,
    ) -> IO.NodeOutput:
        """Submit the images to Rodin Gen-2.5, wait for completion, and return the model.

        Raises:
            ValueError: if no image is connected, or if more than 5 frames remain
                after batched tensors are split into individual frames.
        """
        connected = [img for img in images.values() if img is not None]
        if not connected:
            raise ValueError("Rodin Gen-2.5 Image-to-3D requires at least one image.")

        # A 4-D tensor is split along its first axis into individual frames;
        # each frame is uploaded as a separate part.
        flat_images: list = []
        for tensor in connected:
            is_batched = hasattr(tensor, "shape") and len(tensor.shape) == 4
            if is_batched:
                flat_images.extend(tensor[idx] for idx in range(tensor.shape[0]))
            else:
                flat_images.append(tensor)

        if len(flat_images) > 5:
            raise ValueError(f"Rodin Gen-2.5 accepts at most 5 images; received {len(flat_images)}.")

        request = _build_request(
            mode_input=mode,
            material=material,
            geometry_file_format=geometry_file_format,
            texture_mode=texture_mode,
            seed=seed,
            TAPose=TAPose,
            hd_texture=hd_texture,
            texture_delight=texture_delight,
            addon_highpack=addon_highpack,
            bbox_width=bbox_width,
            bbox_height=bbox_height,
            bbox_length=bbox_length,
            height_cm=height_cm,
            prompt=None,
            use_original_alpha=use_original_alpha,
        )

        # Submit, poll until the job finishes, then fetch and download the results.
        task_uuid, subscription_key = await _create_gen25_task(cls, request, flat_images)
        await poll_for_task_status(subscription_key, cls)
        download_list = await get_rodin_download_list(task_uuid, cls)
        model_file = await _download_gen25_files(download_list, task_uuid, geometry_file_format)
        return IO.NodeOutput(model_file)
+ ), + inputs=[ + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Text prompt for the 3D model.", + ), + _build_mode_input(), + *_build_common_inputs(include_image_only=False), + ], + outputs=[IO.File3DAny.Output(display_name="model_file")], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends(widgets=["mode", "addon_highpack"]), + expr=_PRICE_EXPR, + ), + ) + + @classmethod + async def execute( + cls, + prompt: str, + mode: dict, + material: str, + geometry_file_format: str, + texture_mode: str, + seed: int, + TAPose: bool, + hd_texture: bool, + texture_delight: bool, + addon_highpack: bool, + bbox_width: int, + bbox_height: int, + bbox_length: int, + height_cm: int, + ) -> IO.NodeOutput: + validate_string(prompt, field_name="prompt", min_length=1, max_length=2500) + request = _build_request( + mode_input=mode, + material=material, + geometry_file_format=geometry_file_format, + texture_mode=texture_mode, + seed=seed, + TAPose=TAPose, + hd_texture=hd_texture, + texture_delight=texture_delight, + addon_highpack=addon_highpack, + bbox_width=bbox_width, + bbox_height=bbox_height, + bbox_length=bbox_length, + height_cm=height_cm, + prompt=prompt, + ) + task_uuid, subscription_key = await _create_gen25_task(cls, request, images=None) + await poll_for_task_status(subscription_key, cls) + download_list = await get_rodin_download_list(task_uuid, cls) + file_3d = await _download_gen25_files(download_list, task_uuid, geometry_file_format) + return IO.NodeOutput(file_3d) + + class Rodin3DExtension(ComfyExtension): @override async def get_node_list(self) -> list[type[IO.ComfyNode]]: @@ -551,6 +1114,8 @@ class Rodin3DExtension(ComfyExtension): Rodin3D_Smooth, Rodin3D_Sketch, Rodin3D_Gen2, + Rodin3D_Gen25_Image, + Rodin3D_Gen25_Text, ] diff --git a/comfy_api_nodes/nodes_runway.py b/comfy_api_nodes/nodes_runway.py index 
573170ba2..7357c733e 100644 --- a/comfy_api_nodes/nodes_runway.py +++ b/comfy_api_nodes/nodes_runway.py @@ -140,7 +140,7 @@ class RunwayImageToVideoNodeGen3a(IO.ComfyNode): return IO.Schema( node_id="RunwayImageToVideoNodeGen3a", display_name="Runway Image to Video (Gen3a Turbo)", - category="api node/video/Runway", + category="video/partner/Runway", description="Generate a video from a single starting frame using Gen3a Turbo model. " "Before diving in, review these best practices to ensure that " "your input selections will set your generation up for success: " @@ -234,7 +234,7 @@ class RunwayImageToVideoNodeGen4(IO.ComfyNode): return IO.Schema( node_id="RunwayImageToVideoNodeGen4", display_name="Runway Image to Video (Gen4 Turbo)", - category="api node/video/Runway", + category="video/partner/Runway", description="Generate a video from a single starting frame using Gen4 Turbo model. " "Before diving in, review these best practices to ensure that " "your input selections will set your generation up for success: " @@ -329,7 +329,7 @@ class RunwayFirstLastFrameNode(IO.ComfyNode): return IO.Schema( node_id="RunwayFirstLastFrameNode", display_name="Runway First-Last-Frame to Video", - category="api node/video/Runway", + category="video/partner/Runway", description="Upload first and last keyframes, draft a prompt, and generate a video. " "More complex transitions, such as cases where the Last frame is completely different " "from the First frame, may benefit from the longer 10s duration. " @@ -440,7 +440,7 @@ class RunwayTextToImageNode(IO.ComfyNode): return IO.Schema( node_id="RunwayTextToImageNode", display_name="Runway Text to Image", - category="api node/image/Runway", + category="image/partner/Runway", description="Generate an image from a text prompt using Runway's Gen 4 model. 
" "You can also include reference image to guide the generation.", inputs=[ diff --git a/comfy_api_nodes/nodes_sonilo.py b/comfy_api_nodes/nodes_sonilo.py index 5518f5902..bc31a0074 100644 --- a/comfy_api_nodes/nodes_sonilo.py +++ b/comfy_api_nodes/nodes_sonilo.py @@ -34,7 +34,7 @@ class SoniloVideoToMusic(IO.ComfyNode): return IO.Schema( node_id="SoniloVideoToMusic", display_name="Sonilo Video to Music", - category="api node/audio/Sonilo", + category="audio/partner/Sonilo", description="Generate music from video content using Sonilo's AI model. " "Analyzes the video and creates matching music.", inputs=[ @@ -99,7 +99,7 @@ class SoniloTextToMusic(IO.ComfyNode): return IO.Schema( node_id="SoniloTextToMusic", display_name="Sonilo Text to Music", - category="api node/audio/Sonilo", + category="audio/partner/Sonilo", description="Generate music from a text prompt using Sonilo's AI model. " "Leave duration at 0 to let the model infer it from the prompt.", inputs=[ diff --git a/comfy_api_nodes/nodes_sora.py b/comfy_api_nodes/nodes_sora.py index c1d485188..83cfca495 100644 --- a/comfy_api_nodes/nodes_sora.py +++ b/comfy_api_nodes/nodes_sora.py @@ -34,7 +34,7 @@ class OpenAIVideoSora2(IO.ComfyNode): return IO.Schema( node_id="OpenAIVideoSora2", display_name="OpenAI Sora - Video (DEPRECATED)", - category="api node/video/Sora", + category="video/partner/Sora", description=( "OpenAI video and audio generation.\n\n" "DEPRECATION NOTICE: OpenAI will stop serving the Sora v2 API in September 2026. 
" diff --git a/comfy_api_nodes/nodes_stability.py b/comfy_api_nodes/nodes_stability.py index 906d8ff35..a1753d647 100644 --- a/comfy_api_nodes/nodes_stability.py +++ b/comfy_api_nodes/nodes_stability.py @@ -62,7 +62,7 @@ class StabilityStableImageUltraNode(IO.ComfyNode): return IO.Schema( node_id="StabilityStableImageUltraNode", display_name="Stability AI Stable Image Ultra", - category="api node/image/Stability AI", + category="image/partner/Stability AI", description=cleandoc(cls.__doc__ or ""), inputs=[ IO.String.Input( @@ -197,7 +197,7 @@ class StabilityStableImageSD_3_5Node(IO.ComfyNode): return IO.Schema( node_id="StabilityStableImageSD_3_5Node", display_name="Stability AI Stable Diffusion 3.5 Image", - category="api node/image/Stability AI", + category="image/partner/Stability AI", description=cleandoc(cls.__doc__ or ""), inputs=[ IO.String.Input( @@ -354,7 +354,7 @@ class StabilityUpscaleConservativeNode(IO.ComfyNode): return IO.Schema( node_id="StabilityUpscaleConservativeNode", display_name="Stability AI Upscale Conservative", - category="api node/image/Stability AI", + category="image/partner/Stability AI", description=cleandoc(cls.__doc__ or ""), inputs=[ IO.Image.Input("image"), @@ -457,7 +457,7 @@ class StabilityUpscaleCreativeNode(IO.ComfyNode): return IO.Schema( node_id="StabilityUpscaleCreativeNode", display_name="Stability AI Upscale Creative", - category="api node/image/Stability AI", + category="image/partner/Stability AI", description=cleandoc(cls.__doc__ or ""), inputs=[ IO.Image.Input("image"), @@ -578,7 +578,7 @@ class StabilityUpscaleFastNode(IO.ComfyNode): return IO.Schema( node_id="StabilityUpscaleFastNode", display_name="Stability AI Upscale Fast", - category="api node/image/Stability AI", + category="image/partner/Stability AI", description=cleandoc(cls.__doc__ or ""), inputs=[ IO.Image.Input("image"), @@ -630,7 +630,7 @@ class StabilityTextToAudio(IO.ComfyNode): return IO.Schema( node_id="StabilityTextToAudio", display_name="Stability 
AI Text To Audio", - category="api node/audio/Stability AI", + category="audio/partner/Stability AI", essentials_category="Audio", description=cleandoc(cls.__doc__ or ""), inputs=[ @@ -708,7 +708,7 @@ class StabilityAudioToAudio(IO.ComfyNode): return IO.Schema( node_id="StabilityAudioToAudio", display_name="Stability AI Audio To Audio", - category="api node/audio/Stability AI", + category="audio/partner/Stability AI", description=cleandoc(cls.__doc__ or ""), inputs=[ IO.Combo.Input( @@ -802,7 +802,7 @@ class StabilityAudioInpaint(IO.ComfyNode): return IO.Schema( node_id="StabilityAudioInpaint", display_name="Stability AI Audio Inpaint", - category="api node/audio/Stability AI", + category="audio/partner/Stability AI", description=cleandoc(cls.__doc__ or ""), inputs=[ IO.Combo.Input( diff --git a/comfy_api_nodes/nodes_topaz.py b/comfy_api_nodes/nodes_topaz.py index e79c16d3c..d0906ee44 100644 --- a/comfy_api_nodes/nodes_topaz.py +++ b/comfy_api_nodes/nodes_topaz.py @@ -52,7 +52,7 @@ class TopazImageEnhance(IO.ComfyNode): return IO.Schema( node_id="TopazImageEnhance", display_name="Topaz Image Enhance", - category="api node/image/Topaz", + category="image/partner/Topaz", description="Industry-standard upscaling and image enhancement.", inputs=[ IO.Combo.Input("model", options=["Reimagine"]), @@ -235,7 +235,7 @@ class TopazVideoEnhance(IO.ComfyNode): return IO.Schema( node_id="TopazVideoEnhance", display_name="Topaz Video Enhance (Legacy)", - category="api node/video/Topaz", + category="video/partner/Topaz", description="Breathe new life into video with powerful upscaling and recovery technology.", inputs=[ IO.Video.Input("video"), @@ -475,7 +475,7 @@ class TopazVideoEnhanceV2(IO.ComfyNode): return IO.Schema( node_id="TopazVideoEnhanceV2", display_name="Topaz Video Enhance", - category="api node/video/Topaz", + category="video/partner/Topaz", description="Breathe new life into video with powerful upscaling and recovery technology.", inputs=[ IO.Video.Input("video"), 
diff --git a/comfy_api_nodes/nodes_tripo.py b/comfy_api_nodes/nodes_tripo.py index d6501dee4..6ee674a18 100644 --- a/comfy_api_nodes/nodes_tripo.py +++ b/comfy_api_nodes/nodes_tripo.py @@ -80,7 +80,7 @@ class TripoTextToModelNode(IO.ComfyNode): return IO.Schema( node_id="TripoTextToModelNode", display_name="Tripo: Text to Model", - category="api node/3d/Tripo", + category="3d/partner/Tripo", inputs=[ IO.String.Input("prompt", multiline=True), IO.String.Input("negative_prompt", multiline=True, optional=True), @@ -195,7 +195,7 @@ class TripoImageToModelNode(IO.ComfyNode): return IO.Schema( node_id="TripoImageToModelNode", display_name="Tripo: Image to Model", - category="api node/3d/Tripo", + category="3d/partner/Tripo", inputs=[ IO.Image.Input("image"), IO.Combo.Input( @@ -323,7 +323,7 @@ class TripoMultiviewToModelNode(IO.ComfyNode): return IO.Schema( node_id="TripoMultiviewToModelNode", display_name="Tripo: Multiview to Model", - category="api node/3d/Tripo", + category="3d/partner/Tripo", inputs=[ IO.Image.Input("image"), IO.Image.Input("image_left", optional=True), @@ -461,7 +461,7 @@ class TripoTextureNode(IO.ComfyNode): return IO.Schema( node_id="TripoTextureNode", display_name="Tripo: Texture model", - category="api node/3d/Tripo", + category="3d/partner/Tripo", inputs=[ IO.Custom("MODEL_TASK_ID").Input("model_task_id"), IO.Boolean.Input("texture", default=True, optional=True), @@ -528,7 +528,7 @@ class TripoRefineNode(IO.ComfyNode): return IO.Schema( node_id="TripoRefineNode", display_name="Tripo: Refine Draft model", - category="api node/3d/Tripo", + category="3d/partner/Tripo", description="Refine a draft model created by v1.4 Tripo models only.", inputs=[ IO.Custom("MODEL_TASK_ID").Input("model_task_id", tooltip="Must be a v1.4 Tripo model"), @@ -568,7 +568,7 @@ class TripoRigNode(IO.ComfyNode): return IO.Schema( node_id="TripoRigNode", display_name="Tripo: Rig model", - category="api node/3d/Tripo", + category="3d/partner/Tripo", 
inputs=[IO.Custom("MODEL_TASK_ID").Input("original_model_task_id")], outputs=[ IO.String.Output(display_name="model_file"), # for backward compatibility only @@ -605,7 +605,7 @@ class TripoRetargetNode(IO.ComfyNode): return IO.Schema( node_id="TripoRetargetNode", display_name="Tripo: Retarget rigged model", - category="api node/3d/Tripo", + category="3d/partner/Tripo", inputs=[ IO.Custom("RIG_TASK_ID").Input("original_model_task_id"), IO.Combo.Input( @@ -670,7 +670,7 @@ class TripoConversionNode(IO.ComfyNode): return IO.Schema( node_id="TripoConversionNode", display_name="Tripo: Convert model", - category="api node/3d/Tripo", + category="3d/partner/Tripo", inputs=[ IO.Custom("MODEL_TASK_ID,RIG_TASK_ID,RETARGET_TASK_ID").Input("original_model_task_id"), IO.Combo.Input("format", options=["GLTF", "USDZ", "FBX", "OBJ", "STL", "3MF"]), diff --git a/comfy_api_nodes/nodes_veo2.py b/comfy_api_nodes/nodes_veo2.py index 2ff75d9b2..068862397 100644 --- a/comfy_api_nodes/nodes_veo2.py +++ b/comfy_api_nodes/nodes_veo2.py @@ -45,7 +45,7 @@ class VeoVideoGenerationNode(IO.ComfyNode): return IO.Schema( node_id="VeoVideoGenerationNode", display_name="Google Veo 2 Video Generation", - category="api node/video/Veo", + category="video/partner/Veo", description="Generates videos from text prompts using Google's Veo 2 API", inputs=[ IO.String.Input( @@ -256,7 +256,7 @@ class Veo3VideoGenerationNode(IO.ComfyNode): return IO.Schema( node_id="Veo3VideoGenerationNode", display_name="Google Veo 3 Video Generation", - category="api node/video/Veo", + category="video/partner/Veo", description="Generates videos from text prompts using Google's Veo 3 API", inputs=[ IO.String.Input( @@ -468,7 +468,7 @@ class Veo3FirstLastFrameNode(IO.ComfyNode): return IO.Schema( node_id="Veo3FirstLastFrameNode", display_name="Google Veo 3 First-Last-Frame to Video", - category="api node/video/Veo", + category="video/partner/Veo", description="Generate video using prompt and first and last frames.", inputs=[ 
IO.String.Input( diff --git a/comfy_api_nodes/nodes_vidu.py b/comfy_api_nodes/nodes_vidu.py index 8d90cefeb..16f6113de 100644 --- a/comfy_api_nodes/nodes_vidu.py +++ b/comfy_api_nodes/nodes_vidu.py @@ -71,7 +71,7 @@ class ViduTextToVideoNode(IO.ComfyNode): return IO.Schema( node_id="ViduTextToVideoNode", display_name="Vidu Text To Video Generation", - category="api node/video/Vidu", + category="video/partner/Vidu", description="Generate video from a text prompt", inputs=[ IO.Combo.Input("model", options=["viduq1"], tooltip="Model name"), @@ -169,7 +169,7 @@ class ViduImageToVideoNode(IO.ComfyNode): return IO.Schema( node_id="ViduImageToVideoNode", display_name="Vidu Image To Video Generation", - category="api node/video/Vidu", + category="video/partner/Vidu", description="Generate video from image and optional prompt", inputs=[ IO.Combo.Input("model", options=["viduq1"], tooltip="Model name"), @@ -273,7 +273,7 @@ class ViduReferenceVideoNode(IO.ComfyNode): return IO.Schema( node_id="ViduReferenceVideoNode", display_name="Vidu Reference To Video Generation", - category="api node/video/Vidu", + category="video/partner/Vidu", description="Generate video from multiple images and a prompt", inputs=[ IO.Combo.Input("model", options=["viduq1"], tooltip="Model name"), @@ -388,7 +388,7 @@ class ViduStartEndToVideoNode(IO.ComfyNode): return IO.Schema( node_id="ViduStartEndToVideoNode", display_name="Vidu Start End To Video Generation", - category="api node/video/Vidu", + category="video/partner/Vidu", description="Generate a video from start and end frames and a prompt", inputs=[ IO.Combo.Input("model", options=["viduq1"], tooltip="Model name"), @@ -492,7 +492,7 @@ class Vidu2TextToVideoNode(IO.ComfyNode): return IO.Schema( node_id="Vidu2TextToVideoNode", display_name="Vidu2 Text-to-Video Generation", - category="api node/video/Vidu", + category="video/partner/Vidu", description="Generate video from a text prompt", inputs=[ IO.Combo.Input("model", options=["viduq2"]), @@ 
-584,7 +584,7 @@ class Vidu2ImageToVideoNode(IO.ComfyNode): return IO.Schema( node_id="Vidu2ImageToVideoNode", display_name="Vidu2 Image-to-Video Generation", - category="api node/video/Vidu", + category="video/partner/Vidu", description="Generate a video from an image and an optional prompt.", inputs=[ IO.Combo.Input("model", options=["viduq2-pro-fast", "viduq2-pro", "viduq2-turbo"]), @@ -714,7 +714,7 @@ class Vidu2ReferenceVideoNode(IO.ComfyNode): return IO.Schema( node_id="Vidu2ReferenceVideoNode", display_name="Vidu2 Reference-to-Video Generation", - category="api node/video/Vidu", + category="video/partner/Vidu", description="Generate a video from multiple reference images and a prompt.", inputs=[ IO.Combo.Input("model", options=["viduq2"]), @@ -849,7 +849,7 @@ class Vidu2StartEndToVideoNode(IO.ComfyNode): return IO.Schema( node_id="Vidu2StartEndToVideoNode", display_name="Vidu2 Start/End Frame-to-Video Generation", - category="api node/video/Vidu", + category="video/partner/Vidu", description="Generate a video from a start frame, an end frame, and a prompt.", inputs=[ IO.Combo.Input("model", options=["viduq2-pro-fast", "viduq2-pro", "viduq2-turbo"]), @@ -969,7 +969,7 @@ class ViduExtendVideoNode(IO.ComfyNode): return IO.Schema( node_id="ViduExtendVideoNode", display_name="Vidu Video Extension", - category="api node/video/Vidu", + category="video/partner/Vidu", description="Extend an existing video by generating additional frames.", inputs=[ IO.DynamicCombo.Input( @@ -1138,7 +1138,7 @@ class ViduMultiFrameVideoNode(IO.ComfyNode): return IO.Schema( node_id="ViduMultiFrameVideoNode", display_name="Vidu Multi-Frame Video Generation", - category="api node/video/Vidu", + category="video/partner/Vidu", description="Generate a video with multiple keyframe transitions.", inputs=[ IO.Combo.Input("model", options=["viduq2-pro", "viduq2-turbo"]), @@ -1284,7 +1284,7 @@ class Vidu3TextToVideoNode(IO.ComfyNode): return IO.Schema( node_id="Vidu3TextToVideoNode", 
display_name="Vidu Q3 Text-to-Video Generation", - category="api node/video/Vidu", + category="video/partner/Vidu", description="Generate video from a text prompt.", inputs=[ IO.DynamicCombo.Input( @@ -1429,7 +1429,7 @@ class Vidu3ImageToVideoNode(IO.ComfyNode): return IO.Schema( node_id="Vidu3ImageToVideoNode", display_name="Vidu Q3 Image-to-Video Generation", - category="api node/video/Vidu", + category="video/partner/Vidu", description="Generate a video from an image and an optional prompt.", inputs=[ IO.DynamicCombo.Input( @@ -1571,7 +1571,7 @@ class Vidu3StartEndToVideoNode(IO.ComfyNode): return IO.Schema( node_id="Vidu3StartEndToVideoNode", display_name="Vidu Q3 Start/End Frame-to-Video Generation", - category="api node/video/Vidu", + category="video/partner/Vidu", description="Generate a video from a start frame, an end frame, and a prompt.", inputs=[ IO.DynamicCombo.Input( diff --git a/comfy_api_nodes/nodes_wan.py b/comfy_api_nodes/nodes_wan.py index 68061bb5c..a235dc387 100644 --- a/comfy_api_nodes/nodes_wan.py +++ b/comfy_api_nodes/nodes_wan.py @@ -61,7 +61,7 @@ class WanTextToImageApi(IO.ComfyNode): return IO.Schema( node_id="WanTextToImageApi", display_name="Wan Text to Image", - category="api node/image/Wan", + category="image/partner/Wan", description="Generates an image based on a text prompt.", inputs=[ IO.Combo.Input( @@ -184,7 +184,7 @@ class WanImageToImageApi(IO.ComfyNode): return IO.Schema( node_id="WanImageToImageApi", display_name="Wan Image to Image", - category="api node/image/Wan", + category="image/partner/Wan", description="Generates an image from one or two input images and a text prompt. 
" "The output image is currently fixed at 1.6 MP, and its aspect ratio matches the input image(s).", inputs=[ @@ -312,7 +312,7 @@ class WanTextToVideoApi(IO.ComfyNode): return IO.Schema( node_id="WanTextToVideoApi", display_name="Wan Text to Video", - category="api node/video/Wan", + category="video/partner/Wan", description="Generates a video based on a text prompt.", inputs=[ IO.Combo.Input( @@ -495,7 +495,7 @@ class WanImageToVideoApi(IO.ComfyNode): return IO.Schema( node_id="WanImageToVideoApi", display_name="Wan Image to Video", - category="api node/video/Wan", + category="video/partner/Wan", description="Generates a video from the first frame and a text prompt.", inputs=[ IO.Combo.Input( @@ -674,7 +674,7 @@ class WanReferenceVideoApi(IO.ComfyNode): return IO.Schema( node_id="WanReferenceVideoApi", display_name="Wan Reference to Video", - category="api node/video/Wan", + category="video/partner/Wan", description="Use the character and voice from input videos, combined with a prompt, " "to generate a new video that maintains character consistency.", inputs=[ @@ -828,7 +828,7 @@ class Wan2TextToVideoApi(IO.ComfyNode): return IO.Schema( node_id="Wan2TextToVideoApi", display_name="Wan 2.7 Text to Video", - category="api node/video/Wan", + category="video/partner/Wan", description="Generates a video based on a text prompt using the Wan 2.7 model.", inputs=[ IO.DynamicCombo.Input( @@ -981,7 +981,7 @@ class Wan2ImageToVideoApi(IO.ComfyNode): return IO.Schema( node_id="Wan2ImageToVideoApi", display_name="Wan 2.7 Image to Video", - category="api node/video/Wan", + category="video/partner/Wan", description="Generate a video from a first-frame image, with optional last-frame image and audio.", inputs=[ IO.DynamicCombo.Input( @@ -1152,7 +1152,7 @@ class Wan2VideoContinuationApi(IO.ComfyNode): return IO.Schema( node_id="Wan2VideoContinuationApi", display_name="Wan 2.7 Video Continuation", - category="api node/video/Wan", + category="video/partner/Wan", 
description="Continue a video from where it left off, with optional last-frame control.", inputs=[ IO.DynamicCombo.Input( @@ -1319,7 +1319,7 @@ class Wan2VideoEditApi(IO.ComfyNode): return IO.Schema( node_id="Wan2VideoEditApi", display_name="Wan 2.7 Video Edit", - category="api node/video/Wan", + category="video/partner/Wan", description="Edit a video using text instructions, reference images, or style transfer.", inputs=[ IO.DynamicCombo.Input( @@ -1477,7 +1477,7 @@ class Wan2ReferenceVideoApi(IO.ComfyNode): return IO.Schema( node_id="Wan2ReferenceVideoApi", display_name="Wan 2.7 Reference to Video", - category="api node/video/Wan", + category="video/partner/Wan", description="Generate a video featuring a person or object from reference materials. " "Supports single-character performances and multi-character interactions.", inputs=[ @@ -1651,7 +1651,7 @@ class HappyHorseTextToVideoApi(IO.ComfyNode): return IO.Schema( node_id="HappyHorseTextToVideoApi", display_name="HappyHorse Text to Video", - category="api node/video/Wan", + category="video/partner/Wan", description="Generates a video based on a text prompt using the HappyHorse model.", inputs=[ IO.DynamicCombo.Input( @@ -1775,7 +1775,7 @@ class HappyHorseImageToVideoApi(IO.ComfyNode): return IO.Schema( node_id="HappyHorseImageToVideoApi", display_name="HappyHorse Image to Video", - category="api node/video/Wan", + category="video/partner/Wan", description="Generate a video from a first-frame image using the HappyHorse model.", inputs=[ IO.DynamicCombo.Input( @@ -1905,7 +1905,7 @@ class HappyHorseVideoEditApi(IO.ComfyNode): return IO.Schema( node_id="HappyHorseVideoEditApi", display_name="HappyHorse Video Edit", - category="api node/video/Wan", + category="video/partner/Wan", description="Edit a video using text instructions or reference images with the HappyHorse model. 
" "Output duration is 3-15s and matches the input video; inputs longer than 15s are truncated.", inputs=[ @@ -2046,7 +2046,7 @@ class HappyHorseReferenceVideoApi(IO.ComfyNode): return IO.Schema( node_id="HappyHorseReferenceVideoApi", display_name="HappyHorse Reference to Video", - category="api node/video/Wan", + category="video/partner/Wan", description="Generate a video featuring a person or object from reference materials with the HappyHorse " "model. Supports single-character performances and multi-character interactions.", inputs=[ diff --git a/comfy_api_nodes/nodes_wavespeed.py b/comfy_api_nodes/nodes_wavespeed.py index 65e45f60a..a250015c3 100644 --- a/comfy_api_nodes/nodes_wavespeed.py +++ b/comfy_api_nodes/nodes_wavespeed.py @@ -27,7 +27,7 @@ class WavespeedFlashVSRNode(IO.ComfyNode): return IO.Schema( node_id="WavespeedFlashVSRNode", display_name="FlashVSR Video Upscale", - category="api node/video/WaveSpeed", + category="video/partner/WaveSpeed", description="Fast, high-quality video upscaler that " "boosts resolution and restores clarity for low-resolution or blurry footage.", inputs=[ @@ -98,7 +98,7 @@ class WavespeedImageUpscaleNode(IO.ComfyNode): return IO.Schema( node_id="WavespeedImageUpscaleNode", display_name="WaveSpeed Image Upscale", - category="api node/image/WaveSpeed", + category="image/partner/WaveSpeed", description="Boost image resolution and quality, upscaling photos to 4K or 8K for sharp, detailed results.", inputs=[ IO.Combo.Input("model", options=["SeedVR2", "Ultimate"]), diff --git a/comfy_api_nodes/util/__init__.py b/comfy_api_nodes/util/__init__.py index f3584aba9..25cb88869 100644 --- a/comfy_api_nodes/util/__init__.py +++ b/comfy_api_nodes/util/__init__.py @@ -16,16 +16,17 @@ from .conversions import ( convert_mask_to_image, downscale_image_tensor, downscale_image_tensor_by_max_side, + downscale_video_to_max_pixels, image_tensor_pair_to_batch, pil_to_bytesio, resize_mask_to_image, - resize_video_to_pixel_budget, 
tensor_to_base64_string, tensor_to_bytesio, tensor_to_pil, text_filepath_to_base64_string, text_filepath_to_data_uri, trim_video, + upscale_video_to_min_pixels, video_to_base64_string, ) from .download_helpers import ( @@ -88,16 +89,17 @@ __all__ = [ "convert_mask_to_image", "downscale_image_tensor", "downscale_image_tensor_by_max_side", + "downscale_video_to_max_pixels", "image_tensor_pair_to_batch", "pil_to_bytesio", "resize_mask_to_image", - "resize_video_to_pixel_budget", "tensor_to_base64_string", "tensor_to_bytesio", "tensor_to_pil", "text_filepath_to_base64_string", "text_filepath_to_data_uri", "trim_video", + "upscale_video_to_min_pixels", "video_to_base64_string", # Validation utilities "get_image_dimensions", diff --git a/comfy_api_nodes/util/client.py b/comfy_api_nodes/util/client.py index 052301c33..57c501724 100644 --- a/comfy_api_nodes/util/client.py +++ b/comfy_api_nodes/util/client.py @@ -86,7 +86,7 @@ class _PollUIState: _RETRY_STATUS = {408, 500, 502, 503, 504} # status 429 is handled separately COMPLETED_STATUSES = ["succeeded", "succeed", "success", "completed", "finished", "done", "complete"] FAILED_STATUSES = ["cancelled", "canceled", "canceling", "fail", "failed", "error"] -QUEUED_STATUSES = ["created", "queued", "queueing", "submitted", "initializing", "wait"] +QUEUED_STATUSES = ["created", "queued", "queueing", "submitted", "initializing", "wait", "in_queue"] async def sync_op( diff --git a/comfy_api_nodes/util/conversions.py b/comfy_api_nodes/util/conversions.py index be5d5719b..5738df57f 100644 --- a/comfy_api_nodes/util/conversions.py +++ b/comfy_api_nodes/util/conversions.py @@ -415,14 +415,48 @@ def trim_video(video: Input.Video, duration_sec: float) -> Input.Video: raise RuntimeError(f"Failed to trim video: {str(e)}") from e -def resize_video_to_pixel_budget(video: Input.Video, total_pixels: int) -> Input.Video: - """Downscale a video to fit within ``total_pixels`` (w * h), preserving aspect ratio. 
+def downscale_video_to_max_pixels(video: Input.Video, max_pixels: int) -> Input.Video: + """Downscale a video to fit within ``max_pixels`` (w * h), preserving aspect ratio. Returns the original video object untouched when it already fits. Preserves frame rate, duration, and audio. Aspect ratio is preserved up to a fraction of a percent (even-dim rounding). """ src_w, src_h = video.get_dimensions() - scale_dims = _compute_downscale_dims(src_w, src_h, total_pixels) + scale_dims = _compute_downscale_dims(src_w, src_h, max_pixels) + if scale_dims is None: + return video + return _apply_video_scale(video, scale_dims) + + +def _compute_upscale_dims(src_w: int, src_h: int, total_pixels: int) -> tuple[int, int] | None: + """Return upscaled (w, h) with even dims meeting at least ``total_pixels``, or None if already large enough. + + Source aspect ratio is preserved; output may drift by a fraction of a percent because both dimensions + are rounded up to even values (many codecs require divisible-by-2). The result is guaranteed to be at + least ``total_pixels``. + """ + pixels = src_w * src_h + if pixels >= total_pixels: + return None + scale = math.sqrt(total_pixels / pixels) + new_w = math.ceil(src_w * scale) + new_h = math.ceil(src_h * scale) + if new_w % 2: + new_w += 1 + if new_h % 2: + new_h += 1 + return new_w, new_h + + +def upscale_video_to_min_pixels(video: Input.Video, min_pixels: int) -> Input.Video: + """Upscale a video to meet at least ``min_pixels`` (w * h), preserving aspect ratio. + + Returns the original video object untouched when it already meets the minimum. Preserves frame rate, + duration, and audio. Aspect ratio is preserved up to a fraction of a percent (even-dim rounding). + Note: upscaling a low-resolution source does not add real detail; downstream model quality may suffer. 
+ """ + src_w, src_h = video.get_dimensions() + scale_dims = _compute_upscale_dims(src_w, src_h, min_pixels) if scale_dims is None: return video return _apply_video_scale(video, scale_dims) diff --git a/comfy_execution/graph.py b/comfy_execution/graph.py index c47f3c79b..479ee8a53 100644 --- a/comfy_execution/graph.py +++ b/comfy_execution/graph.py @@ -1,4 +1,3 @@ -from __future__ import annotations from typing import Type, Literal import nodes diff --git a/comfy_execution/progress.py b/comfy_execution/progress.py index f951a3350..731b8dc66 100644 --- a/comfy_execution/progress.py +++ b/comfy_execution/progress.py @@ -1,5 +1,3 @@ -from __future__ import annotations - from typing import TypedDict, Dict, Optional, Tuple from typing_extensions import override from PIL import Image diff --git a/comfy_execution/validation.py b/comfy_execution/validation.py index e73624bd1..ae9a2376c 100644 --- a/comfy_execution/validation.py +++ b/comfy_execution/validation.py @@ -1,4 +1,3 @@ -from __future__ import annotations from comfy_api.latest import IO diff --git a/comfy_extras/mediapipe/face_geometry.py b/comfy_extras/mediapipe/face_geometry.py new file mode 100644 index 000000000..4f3813430 --- /dev/null +++ b/comfy_extras/mediapipe/face_geometry.py @@ -0,0 +1,110 @@ +"""Pure-numpy port of MediaPipe's face_geometry (FACE_LANDMARK_PIPELINE mode) ++ weighted Procrustes solver. Computes the 4x4 facial transformation matrix. +""" + + +import math +import numpy as np + + +def _solve_weighted_orthogonal_problem(src: np.ndarray, tgt: np.ndarray, weights: np.ndarray) -> np.ndarray: + """Weighted orthogonal Procrustes (similarity). Returns 4x4 M with + `target ≈ M @ homogeneous(source)` in the weighted LS sense. fp64 for + SVD stability. 
Port of procrustes_solver.cc.""" + sqrt_w = np.sqrt(weights.astype(np.float64)) + w_total = float((sqrt_w ** 2).sum()) + ws = src.astype(np.float64) * sqrt_w + wt = tgt.astype(np.float64) * sqrt_w + + c_w = (ws @ sqrt_w) / w_total + centered = ws - np.outer(c_w, sqrt_w) + U, _S, Vt = np.linalg.svd(wt @ centered.T, full_matrices=True) + # Disallow reflection: flip the least-significant axis when det(U)·det(V)<0. + post, pre = U.copy(), Vt.T.copy() + if np.linalg.det(post) * np.linalg.det(pre) < 0: + post[:, 2] *= -1.0 + R = post @ pre.T + + denom = float((centered * ws).sum()) + if denom < 1e-12: + raise ValueError("Procrustes denominator collapsed (degenerate source).") + scale = float((R @ centered * wt).sum()) / denom + translation = ((wt - scale * (R @ ws)) @ sqrt_w) / w_total + + M = np.eye(4, dtype=np.float64) + M[:3, :3] = scale * R + M[:3, 3] = translation + return M + + +def _estimate_scale(canonical: np.ndarray, runtime: np.ndarray, weights: np.ndarray) -> float: + """scale = ‖first column of M[:3]‖ per geometry_pipeline.cc::EstimateScale.""" + return float(np.linalg.norm(_solve_weighted_orthogonal_problem(canonical, runtime, weights)[:3, 0])) + + +def solve_facial_transformation_matrix( + landmarks_normalized: np.ndarray, + canonical_vertices: np.ndarray, + procrustes_indices: np.ndarray, + procrustes_weights: np.ndarray, + image_width: int, + image_height: int, + # face_geometry_calculator_options.pbtxt defaults + vertical_fov_degrees: float = 63.0, + near: float = 1.0, +) -> np.ndarray: + """4x4 facial transformation matrix via two-pass scale recovery + `landmarks_normalized` is (N, 3) in MediaPipe normalized convention: x, y + in [0,1] with TOP-LEFT origin, z in width-scaled units. 
+ """ + + h_near = 2.0 * near * math.tan(0.5 * math.radians(vertical_fov_degrees)) + w_near = image_width * h_near / image_height + + sub = procrustes_indices.astype(np.int64) + screen = landmarks_normalized[sub].T.astype(np.float64).copy() + canon = canonical_vertices[sub].T.astype(np.float64).copy() + weights = procrustes_weights.astype(np.float64) + + # ProjectXY (TOP_LEFT y-flip, then scale all 3 axes; z uses x-scale). + screen[1] = 1.0 - screen[1] + screen[0] = screen[0] * w_near - 0.5 * w_near + screen[1] = screen[1] * h_near - 0.5 * h_near + screen[2] = screen[2] * w_near + depth_offset = float(screen[2].mean()) + + def _unproject(s: np.ndarray, scale: float) -> np.ndarray: + s = s.copy() + s[2] = (s[2] - depth_offset + near) / scale + s[0] *= s[2] / near + s[1] *= s[2] / near + s[2] *= -1.0 + return s + + first = screen.copy() + first[2] *= -1.0 + s1 = _estimate_scale(canon, first, weights) # 1st pass: Procrustes on projected XY + s2 = _estimate_scale(canon, _unproject(screen, s1), weights) # 2nd pass: rescale z by s1, un-project XY + return _solve_weighted_orthogonal_problem(canon, _unproject(screen, s1 * s2), weights).astype(np.float32) + + +def transformation_matrix_from_detection(face_dict: dict, image_width: int, image_height: int, canonical_data: dict) -> np.ndarray: + """Adapt a FaceLandmarker face dict to MP's normalized convention and solve. 
+ FaceMesh emits (x, y, z) in 192-canonical units; MP's geometry expects + z_norm = z_canonical * scale_x / image_width""" + + lmks_xy, lmks_3d = face_dict["landmarks_xy"], face_dict["landmarks_3d"] + aug = np.concatenate([lmks_3d[:, :2].astype(np.float64), np.ones((lmks_xy.shape[0], 1))], axis=1) + M, *_ = np.linalg.lstsq(aug, lmks_xy.astype(np.float64), rcond=None) + scale_x = float(np.linalg.norm(M[0])) + z_scale = scale_x / image_width if scale_x > 1e-6 else 1.0 / image_width + + normalized = np.empty((lmks_xy.shape[0], 3), dtype=np.float32) + normalized[:, 0] = lmks_xy[:, 0] / image_width + normalized[:, 1] = lmks_xy[:, 1] / image_height + normalized[:, 2] = lmks_3d[:, 2] * z_scale + return solve_facial_transformation_matrix( + normalized, canonical_data["canonical_vertices"], + canonical_data["procrustes_indices"], canonical_data["procrustes_weights"], + image_width=image_width, image_height=image_height, + ) diff --git a/comfy_extras/mediapipe/face_landmarker.py b/comfy_extras/mediapipe/face_landmarker.py new file mode 100644 index 000000000..e6b463c4c --- /dev/null +++ b/comfy_extras/mediapipe/face_landmarker.py @@ -0,0 +1,681 @@ +"""Pure-PyTorch port of MediaPipe's face_landmarker_v2_with_blendshapes.task: +BlazeFace detector → FaceMesh v2 → ARKit-52 blendshapes.""" + + +import math +from functools import lru_cache +from typing import List, Tuple + +import numpy as np +import torch +import torch.nn.functional as F +from scipy.special import expit +from torch import Tensor, nn + + +# Values below must stay verbatim with the published face_landmarker_v2 graph + +# face_blendshapes_graph.cc::kLandmarksSubsetIdxs +_BS_INPUT_INDICES: Tuple[int, ...] 
= ( + 0, 1, 4, 5, 6, 7, 8, 10, 13, 14, 17, 21, 33, 37, 39, 40, 46, 52, 53, 54, + 55, 58, 61, 63, 65, 66, 67, 70, 78, 80, 81, 82, 84, 87, 88, 91, 93, 95, + 103, 105, 107, 109, 127, 132, 133, 136, 144, 145, 146, 148, 149, 150, 152, + 153, 154, 155, 157, 158, 159, 160, 161, 162, 163, 168, 172, 173, 176, 178, + 181, 185, 191, 195, 197, 234, 246, 249, 251, 263, 267, 269, 270, 276, 282, + 283, 284, 285, 288, 291, 293, 295, 296, 297, 300, 308, 310, 311, 312, 314, + 317, 318, 321, 323, 324, 332, 334, 336, 338, 356, 361, 362, 365, 373, 374, + 375, 377, 378, 379, 380, 381, 382, 384, 385, 386, 387, 388, 389, 390, 397, + 398, 400, 402, 405, 409, 415, 454, 466, 468, 469, 470, 471, 472, 473, 474, + 475, 476, 477, +) + +# face_blendshapes_graph.cc::kCategoryNames +BLENDSHAPE_NAMES: Tuple[str, ...] = ( + "_neutral", "browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", + "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", + "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", + "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", + "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", + "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", + "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", + "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", + "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", + "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", + "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", + "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", + "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", +) + +# face_detection.pbtxt — short-range BlazeFace. 
_BF_NUM_LAYERS = 4
_BF_INPUT_SIZE = 128
_BF_STRIDES = (8, 16, 16, 16)
_BF_ANCHOR_OFFSET_X = 0.5
_BF_ANCHOR_OFFSET_Y = 0.5
_BF_ASPECT_RATIOS = (1.0,)
_BF_INTERP_SCALE_AR = 1.0
_BF_BOX_SCALE = 128.0
_BF_KP_OFFSET = 4
_BF_SCORE_CLIP = 100.0
_BF_MIN_SCORE = 0.5

# face_detection_full_range.pbtxt — 48x48 grid at stride 4, 1 anchor/cell.
_BF_FR_INPUT_SIZE = 192
_BF_FR_GRID = 48
_BF_FR_NUM_ANCHORS = _BF_FR_GRID * _BF_FR_GRID
_BF_FR_BOX_SCALE = 192.0
_BF_FR_SCORE_CLIP = 100.0

_FM_INPUT_SIZE = 192

# Face ROI: 1.5xbbox rect warped anisotropically into 192x192.
_FACE_LEFT_EYE_KP = 0
_FACE_RIGHT_EYE_KP = 1
_FACE_ROI_SCALE_X = 1.5
_FACE_ROI_SCALE_Y = 1.5
_FACE_ROI_TARGET_ANGLE = 0.0


def _tf_same_pad(x: Tensor, kernel: int, stride: int) -> Tensor:
    """Zero-pad the last two dims of `x` following TensorFlow's SAME rule.

    The total padding per axis is whatever makes a `kernel`/`stride` conv
    produce ceil(size / stride) outputs; when that total is odd the extra
    pixel goes on the bottom/right side. `x` is returned untouched when no
    padding is required on either axis.
    """

    def _total_pad(size: int) -> int:
        # ceil(size / stride) output positions, each advancing by `stride`.
        out_size = (size + stride - 1) // stride
        return max((out_size - 1) * stride + kernel - size, 0)

    rows = _total_pad(x.shape[-2])
    cols = _total_pad(x.shape[-1])
    if rows == 0 and cols == 0:
        return x
    top, left = rows // 2, cols // 2
    # F.pad order for the last two dims: (left, right, top, bottom).
    return F.pad(x, (left, cols - left, top, rows - top))


# BlazeFace short-range: stem 5x5/s2 → 16 BlazeBlocks → parallel heads at
# 16²x88 (2 anchors/cell) and 8²x96 (6/cell) = 896 anchors. (in, out, stride):
_BLAZEFACE_BLOCKS = [
    (24, 24, 1), (24, 28, 1), (28, 32, 2), (32, 36, 1),
    (36, 42, 1), (42, 48, 2), (48, 56, 1), (56, 64, 1),
    (64, 72, 1), (72, 80, 1), (80, 88, 1), (88, 96, 2),
    (96, 96, 1), (96, 96, 1), (96, 96, 1), (96, 96, 1),
]


class BlazeFaceBlock(nn.Module):
    """Depthwise 3x3 conv → pointwise 1x1 conv, added to a shortcut, then ReLU.

    The shortcut is 2x2 max-pooled when the block is strided and zero-padded
    along the channel axis when the block widens (out_ch > in_ch).
    """

    def __init__(self, in_ch: int, out_ch: int, stride: int, device=None, dtype=None, operations=None):
        super().__init__()
        ops = nn if operations is None else operations
        factory = {"device": device, "dtype": dtype}
        self.in_ch = in_ch
        self.out_ch = out_ch
        self.stride = stride
        # padding=0 on purpose: forward() pads explicitly before the depthwise conv.
        self.depthwise = ops.Conv2d(in_ch, in_ch, 3, stride=stride, padding=0, groups=in_ch, bias=True, **factory)
        self.pointwise = ops.Conv2d(in_ch, out_ch, 1, padding=0, bias=True, **factory)

    def forward(self, x: Tensor) -> Tensor:
        strided = self.stride > 1
        extra_channels = self.out_ch - self.in_ch

        shortcut = F.max_pool2d(x, 2, 2) if strided else x
        if extra_channels > 0:
            # Trailing pad pair addresses the channel dim of an NCHW tensor.
            shortcut = F.pad(shortcut, (0, 0, 0, 0, 0, extra_channels))

        if strided:
            padded = _tf_same_pad(x, 3, self.stride)
        else:
            padded = F.pad(x, (1, 1, 1, 1))
        return F.relu(self.pointwise(self.depthwise(padded)) + shortcut)


class BlazeFace(nn.Module):
    """Short-range BlazeFace: (B, 3, 128, 128) in [-1, 1] → 896 anchors x 17."""

    def __init__(self, device=None, dtype=None, operations=None):
        super().__init__()
        ops = nn if operations is None else operations
        factory = {"device": device, "dtype": dtype}

        def _head(in_ch: int, out_ch: int):
            # All four prediction heads are plain 1x1 convolutions.
            return ops.Conv2d(in_ch, out_ch, 1, padding=0, bias=True, **factory)

        self.stem = ops.Conv2d(3, 24, 5, stride=2, padding=0, bias=True, **factory)
        self.blocks = nn.ModuleList([
            BlazeFaceBlock(c_in, c_out, step, device=device, dtype=dtype, operations=operations)
            for c_in, c_out, step in _BLAZEFACE_BLOCKS
        ])
        # 16²x2 + 8²x6 = 512 + 384 = 896 anchors.
        self.cls_16 = _head(88, 2)
        self.cls_8 = _head(96, 6)
        self.reg_16 = _head(88, 32)
        self.reg_8 = _head(96, 96)

    def forward(self, image_chw_normalized: Tensor) -> tuple[Tensor, Tensor]:
        """Return (regressors (B, 896, 16), classificators (B, 896, 1))."""
        x = F.relu(self.stem(_tf_same_pad(image_chw_normalized, 5, 2)))

        # 16x16 tap is block-10 output (before the 88→96 stride-2 in block 11);
        # the 8x8 tap is the output of the final block (index 15).
        feat_16 = None
        for idx in range(16):
            x = self.blocks[idx](x)
            if idx == 10:
                feat_16 = x
        feat_8 = x

        def _anchor_rows(t: Tensor, anchors_per_cell: int, values: int) -> Tensor:
            # NHWC flatten → (B, H*W*A, K)
            batch, _, rows, cols = t.shape
            return t.permute(0, 2, 3, 1).reshape(batch, rows * cols * anchors_per_cell, values)

        cls_parts = [_anchor_rows(self.cls_16(feat_16), 2, 1), _anchor_rows(self.cls_8(feat_8), 6, 1)]
        reg_parts = [_anchor_rows(self.reg_16(feat_16), 2, 16), _anchor_rows(self.reg_8(feat_8), 6, 16)]
        cls = torch.cat(cls_parts, dim=1)
        reg = torch.cat(reg_parts, dim=1)
        return reg, cls


# BlazeFace full-range (face_detection_full_range_sparse.tflite): MobileNetV2-ish
# backbone + top-down FPN, 192² input → 2304 anchors at the 48x48 grid.
class FRBlock(nn.Module):
    """Double inverted residual: DW → PW(mid) → DW → PW(out) [+ residual].

    Per source tflite: dw* have no fused activation, pw1 is always ReLU, pw2
    is ReLU only when no residual (else ReLU fuses into the ADD). The residual
    path exists only when the block keeps both channel count and resolution.
    """

    def __init__(self, in_ch: int, mid_ch: int, out_ch: int, stride: int, device=None, dtype=None, operations=None):
        super().__init__()
        ops = nn if operations is None else operations
        factory = {"device": device, "dtype": dtype}
        self.has_residual = (in_ch == out_ch and stride == 1)
        self.dw1 = ops.Conv2d(in_ch, in_ch, 3, stride=stride, padding=0, groups=in_ch, bias=True, **factory)
        self.pw1 = ops.Conv2d(in_ch, mid_ch, 1, padding=0, bias=True, **factory)
        self.dw2 = ops.Conv2d(mid_ch, mid_ch, 3, stride=1, padding=0, groups=mid_ch, bias=True, **factory)
        self.pw2 = ops.Conv2d(mid_ch, out_ch, 1, padding=0, bias=True, **factory)

    def forward(self, x: Tensor) -> Tensor:
        one_px = (1, 1, 1, 1)  # explicit symmetric pad ahead of each 3x3 depthwise conv
        hidden = F.relu(self.pw1(self.dw1(F.pad(x, one_px))))
        out = self.pw2(self.dw2(F.pad(hidden, one_px)))
        if self.has_residual:
            out = out + x
        return F.relu(out)


# (in_ch, mid_ch, out_ch, stride). Stages downsample 96²x32 → 48²x64 → 24²x128
# → 12²x192 → 6²x384. Lateral taps at indices 4, 7, 10 (see _FR_LATERAL_*).
+_FR_BACKBONE_BLOCKS = [ + (32, 8, 32, 1), (32, 8, 32, 1), # 96²x32 + (32, 16, 64, 2), (64, 16, 64, 1), (64, 16, 64, 1), # 48²x64 — tap[0] + (64, 32, 128, 2), (128, 32, 128, 1), (128, 32, 128, 1), # 24²x128 — tap[1] + (128, 48, 192, 2), (192, 48, 192, 1), (192, 48, 192, 1), # 12²x192 — tap[2] + (192, 96, 384, 2), (384, 96, 384, 1), (384, 96, 384, 1), (384, 96, 384, 1), # 6²x384 +] +_FR_LATERAL_TAP_INDICES = (4, 7, 10) +_FR_LATERAL_CHANNELS = ((64, 48), (128, 64), (192, 96)) # (in, out) per side-conv + +# Decoder blocks per FPN level (after upsample-and-merge with the lateral). +_FR_DECODER_BLOCKS = [ + [(96, 48, 96, 1), (96, 48, 96, 1)], # 12²x96 + [(64, 32, 64, 1), (64, 32, 64, 1)], # 24²x64 + [(48, 24, 48, 1)], # 48²x48 — feeds the heads +] + + +def _dcr_depth_to_space(t: Tensor, r: int, c_out: int) -> Tensor: + """TF DEPTH_TO_SPACE in DCR layout (input channels = (i, j, c_out)). + pixel_shuffle uses CRD which permutes output channels for c_out > 1.""" + B_, _, H_, W_ = t.shape + t = t.reshape(B_, r, r, c_out, H_, W_) + t = t.permute(0, 3, 4, 1, 5, 2).contiguous() + return t.reshape(B_, c_out, H_ * r, W_ * r) + + +class BlazeFaceFullRange(nn.Module): + """Full-range face detector: (B, 3, 192, 192) in [-1, 1] → 2304 anchors x 17 values.""" + + def __init__(self, device=None, dtype=None, operations=None): + super().__init__() + ops = operations if operations is not None else nn + kw = dict(device=device, dtype=dtype) + mk_block = lambda i, m, o, s: FRBlock(i, m, o, s, device=device, dtype=dtype, operations=operations) + self.stem = ops.Conv2d(3, 32, 3, stride=2, padding=0, bias=True, **kw) + self.backbone = nn.ModuleList(mk_block(i, m, o, s) for (i, m, o, s) in _FR_BACKBONE_BLOCKS) + self.lateral_convs = nn.ModuleList(ops.Conv2d(i, o, 1, padding=0, bias=True, **kw) for (i, o) in _FR_LATERAL_CHANNELS) + self.top_conv = ops.Conv2d(384, 96, 1, padding=0, bias=True, **kw) + self.decoder_levels = nn.ModuleList( + nn.ModuleList(mk_block(i, m, o, s) for (i, m, o, s) in 
lvl) for lvl in _FR_DECODER_BLOCKS + ) + # 96→64 before 12→24, 64→48 before 24→48. + self.decoder_reduce_convs = nn.ModuleList([ + ops.Conv2d(96, 64, 1, padding=0, bias=True, **kw), + ops.Conv2d(64, 48, 1, padding=0, bias=True, **kw), + ]) + # Heads mix 2x2-cell info via DW-stride-2 + depth_to_space block_size=2. + self.cls_conv = ops.Conv2d(48, 4, 1, padding=0, bias=True, **kw) + self.cls_dw = ops.Conv2d(4, 4, 3, stride=2, padding=0, groups=4, bias=True, **kw) + self.reg_conv = ops.Conv2d(48, 64, 1, padding=0, bias=True, **kw) + self.reg_dw = ops.Conv2d(64, 64, 3, stride=2, padding=0, groups=64, bias=True, **kw) + + def forward(self, image_chw_normalized: Tensor) -> tuple[Tensor, Tensor]: + # Symmetric pad-1 throughout (full-range tflite uses explicit TF PAD, not SAME). + x = F.relu(self.stem(F.pad(image_chw_normalized, (1, 1, 1, 1)))) + tap_set = set(_FR_LATERAL_TAP_INDICES) + laterals: list[Tensor] = [] + for i, blk in enumerate(self.backbone): + x = blk(x) + if i in tap_set: + laterals.append(x) + + # top_conv / lateral_convs / decoder_reduce_convs all have fused ReLU in the tflite. 
+ p = F.relu(self.top_conv(x)) + laterals_rev = list(reversed(laterals)) + lateral_convs_rev = list(reversed(self.lateral_convs)) + for level in range(len(self.decoder_levels)): + lateral = laterals_rev[level] + p = F.interpolate(p, size=lateral.shape[-2:], mode="bilinear", align_corners=False) + p = p + F.relu(lateral_convs_rev[level](lateral)) + for blk in self.decoder_levels[level]: + p = blk(p) + if level < len(self.decoder_reduce_convs): + p = F.relu(self.decoder_reduce_convs[level](p)) + + c = self.cls_dw(F.pad(self.cls_conv(p), (1, 1, 1, 1))) + c = _dcr_depth_to_space(c, r=2, c_out=1) + r = self.reg_dw(F.pad(self.reg_conv(p), (1, 1, 1, 1))) + r = _dcr_depth_to_space(r, r=2, c_out=16) + B = c.shape[0] + cls_out = c.permute(0, 2, 3, 1).reshape(B, _BF_FR_NUM_ANCHORS, 1) + reg_out = r.permute(0, 2, 3, 1).reshape(B, _BF_FR_NUM_ANCHORS, 16) + return reg_out, cls_out + + +@lru_cache(maxsize=1) +def _blazeface_full_range_anchors() -> np.ndarray: + """2304 anchors over 48x48; anchor_w=anchor_h=1 (fixed_anchor_size).""" + feat = _BF_FR_GRID + yy, xx = np.meshgrid(np.arange(feat, dtype=np.float32), np.arange(feat, dtype=np.float32), indexing="ij") + cx, cy, ones = (xx + 0.5) / feat, (yy + 0.5) / feat, np.ones_like(xx) + return np.stack([cx, cy, ones, ones], axis=-1).reshape(_BF_FR_NUM_ANCHORS, 4) + + +def _decode_blazeface_full_range(regressors: np.ndarray, classificators: np.ndarray, + score_thresh: float = _BF_MIN_SCORE) -> np.ndarray: + """Same decode as short-range with 2304-anchor grid and box_scale=192.""" + scores = expit(np.clip(classificators[:, 0], -_BF_FR_SCORE_CLIP, _BF_FR_SCORE_CLIP)) + keep = scores >= score_thresh + if not keep.any(): + return np.empty((0, 17), dtype=np.float32) + r = regressors[keep] / _BF_FR_BOX_SCALE + a = _blazeface_full_range_anchors()[keep] + cxs, cys, aws, ahs = a[:, 0:1], a[:, 1:2], a[:, 2:3], a[:, 3:4] + xc, yc = r[:, 0:1] * aws + cxs, r[:, 1:2] * ahs + cys + w, h = r[:, 2:3] * aws, r[:, 3:4] * ahs + out = np.empty((r.shape[0], 
17), dtype=np.float32) + out[:, 0:1], out[:, 1:2], out[:, 2:3], out[:, 3:4] = xc - w / 2, yc - h / 2, xc + w / 2, yc + h / 2 + out[:, 4:16:2] = r[:, _BF_KP_OFFSET::2] * aws + cxs + out[:, 5:16:2] = r[:, _BF_KP_OFFSET + 1::2] * ahs + cys + out[:, 16] = scores[keep] + return out + + +# FaceMesh (face_landmarks_detector.tflite): PReLU variant of BlazeBlock, +# 17 blocks, heads for 478x3 landmarks + presence. +_FACEMESH_BLOCKS = [ # (in_ch, out_ch, stride) + (16, 16, 1), (16, 16, 1), (16, 32, 2), (32, 32, 1), (32, 32, 1), (32, 64, 2), + (64, 64, 1), (64, 64, 1), (64, 128, 2), (128, 128, 1), (128, 128, 1), (128, 128, 2), + (128, 128, 1), (128, 128, 1), (128, 128, 2), (128, 128, 1), (128, 128, 1), +] + + +class FaceMeshBlock(nn.Module): + """PReLU BlazeBlock: PReLU between DW and PW, and after the residual add.""" + + def __init__(self, in_ch: int, out_ch: int, stride: int, device=None, dtype=None, operations=None): + super().__init__() + ops = operations if operations is not None else nn + kw = dict(device=device, dtype=dtype) + self.in_ch, self.out_ch, self.stride = in_ch, out_ch, stride + self.depthwise = ops.Conv2d(in_ch, in_ch, 3, stride=stride, padding=0, groups=in_ch, bias=True, **kw) + self.prelu_dwise = nn.PReLU(num_parameters=in_ch, **kw) + self.pointwise = ops.Conv2d(in_ch, out_ch, 1, padding=0, bias=True, **kw) + self.prelu_out = nn.PReLU(num_parameters=out_ch, **kw) + + def forward(self, x: Tensor) -> Tensor: + residual = F.max_pool2d(x, 2, 2) if self.stride > 1 else x + if self.out_ch > self.in_ch: + residual = F.pad(residual, (0, 0, 0, 0, 0, self.out_ch - self.in_ch)) + x = _tf_same_pad(x, 3, self.stride) if self.stride > 1 else F.pad(x, (1, 1, 1, 1)) + return self.prelu_out(self.pointwise(self.prelu_dwise(self.depthwise(x))) + residual) + + +class FaceMesh(nn.Module): + NUM_LANDMARKS = 478 + + def __init__(self, device=None, dtype=None, operations=None): + super().__init__() + ops = operations if operations is not None else nn + kw = dict(device=device, 
dtype=dtype) + self.stem = ops.Conv2d(3, 16, 3, stride=2, padding=0, bias=True, **kw) + self.prelu_stem = nn.PReLU(num_parameters=16, **kw) + self.blocks = nn.ModuleList(FaceMeshBlock(i, o, s, device=device, dtype=dtype, operations=operations) + for (i, o, s) in _FACEMESH_BLOCKS) + self.head_reduce = ops.Conv2d(128, 8, 1, padding=0, bias=True, **kw) + self.prelu_head_reduce = nn.PReLU(num_parameters=8, **kw) + self.head_block = FaceMeshBlock(8, 8, 1, device=device, dtype=dtype, operations=operations) + self.head_presence = ops.Conv2d(8, 1, 3, padding=0, bias=True, **kw) + self.head_landmarks = ops.Conv2d(8, self.NUM_LANDMARKS * 3, 3, padding=0, bias=True, **kw) + + def forward(self, face_chw_normalized: Tensor) -> tuple[Tensor, Tensor]: + """(B, 3, 192, 192) in [0, 1] → ((B, 478, 3) landmarks in 192-canonical, (B,) presence).""" + x = self.prelu_stem(self.stem(_tf_same_pad(face_chw_normalized, 3, 2))) + for blk in self.blocks: + x = blk(x) + x = self.prelu_head_reduce(self.head_reduce(x)) + x = self.head_block(x) + B = x.shape[0] + presence = self.head_presence(x).reshape(B) + lmks = self.head_landmarks(x).reshape(B, self.NUM_LANDMARKS, 3) + return lmks, presence + + +# FaceBlendshapes (MLP-Mixer "GhumMarkerPoserMlpMixerGeneral"): +# 146x2 → token-reduce 146→96 → embed 2→64 → +cls token → 4x mixer → cls→52. +_BS_NUM_INPUT_LANDMARKS = 146 +_BS_NUM_TOKENS_REDUCED = 96 +_BS_NUM_TOKENS = 97 # +1 cls +_BS_TOKEN_DIM = 64 +_BS_TOKEN_MIX_HIDDEN = 384 +_BS_CHANNEL_MIX_HIDDEN = 256 +_BS_NUM_BLENDSHAPES = 52 +_BS_LN_EPS = 1e-6 + + +class MlpMixerBlock(nn.Module): + """MLP-Mixer block: token-mixing MLP (over tokens) → channel-mixing MLP (over dim). + Both pre-LN, both residual. 
LN has no beta (bias=False) to match MP.""" + + def __init__(self, num_tokens: int, token_dim: int, token_hidden: int, channel_hidden: int, + device=None, dtype=None, operations=None): + super().__init__() + ops = operations if operations is not None else nn + kw = dict(device=device, dtype=dtype) + # bias=False → no LN beta (matches MP). + self.ln1 = ops.LayerNorm(token_dim, eps=_BS_LN_EPS, bias=False, **kw) + self.ln2 = ops.LayerNorm(token_dim, eps=_BS_LN_EPS, bias=False, **kw) + self.token_mlp1 = ops.Linear(num_tokens, token_hidden, bias=True, **kw) + self.token_mlp2 = ops.Linear(token_hidden, num_tokens, bias=True, **kw) + self.channel_mlp1 = ops.Linear(token_dim, channel_hidden, bias=True, **kw) + self.channel_mlp2 = ops.Linear(channel_hidden, token_dim, bias=True, **kw) + + def forward(self, x: Tensor) -> Tensor: + y = self.ln1(x).transpose(1, 2) + x = x + self.token_mlp2(F.relu(self.token_mlp1(y))).transpose(1, 2) + return x + self.channel_mlp2(F.relu(self.channel_mlp1(self.ln2(x)))) + + +class FaceBlendshapes(nn.Module): + def __init__(self, device=None, dtype=None, operations=None): + super().__init__() + ops = operations if operations is not None else nn + kw = dict(device=device, dtype=dtype) + self.token_reduce = ops.Linear(_BS_NUM_INPUT_LANDMARKS, _BS_NUM_TOKENS_REDUCED, bias=True, **kw) + self.token_embed = ops.Linear(2, _BS_TOKEN_DIM, bias=True, **kw) + self.cls_token = nn.Parameter(torch.zeros(1, 1, _BS_TOKEN_DIM, **kw)) + self.blocks = nn.ModuleList( + MlpMixerBlock(_BS_NUM_TOKENS, _BS_TOKEN_DIM, _BS_TOKEN_MIX_HIDDEN, _BS_CHANNEL_MIX_HIDDEN, + device=device, dtype=dtype, operations=operations) for _ in range(4) + ) + self.head = ops.Linear(_BS_TOKEN_DIM, _BS_NUM_BLENDSHAPES, bias=True, **kw) + + @staticmethod + def _input_normalize(landmarks_2d: Tensor) -> Tensor: + # Centroid-subtract → L2 scale → x0.5. The 0.5 is baked into training. 
+ centroid = landmarks_2d.mean(dim=1, keepdim=True) + x = landmarks_2d - centroid + mag = torch.sqrt((x * x).sum(dim=-1, keepdim=True)) + scale = mag.mean(dim=1, keepdim=True) + return (x / scale.clamp(min=1e-12)) * 0.5 + + def forward(self, landmarks_2d: Tensor) -> Tensor: + """(B, 146, 2) → (B, 52) in [0, 1]. Input units don't matter (centroid + L2 normalize).""" + x = self._input_normalize(landmarks_2d) + x = self.token_reduce(x.transpose(1, 2)).transpose(1, 2) + x = self.token_embed(x) + cls = self.cls_token.expand(x.shape[0], -1, -1) + x = torch.cat([cls, x], dim=1) + for blk in self.blocks: + x = blk(x) + return torch.sigmoid(self.head(x[:, 0])) + + +@lru_cache(maxsize=1) +def _blazeface_anchors() -> np.ndarray: + """896 anchors per SsdAnchorsCalculator (fixed_anchor_size → anchor_w=anchor_h=1).""" + per_ar = len(_BF_ASPECT_RATIOS) + (1 if _BF_INTERP_SCALE_AR > 0 else 0) + layer_anchors: List[np.ndarray] = [] + layer = 0 + while layer < _BF_NUM_LAYERS: + stride = _BF_STRIDES[layer] + last = layer + while last < _BF_NUM_LAYERS and _BF_STRIDES[last] == stride: + last += 1 + per_cell = per_ar * (last - layer) + feat = (_BF_INPUT_SIZE + stride - 1) // stride + yy, xx = np.meshgrid(np.arange(feat, dtype=np.float32), np.arange(feat, dtype=np.float32), indexing="ij") + cx, cy, ones = (xx + _BF_ANCHOR_OFFSET_X) / feat, (yy + _BF_ANCHOR_OFFSET_Y) / feat, np.ones_like(xx) + cell = np.stack([cx, cy, ones, ones], axis=-1).reshape(-1, 4) + layer_anchors.append(np.repeat(cell, per_cell, axis=0)) + layer = last + out = np.concatenate(layer_anchors, axis=0) + assert out.shape == (896, 4), out.shape + return out + + +def _decode_blazeface(regressors: np.ndarray, classificators: np.ndarray, + score_thresh: float = _BF_MIN_SCORE) -> np.ndarray: + """Decode (regs (896,16), cls (896,1)) → (N, 17) = [xyxy, kp0x..kp5y, score] in [0, 1].""" + scores = expit(np.clip(classificators[:, 0], -_BF_SCORE_CLIP, _BF_SCORE_CLIP)) + keep = scores >= score_thresh + if not keep.any(): + return 
np.empty((0, 17), dtype=np.float32) + r = regressors[keep] / _BF_BOX_SCALE + a = _blazeface_anchors()[keep] # (N, 4) cx, cy, 1, 1 + cxs, cys, aws, ahs = a[:, 0:1], a[:, 1:2], a[:, 2:3], a[:, 3:4] + xc, yc = r[:, 0:1] * aws + cxs, r[:, 1:2] * ahs + cys + w, h = r[:, 2:3] * aws, r[:, 3:4] * ahs + out = np.empty((r.shape[0], 17), dtype=np.float32) + out[:, 0:1], out[:, 1:2], out[:, 2:3], out[:, 3:4] = xc - w / 2, yc - h / 2, xc + w / 2, yc + h / 2 + out[:, 4:16:2] = r[:, _BF_KP_OFFSET::2] * aws + cxs + out[:, 5:16:2] = r[:, _BF_KP_OFFSET + 1::2] * ahs + cys + out[:, 16] = scores[keep] + return out + + +def _weighted_nms(detections: np.ndarray, iou_thresh: float = 0.5) -> np.ndarray: + """MP weighted NMS — kept boxes are score-weighted averages of overlapping detections.""" + if detections.shape[0] == 0: + return detections + dets = detections[np.argsort(-detections[:, 16])] + N = dets.shape[0] + areas = np.clip(dets[:, 2] - dets[:, 0], 0, None) * np.clip(dets[:, 3] - dets[:, 1], 0, None) + kept: List[np.ndarray] = [] + used = np.zeros(N, dtype=bool) + for i in range(N): + if used[i]: + continue + ax1, ay1, ax2, ay2 = dets[i, 0:4] + merge_idx = [i] + for j in range(i + 1, N): + if used[j]: + continue + bx1, by1, bx2, by2 = dets[j, 0:4] + iw = max(0.0, min(ax2, bx2) - max(ax1, bx1)) + ih = max(0.0, min(ay2, by2) - max(ay1, by1)) + inter = iw * ih + union = areas[i] + areas[j] - inter + if union > 0 and inter / union > iou_thresh: # strict > matches MP + merge_idx.append(j) + used[j] = True + used[i] = True + cluster = dets[merge_idx] + ws = cluster[:, 16:17] + ws_sum = ws.sum() + merged = np.copy(cluster[0]) + if ws_sum > 0: + merged[:16] = (cluster[:, :16] * ws).sum(axis=0) / ws_sum + kept.append(merged) + return np.stack(kept, axis=0) if kept else np.empty((0, 17), dtype=np.float32) + + +def _detection_to_face_rect(detection: np.ndarray, image_w: int, image_h: int) -> Tuple[float, float, float, float, float]: + """Detection (normalized) → rotated 1.5xbbox ROI in image 
pixels (anisotropic).""" + xmin, ymin, xmax, ymax = detection[0:4] + lx = detection[4 + _FACE_LEFT_EYE_KP * 2 + 0] * image_w + ly = detection[4 + _FACE_LEFT_EYE_KP * 2 + 1] * image_h + rx = detection[4 + _FACE_RIGHT_EYE_KP * 2 + 0] * image_w + ry = detection[4 + _FACE_RIGHT_EYE_KP * 2 + 1] * image_h + # Image-y-down convention: angle = target - atan2(-dy, dx). + angle = _FACE_ROI_TARGET_ANGLE - math.atan2(ly - ry, rx - lx) + return (float((xmin + xmax) * 0.5 * image_w), + float((ymin + ymax) * 0.5 * image_h), + float((xmax - xmin) * image_w * _FACE_ROI_SCALE_X), + float((ymax - ymin) * image_h * _FACE_ROI_SCALE_Y), + float(angle)) + + +def _sample_warp(image_chw: Tensor, src_x: Tensor, src_y: Tensor, padding_mode: str) -> Tensor: + """Bilinear-sample image_chw at corner-aligned (src_x, src_y).""" + H, W = int(image_chw.shape[-2]), int(image_chw.shape[-1]) + grid = torch.stack([(2.0 * src_x + 1.0) / W - 1.0, + (2.0 * src_y + 1.0) / H - 1.0], dim=-1).unsqueeze(0) + return F.grid_sample(image_chw.unsqueeze(0), grid, mode="bilinear", + align_corners=False, padding_mode=padding_mode).squeeze(0) + + +def _warp_face_crop(image_chw: Tensor, cx: float, cy: float, width: float, height: float, + angle: float, output_size: int = _FM_INPUT_SIZE) -> Tensor: + """Rotated rect → output_size² with BORDER_REPLICATE. image_chw must be in [0, 1].""" + s_x, s_y = width / output_size, height / output_size + cos_a, sin_a = math.cos(angle), math.sin(angle) + arange = torch.arange(output_size, dtype=image_chw.dtype, device=image_chw.device) - output_size * 0.5 + v_grid, u_grid = torch.meshgrid(arange, arange, indexing="ij") + src_x = cx + u_grid * s_x * cos_a - v_grid * s_y * sin_a + src_y = cy + u_grid * s_x * sin_a + v_grid * s_y * cos_a + return _sample_warp(image_chw, src_x, src_y, "border") + + +def _blazeface_input_warp(image_chw_raw: Tensor, target: int = _BF_INPUT_SIZE) -> Tuple[Tensor, float, float, float]: + """Centered max(W,H) square → target² with BORDER_ZERO + [-1, 1] norm. 
+ + Sub-pixel grid_sample matters; integer-pad-then-resize drifts the bbox ~5%. + Returns (warped, sub_rect_cx, sub_rect_cy, sub_rect_size) — the triplet maps + tensor-normalized [0,1] detections back to image pixels. + """ + H, W = int(image_chw_raw.shape[1]), int(image_chw_raw.shape[2]) + sub_rect_size = float(max(W, H)) + sub_rect_cx, sub_rect_cy = W * 0.5, H * 0.5 + s = sub_rect_size / target + arange = torch.arange(target, dtype=image_chw_raw.dtype, device=image_chw_raw.device) - target * 0.5 + v_grid, u_grid = torch.meshgrid(arange, arange, indexing="ij") + out = _sample_warp(image_chw_raw, sub_rect_cx + u_grid * s, sub_rect_cy + v_grid * s, "zeros") + return (out / 127.5) - 1.0, sub_rect_cx, sub_rect_cy, sub_rect_size + + +class FaceLandmarker(nn.Module): + """BlazeFace → FaceMesh v2 → blendshapes. `detector_variant` selects 'short' + (128², ≤2m) or 'full' (192² FPN, ≤5m). State dict uses inner-module prefixes + `detector.*` / `mesh.*` / `blendshapes.*`; the outer FaceLandmarkerModel + wrapper rewrites `detector_{variant}.*` keys to `detector.*` before loading. + """ + + def __init__(self, device=None, dtype=None, operations=None, detector_variant: str = "short"): + super().__init__() + det_cls = {"short": BlazeFace, "full": BlazeFaceFullRange}.get(detector_variant) + + self.detector_variant = detector_variant + self.detector = det_cls(device=device, dtype=dtype, operations=operations) + self.mesh = FaceMesh(device=device, dtype=dtype, operations=operations) + self.blendshapes = FaceBlendshapes(device=device, dtype=dtype, operations=operations) + self.register_buffer("_bs_idx", torch.tensor(_BS_INPUT_INDICES, dtype=torch.long), persistent=False) + + def run_detector_batch(self, images_rgb_uint8: List[np.ndarray], + score_thresh: float = _BF_MIN_SCORE, + iou_thresh: float = 0.5): + """Batched detector pass. 
Returns (img_raws, sub_rects, sizes, per_frame_decoded) + where per_frame_decoded[b] is (N, 17) in tensor-normalized [0,1] coords.""" + if not images_rgb_uint8: + return [], [], [], [] + device, dtype = self.detector.stem.weight.device, self.detector.stem.weight.dtype + det_input_size, decode_fn = ((_BF_FR_INPUT_SIZE, _decode_blazeface_full_range) + if self.detector_variant == "full" + else (_BF_INPUT_SIZE, _decode_blazeface)) + + # Same-size frames: stack once and transfer once. Variable size falls back + # to per-image (only triggers for SAM3DBody's head crops). + sizes = [tuple(img.shape[:2]) for img in images_rgb_uint8] + if len(set(sizes)) == 1: + batch_chw = torch.from_numpy(np.stack(images_rgb_uint8, axis=0)).to(device, dtype).movedim(-1, -3).contiguous() + img_raws = [batch_chw[bi] for bi in range(batch_chw.shape[0])] + else: + img_raws = [torch.from_numpy(img).to(device, dtype).movedim(-1, -3).contiguous() for img in images_rgb_uint8] + + warps = [_blazeface_input_warp(img_raw, det_input_size) for img_raw in img_raws] + det_crops = [w[0] for w in warps] + sub_rects = [(w[1], w[2], w[3]) for w in warps] + + regs_b, cls_b = self.detector(torch.stack(det_crops, dim=0)) + regs_np, cls_np = regs_b.float().cpu().numpy(), cls_b.float().cpu().numpy() + per_frame = [] + for b in range(len(images_rgb_uint8)): + decoded = decode_fn(regs_np[b], cls_np[b], score_thresh=score_thresh) + per_frame.append(_weighted_nms(decoded, iou_thresh=iou_thresh) if decoded.shape[0] > 0 else decoded) + return img_raws, sub_rects, sizes, per_frame + + def detect_batch(self, images_rgb_uint8: List[np.ndarray], num_faces: int = 1, + score_thresh: float = _BF_MIN_SCORE) -> List[List[dict]]: + """Full pipeline batched across `images_rgb_uint8`. Returns one face-dict + list per image (empty if nothing detected). 
Face dict: + bbox_xyxy (4,) image pixels, blendshapes {52} ∈ [0,1], + landmarks_xy (478, 2) image pixels, landmarks_3d (478, 3) in + 192-canonical (pre-transformation) units, presence float (raw logit). + """ + img_raws, sub_rects, sizes, per_frame_dets = self.run_detector_batch( + images_rgb_uint8, score_thresh=score_thresh, + ) + # tensor-normalized → image-normalized [0,1] for _detection_to_face_rect. + for b, decoded in enumerate(per_frame_dets): + if decoded.shape[0] == 0: + continue + cx, cy, size = sub_rects[b] + H, W = sizes[b] + sx0, sy0 = cx - size * 0.5, cy - size * 0.5 + decoded[:, 0:16:2] = (sx0 + size * decoded[:, 0:16:2]) / W + decoded[:, 1:16:2] = (sy0 + size * decoded[:, 1:16:2]) / H + if num_faces > 0: + per_frame_dets[b] = decoded[: int(num_faces)] + + # Collect every detected face across all frames into one mesh input. + face_params: List[Tuple[int, float, float, float, float, float, float]] = [] + mesh_crops: List[Tensor] = [] + for b, dets in enumerate(per_frame_dets): + if dets.shape[0] == 0: + continue + H, W = sizes[b] + img_for_mesh = img_raws[b] / 255.0 + for det in dets: + cx, cy, w, h, angle = _detection_to_face_rect(det, W, H) + mesh_crops.append(_warp_face_crop(img_for_mesh, cx, cy, w, h, angle, _FM_INPUT_SIZE)) + face_params.append((b, float(det[16]), cx, cy, w, h, angle)) + + results: List[List[dict]] = [[] for _ in range(len(images_rgb_uint8))] + if not mesh_crops: + return results + + lmks_canon_b, presence_b = self.mesh(torch.stack(mesh_crops, dim=0)) + bs_out_b = self.blendshapes(lmks_canon_b[:, self._bs_idx, :2]) + + # Batched canonical→image affine + params_t = torch.tensor( + [(cx, cy, w, h, math.cos(a), math.sin(a)) for (_b, _s, cx, cy, w, h, a) in face_params], + device=lmks_canon_b.device, dtype=lmks_canon_b.dtype, + ) + cxs, cys, ws, hs, cos_a, sin_a = params_t.unbind(dim=1) + inv = 1.0 / _FM_INPUT_SIZE + u = lmks_canon_b[..., 0] - _FM_INPUT_SIZE * 0.5 + v = lmks_canon_b[..., 1] - _FM_INPUT_SIZE * 0.5 + lmks_xy_t = 
torch.stack([ + cxs[:, None] + u * (ws * inv * cos_a)[:, None] - v * (hs * inv * sin_a)[:, None], + cys[:, None] + u * (ws * inv * sin_a)[:, None] + v * (hs * inv * cos_a)[:, None], + ], dim=-1) + + lmks_xy_np = lmks_xy_t.float().cpu().numpy() + lmks_canon_np = lmks_canon_b.float().cpu().numpy() + presence_np = presence_b.float().cpu().numpy() + bs_np = bs_out_b.float().cpu().numpy() + + for i, (b, score, *_) in enumerate(face_params): + lmks_xy = lmks_xy_np[i] + mn, mx = lmks_xy.min(0), lmks_xy.max(0) + results[b].append({ + "bbox_xyxy": np.array([mn[0], mn[1], mx[0], mx[1]], dtype=np.float32), + "blendshapes": dict(zip(BLENDSHAPE_NAMES, bs_np[i].tolist())), + "landmarks_xy": lmks_xy, + "landmarks_3d": lmks_canon_np[i], + "presence": float(presence_np[i]), + "score": score, + }) + return results diff --git a/comfy_extras/nodes_ace.py b/comfy_extras/nodes_ace.py index 247d9ae8a..044077b18 100644 --- a/comfy_extras/nodes_ace.py +++ b/comfy_extras/nodes_ace.py @@ -11,7 +11,7 @@ class TextEncodeAceStepAudio(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="TextEncodeAceStepAudio", - category="conditioning", + category="model/conditioning", inputs=[ IO.Clip.Input("clip"), IO.String.Input("tags", multiline=True, dynamic_prompts=True), @@ -33,7 +33,7 @@ class TextEncodeAceStepAudio15(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="TextEncodeAceStepAudio1.5", - category="conditioning", + category="model/conditioning", inputs=[ IO.Clip.Input("clip"), IO.String.Input("tags", multiline=True, dynamic_prompts=True), @@ -67,7 +67,7 @@ class EmptyAceStepLatentAudio(IO.ComfyNode): return IO.Schema( node_id="EmptyAceStepLatentAudio", display_name="Empty Ace Step 1.0 Latent Audio", - category="latent/audio", + category="model/latent/audio", inputs=[ IO.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.1), IO.Int.Input( @@ -90,7 +90,7 @@ class EmptyAceStep15LatentAudio(IO.ComfyNode): return IO.Schema( 
node_id="EmptyAceStep1.5LatentAudio", display_name="Empty Ace Step 1.5 Latent Audio", - category="latent/audio", + category="model/latent/audio", inputs=[ IO.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.01), IO.Int.Input( diff --git a/comfy_extras/nodes_advanced_samplers.py b/comfy_extras/nodes_advanced_samplers.py index 20717ca38..77a561e30 100644 --- a/comfy_extras/nodes_advanced_samplers.py +++ b/comfy_extras/nodes_advanced_samplers.py @@ -45,7 +45,7 @@ class SamplerLCMUpscale(io.ComfyNode): def define_schema(cls) -> io.Schema: return io.Schema( node_id="SamplerLCMUpscale", - category="sampling/samplers", + category="model/sampling/samplers", inputs=[ io.Float.Input("scale_ratio", default=1.0, min=0.1, max=20.0, step=0.01, advanced=True), io.Int.Input("scale_steps", default=-1, min=-1, max=1000, step=1, advanced=True), @@ -91,7 +91,7 @@ class SamplerLCM(io.ComfyNode): def define_schema(cls) -> io.Schema: return io.Schema( node_id="SamplerLCM", - category="sampling/samplers", + category="model/sampling/samplers", description=("LCM sampler with tunable per-step noise. 
s_noise is a multiplier on the model's training noise scale"), inputs=[ io.Float.Input("s_noise", default=1.0, min=0.0, max=64.0, step=0.01, diff --git a/comfy_extras/nodes_align_your_steps.py b/comfy_extras/nodes_align_your_steps.py index 307f41337..f89a809bb 100644 --- a/comfy_extras/nodes_align_your_steps.py +++ b/comfy_extras/nodes_align_your_steps.py @@ -29,7 +29,7 @@ class AlignYourStepsScheduler(io.ComfyNode): return io.Schema( node_id="AlignYourStepsScheduler", search_aliases=["AYS scheduler"], - category="sampling/schedulers", + category="model/sampling/schedulers", inputs=[ io.Combo.Input("model_type", options=["SD1", "SDXL", "SVD"]), io.Int.Input("steps", default=10, min=1, max=10000), diff --git a/comfy_extras/nodes_apg.py b/comfy_extras/nodes_apg.py index fd561d360..4a352038a 100644 --- a/comfy_extras/nodes_apg.py +++ b/comfy_extras/nodes_apg.py @@ -16,7 +16,7 @@ class APG(io.ComfyNode): return io.Schema( node_id="APG", display_name="Adaptive Projected Guidance", - category="sampling/custom_sampling", + category="model/sampling/custom_sampling", inputs=[ io.Model.Input("model"), io.Float.Input( diff --git a/comfy_extras/nodes_ar_video.py b/comfy_extras/nodes_ar_video.py index 1a15facfa..c22359eb2 100644 --- a/comfy_extras/nodes_ar_video.py +++ b/comfy_extras/nodes_ar_video.py @@ -19,7 +19,7 @@ class EmptyARVideoLatent(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="EmptyARVideoLatent", - category="latent/video", + category="model/latent/video", inputs=[ io.Int.Input("width", default=832, min=16, max=8192, step=16), io.Int.Input("height", default=480, min=16, max=8192, step=16), @@ -53,7 +53,7 @@ class SamplerARVideo(io.ComfyNode): return io.Schema( node_id="SamplerARVideo", display_name="Sampler AR Video", - category="sampling/samplers", + category="model/sampling/samplers", inputs=[ io.Int.Input( "num_frame_per_block", @@ -85,7 +85,7 @@ class ARVideoI2V(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="ARVideoI2V", - 
category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Model.Input("model"), io.Vae.Input("vae"), diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py index 2d6b3c7ea..ff078f74c 100644 --- a/comfy_extras/nodes_audio.py +++ b/comfy_extras/nodes_audio.py @@ -1,5 +1,3 @@ -from __future__ import annotations - import av import torchaudio import torch @@ -18,7 +16,7 @@ class EmptyLatentAudio(IO.ComfyNode): return IO.Schema( node_id="EmptyLatentAudio", display_name="Empty Latent Audio", - category="latent/audio", + category="model/latent/audio", essentials_category="Audio", inputs=[ IO.Float.Input("seconds", default=47.6, min=1.0, max=1000.0, step=0.1), @@ -43,7 +41,7 @@ class ConditioningStableAudio(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="ConditioningStableAudio", - category="conditioning", + category="model/conditioning", inputs=[ IO.Conditioning.Input("positive"), IO.Conditioning.Input("negative"), @@ -72,7 +70,7 @@ class VAEEncodeAudio(IO.ComfyNode): node_id="VAEEncodeAudio", search_aliases=["audio to latent"], display_name="VAE Encode Audio", - category="latent/audio", + category="model/latent/audio", inputs=[ IO.Audio.Input("audio"), IO.Vae.Input("vae"), @@ -117,7 +115,7 @@ class VAEDecodeAudio(IO.ComfyNode): node_id="VAEDecodeAudio", search_aliases=["latent to audio"], display_name="VAE Decode Audio", - category="latent/audio", + category="model/latent/audio", inputs=[ IO.Latent.Input("samples"), IO.Vae.Input("vae"), @@ -139,7 +137,7 @@ class VAEDecodeAudioTiled(IO.ComfyNode): node_id="VAEDecodeAudioTiled", search_aliases=["latent to audio"], display_name="VAE Decode Audio (Tiled)", - category="latent/audio", + category="model/latent/audio", inputs=[ IO.Latent.Input("samples"), IO.Vae.Input("vae"), @@ -543,7 +541,7 @@ class AudioConcat(IO.ComfyNode): return IO.Schema( node_id="AudioConcat", search_aliases=["join audio", "combine audio", "append audio"], - display_name="Audio 
Concat", + display_name="Concatenate Audio", description="Concatenates the audio1 to audio2 in the specified direction.", category="audio", inputs=[ @@ -597,7 +595,7 @@ class AudioMerge(IO.ComfyNode): return IO.Schema( node_id="AudioMerge", search_aliases=["mix audio", "overlay audio", "layer audio"], - display_name="Audio Merge", + display_name="Merge Audio", description="Combine two audio tracks by overlaying their waveforms.", category="audio", inputs=[ @@ -667,8 +665,9 @@ class AudioAdjustVolume(IO.ComfyNode): return IO.Schema( node_id="AudioAdjustVolume", search_aliases=["audio gain", "loudness", "audio level"], - display_name="Audio Adjust Volume", + display_name="Adjust Audio Volume", category="audio", + description="Adjust the volume of the audio by a specified amount in decibels (dB).", inputs=[ IO.Audio.Input("audio"), IO.Int.Input( diff --git a/comfy_extras/nodes_audio_encoder.py b/comfy_extras/nodes_audio_encoder.py index 6a85da89b..2ae30d321 100644 --- a/comfy_extras/nodes_audio_encoder.py +++ b/comfy_extras/nodes_audio_encoder.py @@ -11,7 +11,7 @@ class AudioEncoderLoader(io.ComfyNode): return io.Schema( node_id="AudioEncoderLoader", display_name="Load Audio Encoder", - category="loaders", + category="model/loaders", inputs=[ io.Combo.Input( "audio_encoder_name", @@ -36,7 +36,7 @@ class AudioEncoderEncode(io.ComfyNode): def define_schema(cls) -> io.Schema: return io.Schema( node_id="AudioEncoderEncode", - category="conditioning", + category="model/conditioning", inputs=[ io.AudioEncoder.Input("audio_encoder"), io.Audio.Input("audio"), diff --git a/comfy_extras/nodes_bg_removal.py b/comfy_extras/nodes_bg_removal.py index 793fd802b..9dc9ad854 100644 --- a/comfy_extras/nodes_bg_removal.py +++ b/comfy_extras/nodes_bg_removal.py @@ -11,7 +11,7 @@ class LoadBackgroundRemovalModel(IO.ComfyNode): return IO.Schema( node_id="LoadBackgroundRemovalModel", display_name="Load Background Removal Model", - category="loaders", + category="model/loaders", inputs=[ 
IO.Combo.Input("bg_removal_name", options=sorted(files), tooltip="The model used to remove backgrounds from images"), ], diff --git a/comfy_extras/nodes_camera_trajectory.py b/comfy_extras/nodes_camera_trajectory.py index 34b78e81b..13a1448f4 100644 --- a/comfy_extras/nodes_camera_trajectory.py +++ b/comfy_extras/nodes_camera_trajectory.py @@ -153,7 +153,7 @@ class WanCameraEmbedding(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanCameraEmbedding", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Combo.Input( "camera_pose", diff --git a/comfy_extras/nodes_cfg.py b/comfy_extras/nodes_cfg.py index 4ebb4b51e..b585c560f 100644 --- a/comfy_extras/nodes_cfg.py +++ b/comfy_extras/nodes_cfg.py @@ -57,24 +57,55 @@ class CFGNorm(io.ComfyNode): inputs=[ io.Model.Input("model"), io.Float.Input("strength", default=1.0, min=0.0, max=100.0, step=0.01), + io.Boolean.Input( + "pre_cfg", + default=False, + optional=True, + tooltip=( + "If true, rescale the combined noise BEFORE the sampler's CFG combine, " + "without clamping (can amplify). Matches the norm-scaled CFG used by " + "models like Lens. Default false keeps the original post-CFG x0-space " + "attenuate-only behavior." 
+ ), + ), ], outputs=[io.Model.Output(display_name="patched_model")], is_experimental=True, ) @classmethod - def execute(cls, model, strength) -> io.NodeOutput: + def execute(cls, model, strength, pre_cfg=False) -> io.NodeOutput: m = model.clone() - def cfg_norm(args): - cond_p = args['cond_denoised'] - pred_text_ = args["denoised"] + if pre_cfg: + def cfg_norm_pre(args): + cond = args["cond"] + uncond = args["uncond"] + cond_scale = args["cond_scale"] + comb = uncond + cond_scale * (cond - uncond) + cond_norm = torch.linalg.vector_norm(cond, dim=1, keepdim=True) + comb_norm = torch.linalg.vector_norm(comb, dim=1, keepdim=True) + rescale = torch.where( + comb_norm > 0, + cond_norm / comb_norm.clamp_min(1e-12), + torch.ones_like(comb_norm), + ) + rescaled = comb * rescale + # strength blends back toward standard linear CFG (1.0 = full rescale). + if strength != 1.0: + rescaled = strength * rescaled + (1.0 - strength) * comb + return rescaled + m.set_model_sampler_cfg_function(cfg_norm_pre) + else: + def cfg_norm(args): + cond_p = args['cond_denoised'] + pred_text_ = args["denoised"] - norm_full_cond = torch.norm(cond_p, dim=1, keepdim=True) - norm_pred_text = torch.norm(pred_text_, dim=1, keepdim=True) - scale = (norm_full_cond / (norm_pred_text + 1e-8)).clamp(min=0.0, max=1.0) - return pred_text_ * scale * strength + norm_full_cond = torch.norm(cond_p, dim=1, keepdim=True) + norm_pred_text = torch.norm(pred_text_, dim=1, keepdim=True) + scale = (norm_full_cond / (norm_pred_text + 1e-8)).clamp(min=0.0, max=1.0) + return pred_text_ * scale * strength - m.set_model_sampler_post_cfg_function(cfg_norm) + m.set_model_sampler_post_cfg_function(cfg_norm) return io.NodeOutput(m) diff --git a/comfy_extras/nodes_chroma_radiance.py b/comfy_extras/nodes_chroma_radiance.py index 509436062..ca427e5cb 100644 --- a/comfy_extras/nodes_chroma_radiance.py +++ b/comfy_extras/nodes_chroma_radiance.py @@ -13,7 +13,7 @@ class EmptyChromaRadianceLatentImage(io.ComfyNode): def 
define_schema(cls) -> io.Schema: return io.Schema( node_id="EmptyChromaRadianceLatentImage", - category="latent/chroma_radiance", + category="model/latent/chroma_radiance", inputs=[ io.Int.Input(id="width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16), io.Int.Input(id="height", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16), @@ -33,7 +33,7 @@ class ChromaRadianceOptions(io.ComfyNode): def define_schema(cls) -> io.Schema: return io.Schema( node_id="ChromaRadianceOptions", - category="model_patches/chroma_radiance", + category="model/patch/chroma_radiance", description="Allows setting advanced options for the Chroma Radiance model.", inputs=[ io.Model.Input(id="model"), diff --git a/comfy_extras/nodes_color.py b/comfy_extras/nodes_color.py index 80ba121cd..01a05035e 100644 --- a/comfy_extras/nodes_color.py +++ b/comfy_extras/nodes_color.py @@ -8,7 +8,7 @@ class ColorToRGBInt(io.ComfyNode): return io.Schema( node_id="ColorToRGBInt", display_name="Color to RGB Int", - category="utils", + category="utilities", description="Convert a color to a RGB integer value.", inputs=[ io.Color.Input("color"), diff --git a/comfy_extras/nodes_context_windows.py b/comfy_extras/nodes_context_windows.py index f7ca833dc..d9e32b9d9 100644 --- a/comfy_extras/nodes_context_windows.py +++ b/comfy_extras/nodes_context_windows.py @@ -1,4 +1,3 @@ -from __future__ import annotations from comfy_api.latest import ComfyExtension, io import comfy.context_windows import nodes @@ -10,7 +9,7 @@ class ContextWindowsManualNode(io.ComfyNode): return io.Schema( node_id="ContextWindowsManual", display_name="Context Windows (Manual)", - category="model_patches", + category="model/patch", description="Manually set context windows.", inputs=[ io.Model.Input("model", tooltip="The model to apply context windows to during sampling."), diff --git a/comfy_extras/nodes_controlnet.py b/comfy_extras/nodes_controlnet.py index 847cb0bdf..17d965405 100644 --- a/comfy_extras/nodes_controlnet.py +++ 
b/comfy_extras/nodes_controlnet.py @@ -9,7 +9,7 @@ class SetUnionControlNetType(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SetUnionControlNetType", - category="conditioning/controlnet", + category="model/conditioning/controlnet", inputs=[ io.ControlNet.Input("control_net"), io.Combo.Input("type", options=["auto"] + list(UNION_CONTROLNET_TYPES.keys())), @@ -39,7 +39,7 @@ class ControlNetInpaintingAliMamaApply(io.ComfyNode): return io.Schema( node_id="ControlNetInpaintingAliMamaApply", search_aliases=["masked controlnet"], - category="conditioning/controlnet", + category="model/conditioning/controlnet", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), diff --git a/comfy_extras/nodes_cosmos.py b/comfy_extras/nodes_cosmos.py index 7dd129d19..d754ab442 100644 --- a/comfy_extras/nodes_cosmos.py +++ b/comfy_extras/nodes_cosmos.py @@ -13,7 +13,7 @@ class EmptyCosmosLatentVideo(io.ComfyNode): def define_schema(cls) -> io.Schema: return io.Schema( node_id="EmptyCosmosLatentVideo", - category="latent/video", + category="model/latent/video", inputs=[ io.Int.Input("width", default=1280, min=16, max=nodes.MAX_RESOLUTION, step=16), io.Int.Input("height", default=704, min=16, max=nodes.MAX_RESOLUTION, step=16), @@ -45,7 +45,7 @@ class CosmosImageToVideoLatent(io.ComfyNode): def define_schema(cls) -> io.Schema: return io.Schema( node_id="CosmosImageToVideoLatent", - category="conditioning/inpaint", + category="model/conditioning/inpaint", inputs=[ io.Vae.Input("vae"), io.Int.Input("width", default=1280, min=16, max=nodes.MAX_RESOLUTION, step=16), @@ -88,7 +88,7 @@ class CosmosPredict2ImageToVideoLatent(io.ComfyNode): def define_schema(cls) -> io.Schema: return io.Schema( node_id="CosmosPredict2ImageToVideoLatent", - category="conditioning/inpaint", + category="model/conditioning/inpaint", inputs=[ io.Vae.Input("vae"), io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16), diff --git 
a/comfy_extras/nodes_curve.py b/comfy_extras/nodes_curve.py index 9803e8034..aa2d94bb6 100644 --- a/comfy_extras/nodes_curve.py +++ b/comfy_extras/nodes_curve.py @@ -1,5 +1,3 @@ -from __future__ import annotations - import numpy as np from comfy_api.latest import ComfyExtension, io @@ -13,7 +11,7 @@ class CurveEditor(io.ComfyNode): return io.Schema( node_id="CurveEditor", display_name="Curve Editor", - category="utils", + category="utilities", inputs=[ io.Curve.Input("curve"), io.Histogram.Input("histogram", optional=True), @@ -40,7 +38,7 @@ class ImageHistogram(io.ComfyNode): return io.Schema( node_id="ImageHistogram", display_name="Image Histogram", - category="utils", + category="utilities", inputs=[ io.Image.Input("image"), ], diff --git a/comfy_extras/nodes_custom_sampler.py b/comfy_extras/nodes_custom_sampler.py index 10b56b91c..c3346bf09 100644 --- a/comfy_extras/nodes_custom_sampler.py +++ b/comfy_extras/nodes_custom_sampler.py @@ -17,7 +17,7 @@ class BasicScheduler(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="BasicScheduler", - category="sampling/schedulers", + category="model/sampling/schedulers", inputs=[ io.Model.Input("model"), io.Combo.Input("scheduler", options=comfy.samplers.SCHEDULER_NAMES), @@ -47,7 +47,7 @@ class KarrasScheduler(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="KarrasScheduler", - category="sampling/schedulers", + category="model/sampling/schedulers", inputs=[ io.Int.Input("steps", default=20, min=1, max=10000), io.Float.Input("sigma_max", default=14.614642, min=0.0, max=5000.0, step=0.01, round=False, advanced=True), @@ -69,7 +69,7 @@ class ExponentialScheduler(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="ExponentialScheduler", - category="sampling/schedulers", + category="model/sampling/schedulers", inputs=[ io.Int.Input("steps", default=20, min=1, max=10000), io.Float.Input("sigma_max", default=14.614642, min=0.0, max=5000.0, step=0.01, round=False, advanced=True), @@ 
-90,7 +90,7 @@ class PolyexponentialScheduler(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="PolyexponentialScheduler", - category="sampling/schedulers", + category="model/sampling/schedulers", inputs=[ io.Int.Input("steps", default=20, min=1, max=10000), io.Float.Input("sigma_max", default=14.614642, min=0.0, max=5000.0, step=0.01, round=False, advanced=True), @@ -112,7 +112,7 @@ class LaplaceScheduler(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LaplaceScheduler", - category="sampling/schedulers", + category="model/sampling/schedulers", inputs=[ io.Int.Input("steps", default=20, min=1, max=10000), io.Float.Input("sigma_max", default=14.614642, min=0.0, max=5000.0, step=0.01, round=False, advanced=True), @@ -136,7 +136,7 @@ class SDTurboScheduler(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SDTurboScheduler", - category="sampling/schedulers", + category="model/sampling/schedulers", inputs=[ io.Model.Input("model"), io.Int.Input("steps", default=1, min=1, max=10), @@ -160,7 +160,7 @@ class BetaSamplingScheduler(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="BetaSamplingScheduler", - category="sampling/schedulers", + category="model/sampling/schedulers", inputs=[ io.Model.Input("model"), io.Int.Input("steps", default=20, min=1, max=10000), @@ -182,7 +182,7 @@ class VPScheduler(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="VPScheduler", - category="sampling/schedulers", + category="model/sampling/schedulers", inputs=[ io.Int.Input("steps", default=20, min=1, max=10000), io.Float.Input("beta_d", default=19.9, min=0.0, max=5000.0, step=0.01, round=False, advanced=True), #TODO: fix default values @@ -204,7 +204,7 @@ class SplitSigmas(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SplitSigmas", - category="sampling/sigmas", + category="model/sampling/sigmas", inputs=[ io.Sigmas.Input("sigmas"), io.Int.Input("step", default=0, min=0, max=10000), @@ -228,7 
+228,7 @@ class SplitSigmasDenoise(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SplitSigmasDenoise", - category="sampling/sigmas", + category="model/sampling/sigmas", inputs=[ io.Sigmas.Input("sigmas"), io.Float.Input("denoise", default=1.0, min=0.0, max=1.0, step=0.01), @@ -254,7 +254,7 @@ class FlipSigmas(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="FlipSigmas", - category="sampling/sigmas", + category="model/sampling/sigmas", inputs=[io.Sigmas.Input("sigmas")], outputs=[io.Sigmas.Output()] ) @@ -276,7 +276,7 @@ class SetFirstSigma(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SetFirstSigma", - category="sampling/sigmas", + category="model/sampling/sigmas", inputs=[ io.Sigmas.Input("sigmas"), io.Float.Input("sigma", default=136.0, min=0.0, max=20000.0, step=0.001, round=False), @@ -298,7 +298,7 @@ class ExtendIntermediateSigmas(io.ComfyNode): return io.Schema( node_id="ExtendIntermediateSigmas", search_aliases=["interpolate sigmas"], - category="sampling/sigmas", + category="model/sampling/sigmas", inputs=[ io.Sigmas.Input("sigmas"), io.Int.Input("steps", default=2, min=1, max=100), @@ -351,7 +351,7 @@ class SamplingPercentToSigma(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SamplingPercentToSigma", - category="sampling/sigmas", + category="model/sampling/sigmas", inputs=[ io.Model.Input("model"), io.Float.Input("sampling_percent", default=0.0, min=0.0, max=1.0, step=0.0001), @@ -379,7 +379,7 @@ class KSamplerSelect(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="KSamplerSelect", - category="sampling/samplers", + category="model/sampling/samplers", inputs=[io.Combo.Input("sampler_name", options=comfy.samplers.SAMPLER_NAMES)], outputs=[io.Sampler.Output()] ) @@ -396,7 +396,7 @@ class SamplerDPMPP_3M_SDE(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SamplerDPMPP_3M_SDE", - category="sampling/samplers", + category="model/sampling/samplers", 
inputs=[ io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False, advanced=True), io.Float.Input("s_noise", default=1.0, min=0.0, max=100.0, step=0.01, round=False, advanced=True), @@ -421,7 +421,7 @@ class SamplerDPMPP_2M_SDE(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SamplerDPMPP_2M_SDE", - category="sampling/samplers", + category="model/sampling/samplers", inputs=[ io.Combo.Input("solver_type", options=['midpoint', 'heun']), io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False, advanced=True), @@ -448,7 +448,7 @@ class SamplerDPMPP_SDE(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SamplerDPMPP_SDE", - category="sampling/samplers", + category="model/sampling/samplers", inputs=[ io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False, advanced=True), io.Float.Input("s_noise", default=1.0, min=0.0, max=100.0, step=0.01, round=False, advanced=True), @@ -474,7 +474,7 @@ class SamplerDPMPP_2S_Ancestral(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SamplerDPMPP_2S_Ancestral", - category="sampling/samplers", + category="model/sampling/samplers", inputs=[ io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False), io.Float.Input("s_noise", default=1.0, min=0.0, max=100.0, step=0.01, round=False), @@ -494,7 +494,7 @@ class SamplerEulerAncestral(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SamplerEulerAncestral", - category="sampling/samplers", + category="model/sampling/samplers", inputs=[ io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False, advanced=True), io.Float.Input("s_noise", default=1.0, min=0.0, max=100.0, step=0.01, round=False, advanced=True), @@ -515,7 +515,7 @@ class SamplerEulerAncestralCFGPP(io.ComfyNode): return io.Schema( node_id="SamplerEulerAncestralCFGPP", display_name="SamplerEulerAncestralCFG++", - category="sampling/samplers", + 
category="model/sampling/samplers", inputs=[ io.Float.Input("eta", default=1.0, min=0.0, max=1.0, step=0.01, round=False), io.Float.Input("s_noise", default=1.0, min=0.0, max=10.0, step=0.01, round=False), @@ -537,7 +537,7 @@ class SamplerLMS(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SamplerLMS", - category="sampling/samplers", + category="model/sampling/samplers", inputs=[io.Int.Input("order", default=4, min=1, max=100, advanced=True)], outputs=[io.Sampler.Output()] ) @@ -554,7 +554,7 @@ class SamplerDPMAdaptative(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SamplerDPMAdaptative", - category="sampling/samplers", + category="model/sampling/samplers", inputs=[ io.Int.Input("order", default=3, min=2, max=3, advanced=True), io.Float.Input("rtol", default=0.05, min=0.0, max=100.0, step=0.01, round=False, advanced=True), @@ -585,7 +585,7 @@ class SamplerER_SDE(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SamplerER_SDE", - category="sampling/samplers", + category="model/sampling/samplers", inputs=[ io.Combo.Input("solver_type", options=["ER-SDE", "Reverse-time SDE", "ODE"]), io.Int.Input("max_stage", default=3, min=1, max=3, advanced=True), @@ -623,7 +623,7 @@ class SamplerSASolver(io.ComfyNode): return io.Schema( node_id="SamplerSASolver", search_aliases=["sde"], - category="sampling/samplers", + category="model/sampling/samplers", inputs=[ io.Model.Input("model"), io.Float.Input("eta", default=1.0, min=0.0, max=10.0, step=0.01, round=False, advanced=True), @@ -668,7 +668,7 @@ class SamplerSEEDS2(io.ComfyNode): return io.Schema( node_id="SamplerSEEDS2", search_aliases=["sde", "exp heun"], - category="sampling/samplers", + category="model/sampling/samplers", inputs=[ io.Combo.Input("solver_type", options=["phi_1", "phi_2"]), io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False, tooltip="Stochastic strength", advanced=True), @@ -727,7 +727,7 @@ class SamplerCustom(io.ComfyNode): def 
define_schema(cls): return io.Schema( node_id="SamplerCustom", - category="sampling/custom_sampling", + category="model/sampling/custom_sampling", inputs=[ io.Model.Input("model"), io.Boolean.Input("add_noise", default=True, advanced=True), @@ -795,7 +795,7 @@ class BasicGuider(io.ComfyNode): return io.Schema( node_id="BasicGuider", display_name="Basic Guider", - category="sampling/guiders", + category="model/sampling/guiders", inputs=[ io.Model.Input("model"), io.Conditioning.Input("conditioning"), @@ -817,7 +817,7 @@ class CFGGuider(io.ComfyNode): return io.Schema( node_id="CFGGuider", display_name="CFG Guider", - category="sampling/guiders", + category="model/sampling/guiders", inputs=[ io.Model.Input("model"), io.Conditioning.Input("positive"), @@ -872,7 +872,7 @@ class DualCFGGuider(io.ComfyNode): node_id="DualCFGGuider", search_aliases=["dual prompt guidance"], display_name="Dual CFG Guider", - category="sampling/guiders", + category="model/sampling/guiders", inputs=[ io.Model.Input("model"), io.Conditioning.Input("cond1"), @@ -900,7 +900,7 @@ class DisableNoise(io.ComfyNode): return io.Schema( node_id="DisableNoise", search_aliases=["zero noise"], - category="sampling/noise", + category="model/sampling/noise", inputs=[], outputs=[io.Noise.Output()] ) @@ -917,7 +917,7 @@ class RandomNoise(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="RandomNoise", - category="sampling/noise", + category="model/sampling/noise", inputs=[io.Int.Input("noise_seed", default=0, min=0, max=0xffffffffffffffff, control_after_generate=True)], outputs=[io.Noise.Output()] ) @@ -934,7 +934,7 @@ class SamplerCustomAdvanced(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SamplerCustomAdvanced", - category="sampling/custom_sampling", + category="model/sampling/custom_sampling", inputs=[ io.Noise.Input("noise"), io.Guider.Input("guider"), diff --git a/comfy_extras/nodes_dataset.py b/comfy_extras/nodes_dataset.py index 98ed25d7e..104d16d91 100644 --- 
a/comfy_extras/nodes_dataset.py +++ b/comfy_extras/nodes_dataset.py @@ -47,8 +47,10 @@ class LoadImageDataSetFromFolderNode(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LoadImageDataSetFromFolder", - display_name="Load Image Dataset from Folder", - category="dataset", + search_aliases=["load folder", "load from folder", "load dataset", "load images", "import dataset"], + display_name="Load Image (from Folder)", + category="image", + description="Load a dataset of images from a specified folder and return a list of images. Supported formats: PNG, JPG, JPEG, WEBP.", is_experimental=True, inputs=[ io.Combo.Input( @@ -84,14 +86,16 @@ class LoadImageTextDataSetFromFolderNode(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LoadImageTextDataSetFromFolder", - display_name="Load Image and Text Dataset from Folder", - category="dataset", + search_aliases=["load folder", "load from folder", "load dataset", "load images", "import dataset"], + display_name="Load Image-Text (from Folder)", + category="image", + description="Load a dataset of pairs of images and text captions from a specified folder and return them as a list. Supported formats: PNG, JPG, JPEG, WEBP.", is_experimental=True, inputs=[ io.Combo.Input( "folder", options=folder_paths.get_input_subfolders(), - tooltip="The folder to load images from.", + tooltip="The folder to load images and text captions from.", ) ], outputs=[ @@ -153,7 +157,7 @@ class LoadImageTextDataSetFromFolderNode(io.ComfyNode): return io.NodeOutput(output_tensor, captions) -def save_images_to_folder(image_list, output_dir, prefix="image"): +def save_images_to_folder(image_list, output_dir, prefix="image", overwrite=True): """Utility function to save a list of image tensors to disk. 
Args: @@ -193,7 +197,11 @@ def save_images_to_folder(image_list, output_dir, prefix="image"): raise ValueError(f"Expected torch.Tensor, got {type(img_tensor)}") # Save image - filename = f"{prefix}_{idx:05d}.png" + if overwrite: + filename = f"{prefix}_{idx:05d}.png" + else: + _, _, counter, _, resolved_prefix = folder_paths.get_save_image_path(prefix, output_dir) + filename = f"{resolved_prefix}_{counter:05}_{idx:05d}.png" filepath = os.path.join(output_dir, filename) img.save(filepath) saved_files.append(filename) @@ -206,8 +214,10 @@ class SaveImageDataSetToFolderNode(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SaveImageDataSetToFolder", - display_name="Save Image Dataset to Folder", - category="dataset", + search_aliases=["save folder", "save to folder", "save dataset", "save images", "export dataset"], + display_name="Save Image (to Folder) (DEPRECATED)", + category="image", + description="Save a dataset of images to a specified folder. Supported formats: PNG.", is_experimental=True, is_output_node=True, is_input_list=True, # Receive images as list @@ -224,18 +234,26 @@ class SaveImageDataSetToFolderNode(io.ComfyNode): tooltip="Prefix for saved image filenames.", advanced=True, ), + io.Combo.Input( + "mode", + default="overwrite", + options=["overwrite", "increment"], + tooltip="Whether to overwrite existing files or increment filenames to avoid overwriting." 
+ ), ], outputs=[], + is_deprecated=True, # This node is redundant and superseded by existing Save Image nodes where the target folder can be specified in the filename_prefix ) @classmethod - def execute(cls, images, folder_name, filename_prefix): + def execute(cls, images, folder_name, filename_prefix, mode): # Extract scalar values folder_name = folder_name[0] filename_prefix = filename_prefix[0] + mode = mode[0] output_dir = os.path.join(folder_paths.get_output_directory(), folder_name) - saved_files = save_images_to_folder(images, output_dir, filename_prefix) + saved_files = save_images_to_folder(images, output_dir, filename_prefix, mode=='overwrite') logging.info(f"Saved {len(saved_files)} images to {output_dir}.") return io.NodeOutput() @@ -246,14 +264,20 @@ class SaveImageTextDataSetToFolderNode(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SaveImageTextDataSetToFolder", - display_name="Save Image and Text Dataset to Folder", - category="dataset", + search_aliases=["save folder", "save to folder", "save dataset", "save images", "save text", "export dataset"], + display_name="Save Image-Text (to Folder)", + category="image", + description="Save a dataset of pairs of images and text captions to a specified folder. Images are saved as PNG files and captions are saved as TXT files with the same filename_prefix.", is_experimental=True, is_output_node=True, is_input_list=True, # Receive both images and texts as lists inputs=[ io.Image.Input("images", tooltip="List of images to save."), - io.String.Input("texts", tooltip="List of text captions to save."), + io.String.Input("texts", + optional=True, + force_input=True, + tooltip="List of text captions to save." 
+ ), io.String.Input( "folder_name", default="dataset", @@ -265,25 +289,33 @@ class SaveImageTextDataSetToFolderNode(io.ComfyNode): tooltip="Prefix for saved image filenames.", advanced=True, ), + io.Combo.Input( + "mode", + default="overwrite", + options=["overwrite", "increment"], + tooltip="Whether to overwrite existing files or increment filenames to avoid overwriting." + ), ], outputs=[], ) @classmethod - def execute(cls, images, texts, folder_name, filename_prefix): + def execute(cls, images, folder_name, filename_prefix, mode, texts=None): # Extract scalar values folder_name = folder_name[0] filename_prefix = filename_prefix[0] + mode = mode[0] output_dir = os.path.join(folder_paths.get_output_directory(), folder_name) - saved_files = save_images_to_folder(images, output_dir, filename_prefix) + saved_files = save_images_to_folder(images, output_dir, filename_prefix, mode=='overwrite') # Save captions - for idx, (filename, caption) in enumerate(zip(saved_files, texts)): - caption_filename = filename.replace(".png", ".txt") - caption_path = os.path.join(output_dir, caption_filename) - with open(caption_path, "w", encoding="utf-8") as f: - f.write(caption) + if texts: + for idx, (filename, caption) in enumerate(zip(saved_files, texts)): + caption_filename = filename.replace(".png", ".txt") + caption_path = os.path.join(output_dir, caption_filename) + with open(caption_path, "w", encoding="utf-8") as f: + f.write(caption) logging.info(f"Saved {len(saved_files)} images and captions to {output_dir}.") return io.NodeOutput() @@ -314,11 +346,13 @@ class ImageProcessingNode(io.ComfyNode): Child classes should set: node_id: Unique node identifier (required) + search_aliases: List of search aliases (optional) display_name: Display name (optional, defaults to node_id) description: Node description (optional) extra_inputs: List of additional io.Input objects beyond "images" (optional) is_group_process: None (auto-detect), True (group), or False (individual) (optional) 
is_output_list: True (list output) or False (single output) (optional, default True) + is_deprecated: True if the node is deprecated (optional, default False) Child classes must implement ONE of: _process(cls, image, **kwargs) -> tensor (for single-item processing) @@ -326,12 +360,13 @@ class ImageProcessingNode(io.ComfyNode): """ node_id = None + search_aliases = [] display_name = None description = None extra_inputs = [] is_group_process = None # None = auto-detect, True/False = explicit is_output_list = None # None = auto-detect based on processing mode - + is_deprecated = False @classmethod def _detect_processing_mode(cls): """Detect whether this node uses group or individual processing. @@ -402,8 +437,10 @@ class ImageProcessingNode(io.ComfyNode): return io.Schema( node_id=cls.node_id, + search_aliases=cls.search_aliases, display_name=cls.display_name or cls.node_id, - category="dataset/image", + category=cls.category, + description=cls.description, is_experimental=True, is_input_list=is_group, # True for group, False for individual inputs=inputs, @@ -472,11 +509,13 @@ class TextProcessingNode(io.ComfyNode): Child classes should set: node_id: Unique node identifier (required) + search_aliases: List of search aliases (optional) display_name: Display name (optional, defaults to node_id) description: Node description (optional) extra_inputs: List of additional io.Input objects beyond "texts" (optional) is_group_process: None (auto-detect), True (group), or False (individual) (optional) is_output_list: True (list output) or False (single output) (optional, default True) + is_deprecated: True if the node is deprecated (optional, default False) Child classes must implement ONE of: _process(cls, text, **kwargs) -> str (for single-item processing) @@ -484,12 +523,13 @@ class TextProcessingNode(io.ComfyNode): """ node_id = None + search_aliases = [] display_name = None description = None extra_inputs = [] is_group_process = None # None = auto-detect, True/False = 
explicit is_output_list = None # None = auto-detect based on processing mode - + is_deprecated = False @classmethod def _detect_processing_mode(cls): """Detect whether this node uses group or individual processing. @@ -552,7 +592,7 @@ class TextProcessingNode(io.ComfyNode): return io.Schema( node_id=cls.node_id, display_name=cls.display_name or cls.node_id, - category="dataset/text", + category="text", is_experimental=True, is_input_list=is_group, # True for group, False for individual inputs=inputs, @@ -627,15 +667,17 @@ class TextProcessingNode(io.ComfyNode): class ResizeImagesByShorterEdgeNode(ImageProcessingNode): node_id = "ResizeImagesByShorterEdge" - display_name = "Resize Images by Shorter Edge" - description = "Resize images so that the shorter edge matches the specified length while preserving aspect ratio." + display_name = "Resize Images by Shorter Edge (DEPRECATED)" + category = "image/transform" + description = "Resize images so that the shorter edge matches the specified dimension while preserving aspect ratio." + is_deprecated = True # This node is superseded by Resize Image/Mask with resize_type = scale shorter dimension extra_inputs = [ io.Int.Input( "shorter_edge", default=512, min=1, max=8192, - tooltip="Target length for the shorter edge.", + tooltip="Target dimension for the shorter edge.", ), ] @@ -655,15 +697,17 @@ class ResizeImagesByShorterEdgeNode(ImageProcessingNode): class ResizeImagesByLongerEdgeNode(ImageProcessingNode): node_id = "ResizeImagesByLongerEdge" - display_name = "Resize Images by Longer Edge" - description = "Resize images so that the longer edge matches the specified length while preserving aspect ratio." + display_name = "Resize Images by Longer Edge (DEPRECATED)" + category = "image/transform" + description = "Resize images so that the longer edge matches the specified dimension while preserving aspect ratio." 
+ is_deprecated = True # This node is superseded by Resize Image/Mask with resize_type = scale longer dimension extra_inputs = [ io.Int.Input( "longer_edge", default=1024, min=1, max=8192, - tooltip="Target length for the longer edge.", + tooltip="Target dimension for the longer edge.", ), ] @@ -686,8 +730,10 @@ class ResizeImagesByLongerEdgeNode(ImageProcessingNode): class CenterCropImagesNode(ImageProcessingNode): node_id = "CenterCropImages" - display_name = "Center Crop Images" - description = "Center crop all images to the specified dimensions." + search_aliases=["crop", "cut", "trim"] + display_name="Crop Image (Center)" + category="image/transform" + description = "Center crop an image to the specified dimensions." extra_inputs = [ io.Int.Input("width", default=512, min=1, max=8192, tooltip="Crop width."), io.Int.Input("height", default=512, min=1, max=8192, tooltip="Crop height."), @@ -706,10 +752,11 @@ class CenterCropImagesNode(ImageProcessingNode): class RandomCropImagesNode(ImageProcessingNode): node_id = "RandomCropImages" - display_name = "Random Crop Images" - description = ( - "Randomly crop all images to the specified dimensions (for data augmentation)." - ) + search_aliases=["crop", "cut", "trim"] + display_name = "Crop Image (Random)" + category="image/transform" + description = "Randomly crop an image to the specified dimensions." + extra_inputs = [ io.Int.Input("width", default=512, min=1, max=8192, tooltip="Crop width."), io.Int.Input("height", default=512, min=1, max=8192, tooltip="Crop height."), @@ -734,7 +781,9 @@ class RandomCropImagesNode(ImageProcessingNode): class NormalizeImagesNode(ImageProcessingNode): node_id = "NormalizeImages" - display_name = "Normalize Images" + search_aliases=["normalize", "normalize colors"] + display_name = "Normalize Image Colors" + category = "image/color" description = "Normalize images using mean and standard deviation." 
extra_inputs = [ io.Float.Input( @@ -762,8 +811,10 @@ class NormalizeImagesNode(ImageProcessingNode): class AdjustBrightnessNode(ImageProcessingNode): node_id = "AdjustBrightness" + search_aliases=["brightness"] display_name = "Adjust Brightness" - description = "Adjust brightness of all images." + category="image/adjustments" + description = "Adjust the brightness of an image." extra_inputs = [ io.Float.Input( "factor", @@ -781,8 +832,10 @@ class AdjustBrightnessNode(ImageProcessingNode): class AdjustContrastNode(ImageProcessingNode): node_id = "AdjustContrast" + search_aliases=["contrast"] display_name = "Adjust Contrast" - description = "Adjust contrast of all images." + category="image/adjustments" + description = "Adjust the contrast of an image." extra_inputs = [ io.Float.Input( "factor", @@ -800,8 +853,10 @@ class AdjustContrastNode(ImageProcessingNode): class ShuffleDatasetNode(ImageProcessingNode): node_id = "ShuffleDataset" - display_name = "Shuffle Image Dataset" - description = "Randomly shuffle the order of images in the dataset." + search_aliases=["shuffle", "randomize", "mix"] + display_name = "Shuffle Images List" + category = "image/batch" + description = "Randomly shuffle the order of images in a list." 
is_group_process = True # Requires full list to shuffle extra_inputs = [ io.Int.Input( @@ -823,13 +878,15 @@ class ShuffleImageTextDatasetNode(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="ShuffleImageTextDataset", - display_name="Shuffle Image-Text Dataset", - category="dataset/image", + search_aliases=["shuffle", "randomize", "mix"], + display_name = "Shuffle Pairs of Image-Text", + category = "image/batch", + description = "Randomly shuffle the order of pairs of image-text in a list.", is_experimental=True, is_input_list=True, inputs=[ io.Image.Input("images", tooltip="List of images to shuffle."), - io.String.Input("texts", tooltip="List of texts to shuffle."), + io.String.Input("texts", tooltip="List of texts to shuffle.", force_input=True), io.Int.Input( "seed", default=0, @@ -865,8 +922,11 @@ class ShuffleImageTextDatasetNode(io.ComfyNode): class TextToLowercaseNode(TextProcessingNode): node_id = "TextToLowercase" - display_name = "Text to Lowercase" - description = "Convert all texts to lowercase." + search_aliases=["lowercase"] + display_name = "Convert Text to Lowercase (DEPRECATED)" + category = "text" + description = "Convert text to lowercase." + is_deprecated = True # This node is superseded by the Convert Text Case node @classmethod def _process(cls, text): @@ -875,8 +935,11 @@ class TextToLowercaseNode(TextProcessingNode): class TextToUppercaseNode(TextProcessingNode): node_id = "TextToUppercase" - display_name = "Text to Uppercase" - description = "Convert all texts to uppercase." + search_aliases=["uppercase"] + display_name = "Convert Text to Uppercase (DEPRECATED)" + category = "text" + description = "Convert text to uppercase." 
+ is_deprecated = True # This node is superseded by the Convert Text Case node @classmethod def _process(cls, text): @@ -885,8 +948,10 @@ class TextToUppercaseNode(TextProcessingNode): class TruncateTextNode(TextProcessingNode): node_id = "TruncateText" + search_aliases=["truncate", "cut", "shorten"] display_name = "Truncate Text" - description = "Truncate all texts to a maximum length." + category = "text" + description = "Truncate text to a maximum length." extra_inputs = [ io.Int.Input( "max_length", default=77, min=1, max=10000, tooltip="Maximum text length." @@ -900,8 +965,10 @@ class TruncateTextNode(TextProcessingNode): class AddTextPrefixNode(TextProcessingNode): node_id = "AddTextPrefix" - display_name = "Add Text Prefix" + display_name = "Add Text Prefix (DEPRECATED)" + category = "text" description = "Add a prefix to all texts." + is_deprecated = True # This node is superseded by the Concatenate Text node extra_inputs = [ io.String.Input("prefix", default="", tooltip="Prefix to add."), ] @@ -913,8 +980,10 @@ class AddTextPrefixNode(TextProcessingNode): class AddTextSuffixNode(TextProcessingNode): node_id = "AddTextSuffix" - display_name = "Add Text Suffix" + display_name = "Add Text Suffix (DEPRECATED)" + category = "text" description = "Add a suffix to all texts." + is_deprecated = True # This node is superseded by the Concatenate Text node extra_inputs = [ io.String.Input("suffix", default="", tooltip="Suffix to add."), ] @@ -926,8 +995,10 @@ class AddTextSuffixNode(TextProcessingNode): class ReplaceTextNode(TextProcessingNode): node_id = "ReplaceText" - display_name = "Replace Text" + display_name = "Replace Text (DEPRECATED)" + category = "text" description = "Replace text in all texts." 
+ is_deprecated = True # This node is superseded by the other Replace Text node extra_inputs = [ io.String.Input("find", default="", tooltip="Text to find."), io.String.Input("replace", default="", tooltip="Text to replace with."), @@ -940,8 +1011,10 @@ class ReplaceTextNode(TextProcessingNode): class StripWhitespaceNode(TextProcessingNode): node_id = "StripWhitespace" - display_name = "Strip Whitespace" + display_name = "Strip Whitespace (DEPRECATED)" + category = "text" description = "Strip leading and trailing whitespace from all texts." + is_deprecated = True # This node is superseded by the Trim Text node @classmethod def _process(cls, text): @@ -952,11 +1025,13 @@ class StripWhitespaceNode(TextProcessingNode): class ImageDeduplicationNode(ImageProcessingNode): - """Remove duplicate or very similar images from the dataset using perceptual hashing.""" + """Remove duplicate or very similar images from a list using perceptual hashing.""" node_id = "ImageDeduplication" - display_name = "Image Deduplication" - description = "Remove duplicate or very similar images from the dataset." + search_aliases=["deduplicate", "remove duplicates", "similarity filter"] + display_name = "Deduplicate Images" + category = "image/batch" + description = "Remove duplicate or very similar images from a list." is_group_process = True # Requires full list to compare images extra_inputs = [ io.Float.Input( @@ -1026,7 +1101,9 @@ class ImageGridNode(ImageProcessingNode): """Combine multiple images into a single grid/collage.""" node_id = "ImageGrid" - display_name = "Image Grid" + search_aliases=["grid", "collage", "combine"] + display_name = "Make Image Grid" + category="image/batch" description = "Arrange multiple images into a grid layout." 
is_group_process = True # Requires full list to create grid is_output_list = False # Outputs single grid image @@ -1102,9 +1179,12 @@ class MergeImageListsNode(ImageProcessingNode): """Merge multiple image lists into a single list.""" node_id = "MergeImageLists" - display_name = "Merge Image Lists" + search_aliases=["list", "merge list", "make list"] + display_name = "Merge Image Lists (DEPRECATED)" + category = "image/batch" description = "Concatenate multiple image lists into one." is_group_process = True # Receives images as list + is_deprecated = True # This node is superseded by the Create List node @classmethod def _group_process(cls, images): @@ -1119,9 +1199,11 @@ class MergeTextListsNode(TextProcessingNode): """Merge multiple text lists into a single list.""" node_id = "MergeTextLists" - display_name = "Merge Text Lists" + display_name = "Merge Text Lists (DEPRECATED)" + category = "text" description = "Concatenate multiple text lists into one." is_group_process = True # Receives texts as list + is_deprecated = True # This node is superseded by the Create List node @classmethod def _group_process(cls, texts): @@ -1142,8 +1224,10 @@ class ResolutionBucket(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="ResolutionBucket", + search_aliases=["bucket by resolution", "group by resolution", "batch by resolution"], display_name="Resolution Bucket", - category="dataset", + category="model/training", + description="Group latents and conditionings into buckets", is_experimental=True, is_input_list=True, inputs=[ @@ -1236,7 +1320,8 @@ class MakeTrainingDataset(io.ComfyNode): node_id="MakeTrainingDataset", search_aliases=["encode dataset"], display_name="Make Training Dataset", - category="dataset", + category="model/training", + description="Encode images with VAE and texts with CLIP to create a training dataset of latents and conditionings.", is_experimental=True, is_input_list=True, # images and texts as lists inputs=[ @@ -1251,6 +1336,7 @@ class 
MakeTrainingDataset(io.ComfyNode): "texts", optional=True, tooltip="List of text captions. Can be length n (matching images), 1 (repeated for all), or omitted (uses empty string).", + force_input=True ), ], outputs=[ @@ -1320,9 +1406,10 @@ class SaveTrainingDataset(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SaveTrainingDataset", - search_aliases=["export training data"], + search_aliases=["export dataset", "save dataset"], display_name="Save Training Dataset", - category="dataset", + category="model/training", + description="Save encoded training dataset (latents + conditioning) to disk for efficient loading during training.", is_experimental=True, is_output_node=True, is_input_list=True, # Receive lists @@ -1424,7 +1511,8 @@ class LoadTrainingDataset(io.ComfyNode): node_id="LoadTrainingDataset", search_aliases=["import dataset", "training data"], display_name="Load Training Dataset", - category="dataset", + category="model/training", + description="Load encoded training dataset (latents + conditioning) from disk for use in training.", is_experimental=True, inputs=[ io.String.Input( diff --git a/comfy_extras/nodes_eps.py b/comfy_extras/nodes_eps.py index 0fb3871c8..8c397f132 100644 --- a/comfy_extras/nodes_eps.py +++ b/comfy_extras/nodes_eps.py @@ -18,7 +18,7 @@ class EpsilonScaling(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="Epsilon Scaling", - category="model_patches/unet", + category="model/patch/unet", inputs=[ io.Model.Input("model"), io.Float.Input( @@ -84,7 +84,7 @@ class TemporalScoreRescaling(io.ComfyNode): return io.Schema( node_id="TemporalScoreRescaling", display_name="TSR - Temporal Score Rescaling", - category="model_patches/unet", + category="model/patch/unet", inputs=[ io.Model.Input("model"), io.Float.Input( diff --git a/comfy_extras/nodes_flux.py b/comfy_extras/nodes_flux.py index 997f21c09..afc663b22 100644 --- a/comfy_extras/nodes_flux.py +++ b/comfy_extras/nodes_flux.py @@ -40,7 +40,7 @@ class 
EmptyFlux2LatentImage(io.ComfyNode): return io.Schema( node_id="EmptyFlux2LatentImage", display_name="Empty Flux 2 Latent", - category="latent", + category="model/latent", inputs=[ io.Int.Input("width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16), io.Int.Input("height", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16), @@ -215,7 +215,7 @@ class Flux2Scheduler(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="Flux2Scheduler", - category="sampling/schedulers", + category="model/sampling/schedulers", inputs=[ io.Int.Input("steps", default=20, min=1, max=4096), io.Int.Input("width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=1), diff --git a/comfy_extras/nodes_frame_interpolation.py b/comfy_extras/nodes_frame_interpolation.py index 9dd34cfb8..4d5bca17e 100644 --- a/comfy_extras/nodes_frame_interpolation.py +++ b/comfy_extras/nodes_frame_interpolation.py @@ -19,7 +19,7 @@ class FrameInterpolationModelLoader(io.ComfyNode): return io.Schema( node_id="FrameInterpolationModelLoader", display_name="Load Frame Interpolation Model", - category="loaders", + category="model/loaders", inputs=[ io.Combo.Input("model_name", options=folder_paths.get_filename_list("frame_interpolation"), tooltip="Select a frame interpolation model to load. 
Models must be placed in the 'frame_interpolation' folder."), diff --git a/comfy_extras/nodes_freelunch.py b/comfy_extras/nodes_freelunch.py index 248efdef3..ccbd1fd90 100644 --- a/comfy_extras/nodes_freelunch.py +++ b/comfy_extras/nodes_freelunch.py @@ -29,7 +29,7 @@ class FreeU(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="FreeU", - category="model_patches/unet", + category="model/patch/unet", inputs=[ IO.Model.Input("model"), IO.Float.Input("b1", default=1.1, min=0.0, max=10.0, step=0.01, advanced=True), @@ -76,7 +76,7 @@ class FreeU_V2(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="FreeU_V2", - category="model_patches/unet", + category="model/patch/unet", inputs=[ IO.Model.Input("model"), IO.Float.Input("b1", default=1.3, min=0.0, max=10.0, step=0.01, advanced=True), diff --git a/comfy_extras/nodes_gits.py b/comfy_extras/nodes_gits.py index 0b7666524..434a24387 100644 --- a/comfy_extras/nodes_gits.py +++ b/comfy_extras/nodes_gits.py @@ -340,7 +340,7 @@ class GITSScheduler(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="GITSScheduler", - category="sampling/schedulers", + category="model/sampling/schedulers", inputs=[ io.Float.Input("coeff", default=1.20, min=0.80, max=1.50, step=0.05, advanced=True), io.Int.Input("steps", default=10, min=2, max=1000), diff --git a/comfy_extras/nodes_hidream_o1.py b/comfy_extras/nodes_hidream_o1.py index f393745f6..8648d2e26 100644 --- a/comfy_extras/nodes_hidream_o1.py +++ b/comfy_extras/nodes_hidream_o1.py @@ -14,7 +14,7 @@ class EmptyHiDreamO1LatentImage(io.ComfyNode): return io.Schema( node_id="EmptyHiDreamO1LatentImage", display_name="Empty HiDream-O1 Latent Image", - category="latent/image", + category="model/latent/image", description=( "Empty pixel-space latent for HiDream-O1-Image. 
The model was " "trained at ~4 megapixels; lower resolutions go off-distribution " @@ -47,7 +47,7 @@ class HiDreamO1ReferenceImages(io.ComfyNode): return io.Schema( node_id="HiDreamO1ReferenceImages", display_name="HiDream-O1 Reference Images", - category="conditioning/image", + category="model/conditioning/image", description=( "Attach 1-10 reference images to conditioning, one for edit instruction" "or multiple for subject-driven personalization." diff --git a/comfy_extras/nodes_hunyuan.py b/comfy_extras/nodes_hunyuan.py index 9e4873be5..16fff12af 100644 --- a/comfy_extras/nodes_hunyuan.py +++ b/comfy_extras/nodes_hunyuan.py @@ -41,7 +41,7 @@ class EmptyHunyuanLatentVideo(io.ComfyNode): return io.Schema( node_id="EmptyHunyuanLatentVideo", display_name="Empty HunyuanVideo 1.0 Latent", - category="latent/video", + category="model/latent/video", inputs=[ io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16), io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), @@ -81,7 +81,7 @@ class HunyuanVideo15ImageToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="HunyuanVideo15ImageToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -132,7 +132,7 @@ class HunyuanVideo15SuperResolution(io.ComfyNode): return io.Schema( node_id="HunyuanVideo15SuperResolution", display_name="Hunyuan Video 1.5 Super Resolution", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -178,7 +178,7 @@ class LatentUpscaleModelLoader(io.ComfyNode): return io.Schema( node_id="LatentUpscaleModelLoader", display_name="Load Latent Upscale Model", - category="loaders", + category="model/loaders", inputs=[ io.Combo.Input("model_name", options=folder_paths.get_filename_list("latent_upscale_models")), 
], @@ -227,7 +227,7 @@ class HunyuanVideo15LatentUpscaleWithModel(io.ComfyNode): return io.Schema( node_id="HunyuanVideo15LatentUpscaleWithModel", display_name="Hunyuan Video 15 Latent Upscale With Model", - category="latent", + category="model/latent", inputs=[ io.LatentUpscaleModel.Input("model"), io.Latent.Input("samples"), @@ -308,7 +308,7 @@ class HunyuanImageToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="HunyuanImageToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Vae.Input("vae"), @@ -359,7 +359,7 @@ class EmptyHunyuanImageLatent(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="EmptyHunyuanImageLatent", - category="latent", + category="model/latent", inputs=[ io.Int.Input("width", default=2048, min=64, max=nodes.MAX_RESOLUTION, step=32), io.Int.Input("height", default=2048, min=64, max=nodes.MAX_RESOLUTION, step=32), @@ -384,7 +384,7 @@ class HunyuanRefinerLatent(io.ComfyNode): return io.Schema( node_id="HunyuanRefinerLatent", display_name="Hunyuan Latent Refiner", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), diff --git a/comfy_extras/nodes_hunyuan3d.py b/comfy_extras/nodes_hunyuan3d.py index 403eb855b..60e530626 100644 --- a/comfy_extras/nodes_hunyuan3d.py +++ b/comfy_extras/nodes_hunyuan3d.py @@ -12,7 +12,7 @@ class EmptyLatentHunyuan3Dv2(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="EmptyLatentHunyuan3Dv2", - category="latent/3d", + category="model/latent/3d", inputs=[ IO.Int.Input("resolution", default=3072, min=1, max=8192), IO.Int.Input("batch_size", default=1, min=1, max=4096, tooltip="The number of latent images in the batch."), @@ -35,7 +35,7 @@ class Hunyuan3Dv2Conditioning(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="Hunyuan3Dv2Conditioning", - 
category="conditioning/3d_models", + category="model/conditioning/3d_models", inputs=[ IO.ClipVisionOutput.Input("clip_vision_output"), ], @@ -60,7 +60,7 @@ class Hunyuan3Dv2ConditioningMultiView(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="Hunyuan3Dv2ConditioningMultiView", - category="conditioning/3d_models", + category="model/conditioning/3d_models", inputs=[ IO.ClipVisionOutput.Input("front", optional=True), IO.ClipVisionOutput.Input("left", optional=True), @@ -97,7 +97,7 @@ class VAEDecodeHunyuan3D(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="VAEDecodeHunyuan3D", - category="latent/3d", + category="model/latent/3d", inputs=[ IO.Latent.Input("samples"), IO.Vae.Input("vae"), @@ -419,15 +419,17 @@ class VoxelToMeshBasic(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="VoxelToMeshBasic", - display_name="Voxel to Mesh (Basic)", + display_name="Voxel to Mesh (Basic) (DEPRECATED)", category="3d", + description="Converts a voxel grid to a mesh.", + is_deprecated=True, # This node is superseded by the Voxel To Mesh node inputs=[ IO.Voxel.Input("voxel"), IO.Float.Input("threshold", default=0.6, min=-1.0, max=1.0, step=0.01), ], outputs=[ IO.Mesh.Output(), - ] + ], ) @classmethod @@ -453,9 +455,10 @@ class VoxelToMesh(IO.ComfyNode): node_id="VoxelToMesh", display_name="Voxel to Mesh", category="3d", + description="Converts a voxel grid to a mesh.", inputs=[ IO.Voxel.Input("voxel"), - IO.Combo.Input("algorithm", options=["surface net", "basic"], advanced=True), + IO.Combo.Input("algorithm", options=["surface net", "basic"]), IO.Float.Input("threshold", default=0.6, min=-1.0, max=1.0, step=0.01), ], outputs=[ diff --git a/comfy_extras/nodes_hypernetwork.py b/comfy_extras/nodes_hypernetwork.py index 44a9c6f97..2d3f1bd05 100644 --- a/comfy_extras/nodes_hypernetwork.py +++ b/comfy_extras/nodes_hypernetwork.py @@ -103,7 +103,7 @@ class HypernetworkLoader(IO.ComfyNode): return IO.Schema( node_id="HypernetworkLoader", 
display_name="Load Hypernetwork", - category="loaders", + category="model/loaders", inputs=[ IO.Model.Input("model"), IO.Combo.Input("hypernetwork_name", options=folder_paths.get_filename_list("hypernetworks")), diff --git a/comfy_extras/nodes_hypertile.py b/comfy_extras/nodes_hypertile.py index 354d96db1..2a96416be 100644 --- a/comfy_extras/nodes_hypertile.py +++ b/comfy_extras/nodes_hypertile.py @@ -27,7 +27,7 @@ class HyperTile(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="HyperTile", - category="model_patches/unet", + category="model/patch/unet", inputs=[ io.Model.Input("model"), io.Int.Input("tile_size", default=256, min=1, max=2048, advanced=True), diff --git a/comfy_extras/nodes_images.py b/comfy_extras/nodes_images.py index 6326c5be8..469a7be55 100644 --- a/comfy_extras/nodes_images.py +++ b/comfy_extras/nodes_images.py @@ -1,17 +1,23 @@ -from __future__ import annotations - import nodes import folder_paths +import av import json + import os import re import math +import numpy as np +import struct import torch + +import zlib import comfy.utils +from fractions import Fraction from server import PromptServer from comfy_api.latest import ComfyExtension, IO, UI +from comfy.cli_args import args from typing_extensions import override SVG = IO.SVG.Type # TODO: temporary solution for backward compatibility, will be removed later. 
@@ -55,9 +61,10 @@ class ImageCropV2(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="ImageCropV2", - search_aliases=["trim"], + search_aliases=["crop", "cut", "trim"], display_name="Crop Image", category="image/transform", + description = "Crop an image to the specified dimensions.", essentials_category="Image Tools", has_intermediate_output=True, inputs=[ @@ -88,7 +95,7 @@ class BoundingBox(IO.ComfyNode): return IO.Schema( node_id="PrimitiveBoundingBox", display_name="Bounding Box", - category="utils/primitive", + category="utilities/primitive", inputs=[ IO.Int.Input("x", default=0, min=0, max=MAX_RESOLUTION), IO.Int.Input("y", default=0, min=0, max=MAX_RESOLUTION), @@ -834,6 +841,405 @@ class ImageMergeTileList(IO.ComfyNode): return IO.NodeOutput(merged_image) +# --------------------------------------------------------------------------- +# Format specifications +# --------------------------------------------------------------------------- + +# Maps (file_format, bit_depth, has_alpha) -> (numpy dtype scale, av pixel format, +# stream pix_fmt). Keeps the encode path declarative instead of branchy. 
+_FORMAT_SPECS = { + ("png", "8-bit", False): {"scale": 255.0, "dtype": np.uint8, "frame_fmt": "rgb24", "stream_fmt": "rgb24"}, + ("png", "8-bit", True): {"scale": 255.0, "dtype": np.uint8, "frame_fmt": "rgba", "stream_fmt": "rgba"}, + ("png", "16-bit", False): {"scale": 65535.0, "dtype": np.uint16, "frame_fmt": "rgb48le", "stream_fmt": "rgb48be"}, + ("png", "16-bit", True): {"scale": 65535.0, "dtype": np.uint16, "frame_fmt": "rgba64le", "stream_fmt": "rgba64be"}, + ("exr", "32-bit float", False): {"scale": 1.0, "dtype": np.float32, "frame_fmt": "gbrpf32le", "stream_fmt": "gbrpf32le"}, + ("exr", "32-bit float", True): {"scale": 1.0, "dtype": np.float32, "frame_fmt": "gbrapf32le", "stream_fmt": "gbrapf32le"}, +} + + +# --------------------------------------------------------------------------- +# Color transforms +# --------------------------------------------------------------------------- + +def srgb_to_linear(t: torch.Tensor) -> torch.Tensor: + """Inverse sRGB EOTF (IEC 61966-2-1). Operates on RGB channels only; + alpha (if present as the 4th channel) is passed through unchanged.""" + if t.shape[-1] == 4: + rgb, alpha = t[..., :3], t[..., 3:] + return torch.cat([srgb_to_linear(rgb), alpha], dim=-1) + + # Piecewise: linear toe below 0.04045, gamma curve above. + low = t / 12.92 + high = ((t.clamp(min=0.0) + 0.055) / 1.055) ** 2.4 + return torch.where(t <= 0.04045, low, high) + + +# HLG OETF constants from BT.2100 Table 5. +_HLG_A = 0.17883277 +_HLG_B = 0.28466892 +_HLG_C = 0.55991072928 # = 0.5 - a*ln(4*a) + + +def hlg_to_linear(t: torch.Tensor) -> torch.Tensor: + """Inverse HLG OETF (BT.2100). Maps a non-linear HLG signal in [0, 1] to + *scene*-linear light in [0, 1]. Per BT.2100 Note 5a, this is the correct + transform when converting HLG to a linear scene-light representation + (rather than display-light, which would also involve the HLG OOTF). 
+ + Operates on RGB channels only; alpha is passed through unchanged.""" + if t.shape[-1] == 4: + rgb, alpha = t[..., :3], t[..., 3:] + return torch.cat([hlg_to_linear(rgb), alpha], dim=-1) + + # Piecewise: sqrt branch below 0.5, log branch above. + # Clamp inside the log branch so negative / out-of-range values don't blow up; + # values above 1.0 are allowed and extrapolate naturally. + low = (t ** 2) / 3.0 + high = (torch.exp((t.clamp(min=_HLG_C) - _HLG_C) / _HLG_A) + _HLG_B) / 12.0 + return torch.where(t <= 0.5, low, high) + + +# --------------------------------------------------------------------------- +# Metadata injection +# --------------------------------------------------------------------------- + +_PNG_SIGNATURE = b"\x89PNG\r\n\x1a\n" + + +def _png_chunk(chunk_type: bytes, data: bytes) -> bytes: + """Build a single PNG chunk: length | type | data | CRC32(type+data).""" + crc = zlib.crc32(chunk_type + data) & 0xFFFFFFFF + return struct.pack(">I", len(data)) + chunk_type + data + struct.pack(">I", crc) + + +def _png_text_chunk(keyword: str, text: str) -> bytes: + """tEXt chunk: latin-1 keyword + NUL + latin-1 text.""" + payload = keyword.encode("latin-1") + b"\x00" + text.encode("latin-1", errors="replace") + return _png_chunk(b"tEXt", payload) + + +def inject_png_metadata(png_bytes: bytes, prompt: dict | None, extra_pnginfo: dict | None) -> bytes: + """Insert ComfyUI prompt/workflow as tEXt chunks right after IHDR.""" + if not png_bytes.startswith(_PNG_SIGNATURE): + return png_bytes + + chunks: list[bytes] = [] + if prompt is not None: + chunks.append(_png_text_chunk("prompt", json.dumps(prompt))) + if extra_pnginfo: + for key, value in extra_pnginfo.items(): + chunks.append(_png_text_chunk(key, json.dumps(value))) + if not chunks: + return png_bytes + + # IHDR is always the first chunk; insert ours immediately after it. 
+ ihdr_length = struct.unpack(">I", png_bytes[8:12])[0] + ihdr_end = 8 + 8 + ihdr_length + 4 # signature + (len+type) + data + crc + return png_bytes[:ihdr_end] + b"".join(chunks) + png_bytes[ihdr_end:] + + +# Standard chromaticities (CIE 1931 xy) for the colorspaces this node writes. +# Each tuple is (Rx, Ry, Gx, Gy, Bx, By, Wx, Wy). All share D65 white point. +_CHROMATICITIES = { + # ITU-R BT.709 / sRGB primaries + "Rec.709": (0.6400, 0.3300, 0.3000, 0.6000, 0.1500, 0.0600, 0.3127, 0.3290), + # ITU-R BT.2020 (UHDTV / wide-gamut HDR) primaries + "Rec.2020": (0.7080, 0.2920, 0.1700, 0.7970, 0.1310, 0.0460, 0.3127, 0.3290), +} + + +def _pack_chromaticities(primaries: tuple) -> bytes: + """Serialize 8 chromaticity floats into the EXR `chromaticities` payload.""" + return struct.pack("<8f", *primaries) + + +def _exr_attribute(name: str, attr_type: str, value: bytes) -> bytes: + """Serialize one EXR header attribute: name\\0 type\\0 size:int32 value.""" + return ( + name.encode("utf-8") + b"\x00" + + attr_type.encode("utf-8") + b"\x00" + + struct.pack(" bytes: + """Insert ComfyUI metadata and color-space info into an EXR header. + + Color: EXR pixels are linear by convention. The standard way to describe + their RGB→XYZ relationship is the `chromaticities` attribute. We pick the + primaries that match what the user told us their input was: + + colorspace="sRGB" → Rec. 709 / sRGB primaries (D65) + colorspace="HDR" → Rec. 2020 / BT.2100 primaries (D65) + + Pixels are always converted to linear scene light upstream (sRGB EOTF + inverse for sRGB; HLG OETF inverse for HDR), so the file content is + scene-linear in the indicated gamut. OpenEXR has no standard transfer- + function attribute (the OpenEXR TSC has discussed adding one but it + doesn't exist), so we don't invent one — `chromaticities` plus the EXR + linear-by-convention rule fully specifies the color. + + Prompt/workflow: written as plain `string` attributes using the same keys + (`prompt`, `workflow`, ...) 
that Comfy uses for PNG tEXt chunks, so the + same readers can pull them out symmetrically. + + Implementation note: the chunk-offset table that follows the header stores + *absolute* byte offsets into the file. Inserting N bytes into the header + means every offset must be incremented by N or the file becomes unreadable. + """ + if len(exr_bytes) < 8 or exr_bytes[:4] != b"\x76\x2f\x31\x01": + return exr_bytes + + new_blob = b"" + if prompt is not None: + new_blob += _exr_attribute("prompt", "string", json.dumps(prompt).encode("utf-8")) + if extra_pnginfo: + for key, value in extra_pnginfo.items(): + new_blob += _exr_attribute(key, "string", json.dumps(value).encode("utf-8")) + if colorspace is not None: + # Map each colorspace option to the RGB primaries the linear pixels + # are now in. "sRGB" and "linear" both produce Rec. 709 linear; "HDR" + # (HLG-encoded Rec. 2020 input) produces Rec. 2020 linear. + primaries_name = { + "sRGB": "Rec.709", + "linear": "Rec.709", + "HDR": "Rec.2020", + }.get(colorspace, "Rec.709") + new_blob += _exr_attribute( + "chromaticities", + "chromaticities", + _pack_chromaticities(_CHROMATICITIES[primaries_name]), + ) + if not new_blob: + return exr_bytes + + # Walk header attributes to find the terminating null byte, and pick up + # dataWindow + compression so we know how many chunks the offset table has. + pos = 8 # past magic (4) + version (4) + data_window = None + compression = 0 + while pos < len(exr_bytes) and exr_bytes[pos] != 0: + name_end = exr_bytes.index(b"\x00", pos) + attr_name = exr_bytes[pos:name_end].decode("latin-1", errors="replace") + type_end = exr_bytes.index(b"\x00", name_end + 1) + attr_type = exr_bytes[name_end + 1:type_end].decode("latin-1", errors="replace") + size = struct.unpack(" bytes: + """Encode a single HxWxC tensor to PNG or EXR bytes in memory. 
+ + For EXR the input is interpreted according to `colorspace` and converted + to scene-linear (EXR's convention) before writing: + + "sRGB" → input is sRGB-encoded Rec. 709; apply inverse sRGB EOTF. + "HDR" → input is HLG-encoded Rec. 2020 (BT.2100); apply inverse HLG + OETF to get scene-linear, per BT.2100 Note 5a. + "linear" → input is already scene-linear (Rec. 709 primaries); write + through unchanged. Use this for renderer/compositor output. + + For PNG, colorspace selection does not modify pixels — PNG is delivered + sRGB-encoded and there is no PNG path for wide-gamut HDR in this node. + """ + height, width, num_channels = img_tensor.shape + has_alpha = num_channels == 4 + + spec = _FORMAT_SPECS[(file_format, bit_depth, has_alpha)] + + if spec["dtype"] == np.float32: + # EXR path: preserve full range, no clamp. + if colorspace == "sRGB": + img_tensor = srgb_to_linear(img_tensor) + elif colorspace == "HDR": + img_tensor = hlg_to_linear(img_tensor) + img_np = img_tensor.cpu().numpy().astype(np.float32) + else: + # PNG path: quantize to integer range. + scaled = (img_tensor * spec["scale"]).clamp(0, spec["scale"]) + img_np = scaled.to(torch.int32).cpu().numpy().astype(spec["dtype"]) + + # Encode directly via CodecContext. PyAV's `image2` muxer does NOT write to + # BytesIO (it expects a real file path), so we bypass the container entirely. + # For single-frame PNG/EXR the raw codec output IS the file. 
class SaveImageAdvanced(IO.ComfyNode):
    """Output node that writes each image of a batch to the output directory.

    Two formats are offered through a dynamic combo:

    * ``png`` with an "8-bit" or "16-bit" depth and an sRGB input color space;
    * ``exr`` with a "32-bit float" depth and an input color space of
      "sRGB", "HDR" or "linear".

    Encoding is delegated to ``_encode_image``; metadata is added afterwards
    by ``inject_png_metadata`` / ``inject_exr_metadata``.
    """

    @classmethod
    def define_schema(cls):
        """Describe the node's inputs, hidden inputs and UI metadata."""
        return IO.Schema(
            node_id="SaveImageAdvanced",
            search_aliases=["save", "save image", "export image", "output image", "write image"],
            display_name="Save Image (Advanced)",
            description="Saves the input images to your ComfyUI output directory.",
            category="image",
            essentials_category="Basics",
            inputs=[
                IO.Image.Input("images", tooltip="The images to save."),
                IO.String.Input(
                    "filename_prefix",
                    default="ComfyUI",
                    tooltip=(
                        "The prefix for the file to save. May include formatting tokens "
                        "such as %date:yyyy-MM-dd% or %Empty Latent Image.width%."
                    ),
                ),
                IO.DynamicCombo.Input(
                    "format",
                    options=[
                        IO.DynamicCombo.Option("png", [
                            IO.Combo.Input("bit_depth", options=["8-bit", "16-bit"],
                                           default="8-bit", advanced=True),
                            IO.Combo.Input("input_color_space", options=["sRGB"],
                                           default="sRGB", advanced=True),
                        ]),
                        IO.DynamicCombo.Option("exr", [
                            IO.Combo.Input("bit_depth", options=["32-bit float"],
                                           default="32-bit float", advanced=True),
                            IO.Combo.Input(
                                "input_color_space",
                                options=["sRGB", "HDR", "linear"],
                                default="sRGB",
                                advanced=True,
                                tooltip=(
                                    "Colorspace of the input tensor. The EXR is "
                                    "always written as scene-linear in the matching "
                                    "gamut.\n"
                                    " 'sRGB' — input is sRGB-encoded Rec.709; "
                                    "the inverse sRGB EOTF is applied.\n"
                                    " 'HDR' — input is HLG-encoded Rec.2020 "
                                    "(BT.2100); the inverse HLG OETF is applied "
                                    "to get scene-linear light.\n"
                                    " 'linear' — input is already scene-linear "
                                    "(Rec.709 primaries); written through unchanged. "
                                    "Use this for renderer/compositor output."
                                ),
                            ),
                        ]),
                    ],
                    tooltip="The file format in which to save the image.",
                ),
            ],
            hidden=[IO.Hidden.prompt, IO.Hidden.extra_pnginfo],
            is_output_node=True,
        )

    @classmethod
    def execute(cls, images, filename_prefix: str, format: dict) -> IO.NodeOutput:
        """Encode and write every image in ``images``; report the files to the UI.

        Args:
            images: Batch of image tensors; each item is passed to
                ``_encode_image`` on its own.
            filename_prefix: Prefix handed to ``folder_paths.get_save_image_path``.
            format: Dynamic-combo payload with the keys ``"format"``,
                ``"bit_depth"`` and (optionally) ``"input_color_space"``.

        Returns:
            A ``NodeOutput`` whose ``ui["images"]`` lists one
            ``{"filename", "subfolder", "type"}`` record per written file.
        """
        file_format = format["format"]
        bit_depth = format["bit_depth"]
        colorspace = format.get("input_color_space", "sRGB")

        output_dir = folder_paths.get_output_directory()
        full_output_folder, filename, counter, subfolder, filename_prefix = (
            folder_paths.get_save_image_path(
                filename_prefix, output_dir, images[0].shape[1], images[0].shape[0]
            )
        )

        write_metadata = not args.disable_metadata
        prompt = cls.hidden.prompt
        extra_pnginfo = cls.hidden.extra_pnginfo

        # Prompt/workflow are the optional part of the EXR header. With
        # metadata disabled they are passed as None so inject_exr_metadata
        # omits them while still emitting the `chromaticities` attribute.
        exr_prompt = prompt if write_metadata else None
        exr_extra = extra_pnginfo if write_metadata else None

        results = []
        for batch_number, image in enumerate(images):
            encoded = _encode_image(image, file_format, bit_depth, colorspace)

            if file_format == "png":
                if write_metadata:
                    encoded = inject_png_metadata(encoded, prompt, extra_pnginfo)
            elif file_format == "exr":
                # The colorimetry tag describes how to interpret the pixels,
                # so it is written even when metadata is disabled: an EXR
                # without `chromaticities` is conventionally read as Rec. 709,
                # which would misrepresent Rec. 2020 ("HDR") output.
                encoded = inject_exr_metadata(encoded, exr_prompt, exr_extra, colorspace)

            name = filename.replace("%batch_num%", str(batch_number))
            file = f"{name}_{counter:05}.{file_format}"
            with open(os.path.join(full_output_folder, file), "wb") as f:
                f.write(encoded)

            results.append({"filename": file, "subfolder": subfolder, "type": "output"})
            counter += 1

        return IO.NodeOutput(ui={"images": results})
@@ class ImagesExtension(ComfyExtension): ImageAddNoise, SaveAnimatedWEBP, SaveAnimatedPNG, + SaveImageAdvanced, SaveSVGNode, ImageStitch, ResizeAndPadImage, diff --git a/comfy_extras/nodes_ip2p.py b/comfy_extras/nodes_ip2p.py index 78f29915d..9c80834f0 100644 --- a/comfy_extras/nodes_ip2p.py +++ b/comfy_extras/nodes_ip2p.py @@ -9,7 +9,7 @@ class InstructPixToPixConditioning(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="InstructPixToPixConditioning", - category="conditioning/instructpix2pix", + category="model/conditioning/instructpix2pix", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), diff --git a/comfy_extras/nodes_kandinsky5.py b/comfy_extras/nodes_kandinsky5.py index 346c50cde..015965498 100644 --- a/comfy_extras/nodes_kandinsky5.py +++ b/comfy_extras/nodes_kandinsky5.py @@ -13,7 +13,7 @@ class Kandinsky5ImageToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="Kandinsky5ImageToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -71,7 +71,7 @@ class NormalizeVideoLatentStart(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="NormalizeVideoLatentStart", - category="conditioning/video_models", + category="model/conditioning/video_models", description="Normalizes the initial frames of a video latent to match the mean and standard deviation of subsequent reference frames. 
Helps reduce differences between the starting frames and the rest of the video.", inputs=[ io.Latent.Input("latent"), diff --git a/comfy_extras/nodes_latent.py b/comfy_extras/nodes_latent.py index 8bb368dec..32da9e8ac 100644 --- a/comfy_extras/nodes_latent.py +++ b/comfy_extras/nodes_latent.py @@ -22,7 +22,7 @@ class LatentAdd(io.ComfyNode): return io.Schema( node_id="LatentAdd", search_aliases=["combine latents", "sum latents"], - category="latent/advanced", + category="model/latent/advanced", inputs=[ io.Latent.Input("samples1"), io.Latent.Input("samples2"), @@ -49,7 +49,7 @@ class LatentSubtract(io.ComfyNode): return io.Schema( node_id="LatentSubtract", search_aliases=["difference latent", "remove features"], - category="latent/advanced", + category="model/latent/advanced", inputs=[ io.Latent.Input("samples1"), io.Latent.Input("samples2"), @@ -76,7 +76,7 @@ class LatentMultiply(io.ComfyNode): return io.Schema( node_id="LatentMultiply", search_aliases=["scale latent", "amplify latent", "latent gain"], - category="latent/advanced", + category="model/latent/advanced", inputs=[ io.Latent.Input("samples"), io.Float.Input("multiplier", default=1.0, min=-10.0, max=10.0, step=0.01), @@ -100,7 +100,7 @@ class LatentInterpolate(io.ComfyNode): return io.Schema( node_id="LatentInterpolate", search_aliases=["blend latent", "mix latent", "lerp latent", "transition"], - category="latent/advanced", + category="model/latent/advanced", inputs=[ io.Latent.Input("samples1"), io.Latent.Input("samples2"), @@ -139,7 +139,7 @@ class LatentConcat(io.ComfyNode): return io.Schema( node_id="LatentConcat", search_aliases=["join latents", "stitch latents"], - category="latent/advanced", + category="model/latent/advanced", inputs=[ io.Latent.Input("samples1"), io.Latent.Input("samples2"), @@ -179,7 +179,7 @@ class LatentCut(io.ComfyNode): return io.Schema( node_id="LatentCut", search_aliases=["crop latent", "slice latent", "extract region"], - category="latent/advanced", + 
category="model/latent/advanced", inputs=[ io.Latent.Input("samples"), io.Combo.Input("dim", options=["x", "y", "t"]), @@ -220,7 +220,7 @@ class LatentCutToBatch(io.ComfyNode): return io.Schema( node_id="LatentCutToBatch", search_aliases=["slice to batch", "split latent", "tile latent"], - category="latent/advanced", + category="model/latent/advanced", inputs=[ io.Latent.Input("samples"), io.Combo.Input("dim", options=["t", "x", "y"]), @@ -262,7 +262,7 @@ class LatentBatch(io.ComfyNode): return io.Schema( node_id="LatentBatch", search_aliases=["combine latents", "merge latents", "join latents"], - category="latent/batch", + category="model/latent/batch", is_deprecated=True, inputs=[ io.Latent.Input("samples1"), @@ -290,7 +290,7 @@ class LatentBatchSeedBehavior(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LatentBatchSeedBehavior", - category="latent/advanced", + category="model/latent/advanced", inputs=[ io.Latent.Input("samples"), io.Combo.Input("seed_behavior", options=["random", "fixed"], default="fixed"), @@ -319,7 +319,7 @@ class LatentApplyOperation(io.ComfyNode): return io.Schema( node_id="LatentApplyOperation", search_aliases=["transform latent"], - category="latent/advanced/operations", + category="model/latent/advanced/operations", is_experimental=True, inputs=[ io.Latent.Input("samples"), @@ -343,7 +343,7 @@ class LatentApplyOperationCFG(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LatentApplyOperationCFG", - category="latent/advanced/operations", + category="model/latent/advanced/operations", is_experimental=True, inputs=[ io.Model.Input("model"), @@ -375,7 +375,7 @@ class LatentOperationTonemapReinhard(io.ComfyNode): return io.Schema( node_id="LatentOperationTonemapReinhard", search_aliases=["hdr latent"], - category="latent/advanced/operations", + category="model/latent/advanced/operations", is_experimental=True, inputs=[ io.Float.Input("multiplier", default=1.0, min=0.0, max=100.0, step=0.01), @@ -410,7 +410,7 
@@ class LatentOperationSharpen(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LatentOperationSharpen", - category="latent/advanced/operations", + category="model/latent/advanced/operations", is_experimental=True, inputs=[ io.Int.Input("sharpen_radius", default=9, min=1, max=31, step=1, advanced=True), @@ -447,7 +447,7 @@ class ReplaceVideoLatentFrames(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="ReplaceVideoLatentFrames", - category="latent/batch", + category="model/latent/batch", inputs=[ io.Latent.Input("destination", tooltip="The destination latent where frames will be replaced."), io.Latent.Input("source", optional=True, tooltip="The source latent providing frames to insert into the destination latent. If not provided, the destination latent is returned unchanged."), diff --git a/comfy_extras/nodes_load_3d.py b/comfy_extras/nodes_load_3d.py index 9112bdd0a..9c27c0191 100644 --- a/comfy_extras/nodes_load_3d.py +++ b/comfy_extras/nodes_load_3d.py @@ -34,7 +34,7 @@ class Load3D(IO.ComfyNode): essentials_category="Basics", is_experimental=True, inputs=[ - IO.Combo.Input("model_file", options=sorted(files), upload=IO.UploadType.model), + IO.Combo.Input("model_file", options=["none"] + sorted(files), upload=IO.UploadType.model), IO.Load3D.Input("image"), IO.Int.Input("width", default=1024, min=1, max=4096, step=1), IO.Int.Input("height", default=1024, min=1, max=4096, step=1), @@ -68,8 +68,12 @@ class Load3D(IO.ComfyNode): video = InputImpl.VideoFromFile(recording_video_path) - file_3d = Types.File3D(folder_paths.get_annotated_filepath(model_file)) - return IO.NodeOutput(output_image, output_mask, model_file, normal_image, image['camera_info'], video, file_3d) + file_3d = None + mesh_path = "" + if model_file and model_file != "none": + file_3d = Types.File3D(folder_paths.get_annotated_filepath(model_file)) + mesh_path = model_file + return IO.NodeOutput(output_image, output_mask, mesh_path, normal_image, image['camera_info'], 
class NotNode(io.ComfyNode):
    """Logic node that outputs the boolean negation of an arbitrary value."""

    @classmethod
    def define_schema(cls):
        """Declare one any-typed input and one boolean output."""
        return io.Schema(
            node_id="ComfyNotNode",
            display_name="Not",
            category="utilities/logic",
            search_aliases=["invert", "toggle", "negate", "flip boolean"],
            description="Logical NOT operation. Returns true if the value is falsy. Uses Python's rules for truthiness.",
            inputs=[io.AnyType.Input("value")],
            outputs=[io.Boolean.Output()],
        )

    @classmethod
    def execute(cls, value) -> io.NodeOutput:
        """Return ``True`` when ``value`` is falsy and ``False`` otherwise."""
        negated = False if value else True
        return io.NodeOutput(negated)


class AndNode(io.ComfyNode):
    """Logic node that reports whether every supplied value is truthy."""

    @classmethod
    def define_schema(cls):
        """Declare an auto-growing list of any-typed inputs (at least one)."""
        value_slot = io.AnyType.Input("value")
        growing_values = io.Autogrow.TemplatePrefix(
            input=value_slot,
            prefix="value",
            min=1,
        )
        return io.Schema(
            node_id="ComfyAndNode",
            display_name="And",
            category="utilities/logic",
            search_aliases=["all", "every"],
            description="Logical AND operation. Returns true if all of the values are truthy. Uses Python's rules for truthiness.",
            inputs=[io.Autogrow.Input("values", template=growing_values)],
            outputs=[io.Boolean.Output()],
        )

    @classmethod
    def execute(cls, values: io.Autogrow.Type) -> io.NodeOutput:
        """Return ``all(...)`` over the values of the ``values`` mapping."""
        every_truthy = all(values.values())
        return io.NodeOutput(every_truthy)


class OrNode(io.ComfyNode):
    """Logic node that reports whether at least one supplied value is truthy."""

    @classmethod
    def define_schema(cls):
        """Declare an auto-growing list of any-typed inputs (at least one)."""
        value_slot = io.AnyType.Input("value")
        growing_values = io.Autogrow.TemplatePrefix(
            input=value_slot,
            prefix="value",
            min=1,
        )
        return io.Schema(
            node_id="ComfyOrNode",
            display_name="Or",
            category="utilities/logic",
            search_aliases=["any", "some"],
            description="Logical OR operation. Returns true if any of the values are truthy. Uses Python's rules for truthiness.",
            inputs=[io.Autogrow.Input("values", template=growing_values)],
            outputs=[io.Boolean.Output()],
        )

    @classmethod
    def execute(cls, values: io.Autogrow.Type) -> io.NodeOutput:
        """Return ``any(...)`` over the values of the ``values`` mapping."""
        some_truthy = any(values.values())
        return io.NodeOutput(some_truthy)
ComboOutputTestNode(io.ComfyNode): return io.Schema( node_id="ComboOptionTestNode", display_name="ComboOptionTest", - category="logic", + category="utilities/logic", inputs=[io.Combo.Input("combo", options=["option1", "option2", "option3"]), io.Combo.Input("combo2", options=["option4", "option5", "option6"])], outputs=[io.Combo.Output(), io.Combo.Output()], @@ -230,7 +305,7 @@ class ConvertStringToComboNode(io.ComfyNode): node_id="ConvertStringToComboNode", search_aliases=["string to dropdown", "text to combo"], display_name="Convert String to Combo", - category="logic", + category="utilities/logic", inputs=[io.String.Input("string")], outputs=[io.Combo.Output()], ) @@ -246,7 +321,7 @@ class InvertBooleanNode(io.ComfyNode): node_id="InvertBooleanNode", search_aliases=["not", "toggle", "negate", "flip boolean"], display_name="Invert Boolean", - category="logic", + category="utilities/logic", inputs=[io.Boolean.Input("boolean")], outputs=[io.Boolean.Output()], ) @@ -261,6 +336,9 @@ class LogicExtension(ComfyExtension): return [ SwitchNode, CustomComboNode, + NotNode, + AndNode, + OrNode, # SoftSwitchNode, # ConvertStringToComboNode, # DCTestNode, diff --git a/comfy_extras/nodes_lora_debug.py b/comfy_extras/nodes_lora_debug.py index 937a0fbfb..3f68064e5 100644 --- a/comfy_extras/nodes_lora_debug.py +++ b/comfy_extras/nodes_lora_debug.py @@ -30,7 +30,7 @@ class LoraLoaderBypass: OUTPUT_TOOLTIPS = ("The modified diffusion model.", "The modified CLIP model.") FUNCTION = "load_lora" - CATEGORY = "loaders" + CATEGORY = "model/loaders" DESCRIPTION = "Apply LoRA in bypass mode. Unlike regular LoRA, this doesn't modify model weights - instead it injects the LoRA computation during forward pass. Useful for training scenarios." 
EXPERIMENTAL = True diff --git a/comfy_extras/nodes_lotus.py b/comfy_extras/nodes_lotus.py index 9f62ba2bf..9fe4c5c7b 100644 --- a/comfy_extras/nodes_lotus.py +++ b/comfy_extras/nodes_lotus.py @@ -10,7 +10,7 @@ class LotusConditioning(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LotusConditioning", - category="conditioning/lotus", + category="model/conditioning/lotus", inputs=[], outputs=[io.Conditioning.Output(display_name="conditioning")], ) diff --git a/comfy_extras/nodes_lt.py b/comfy_extras/nodes_lt.py index 675de4f81..6d6078abe 100644 --- a/comfy_extras/nodes_lt.py +++ b/comfy_extras/nodes_lt.py @@ -25,7 +25,7 @@ class GetICLoRAParameters(io.ComfyNode): display_name="Get IC-LoRA Parameters", description="Extracts IC-LoRA parameters from the safetensors metadata of a LoRA-loaded " "model and outputs them for LTXVAddGuide (eg. reference_downscale_factor).", - category="conditioning/video_models", + category="model/conditioning/video_models", search_aliases=["ic-lora", "ic lora", "iclora", "downscale factor", "reference downscale"], inputs=[ io.Model.Input( @@ -62,7 +62,7 @@ class EmptyLTXVLatentVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="EmptyLTXVLatentVideo", - category="latent/video/ltxv", + category="model/latent/video/ltxv", inputs=[ io.Int.Input("width", default=768, min=64, max=nodes.MAX_RESOLUTION, step=32), io.Int.Input("height", default=512, min=64, max=nodes.MAX_RESOLUTION, step=32), @@ -77,7 +77,7 @@ class EmptyLTXVLatentVideo(io.ComfyNode): @classmethod def execute(cls, width, height, length, batch_size=1) -> io.NodeOutput: latent = torch.zeros([batch_size, 128, ((length - 1) // 8) + 1, height // 32, width // 32], device=comfy.model_management.intermediate_device()) - return io.NodeOutput({"samples": latent}) + return io.NodeOutput({"samples": latent, "downscale_ratio_spacial": 32}) generate = execute # TODO: remove @@ -86,7 +86,7 @@ class LTXVImgToVideo(io.ComfyNode): def define_schema(cls): return 
def get_keyframe_idxs(cond, latent_shape=None):
    """Fetch ``keyframe_idxs`` from the conditioning and count the keyframes.

    The count comes from the first source that is available:

    1. ``latent_shape`` when it has five dimensions: the size of dimension 2
       of ``keyframe_idxs`` divided by the product of the last two latent
       dimensions (tokens per frame).
    2. ``guide_attention_entries`` in the conditioning: the sum of each
       entry's ``latent_shape[0]``.
    3. Otherwise the number of unique start positions in ``keyframe_idxs``.

    Args:
        cond: Conditioning to query via ``conditioning_get_any_value``.
        latent_shape: Optional latent tensor shape; only used when 5-D.

    Returns:
        ``(keyframe_idxs, num_keyframes)``, or ``(None, 0)`` when the
        conditioning carries no ``keyframe_idxs``.
    """
    idxs = conditioning_get_any_value(cond, "keyframe_idxs", None)
    if idxs is None:
        return None, 0

    if latent_shape is not None and len(latent_shape) == 5:
        # Preferred: derive the count from the token layout of the latent.
        frame_tokens = latent_shape[-2] * latent_shape[-1]
        count = idxs.shape[2] // frame_tokens
    else:
        guide_entries = conditioning_get_any_value(cond, "guide_attention_entries", None)
        if guide_entries:
            count = sum(entry["latent_shape"][0] for entry in guide_entries)
        else:
            # Fallback; may under-count if keyframes share the same t-start.
            # keyframe_idxs holds start/end positions in its last dimension,
            # so only the unique start values are counted here.
            count = torch.unique(idxs[:, 0, :, 0]).shape[0]
    return idxs, count
get_latent_index(cls, cond, latent_length, guide_length, frame_idx, scale_factors, latent_shape=None): time_scale_factor, _, _ = scale_factors - _, num_keyframes = get_keyframe_idxs(cond) + _, num_keyframes = get_keyframe_idxs(cond, latent_shape) latent_count = latent_length - num_keyframes frame_idx = frame_idx if frame_idx >= 0 else max((latent_count - 1) * time_scale_factor + 1 + frame_idx, 0) if guide_length > 1 and frame_idx != 0: @@ -436,7 +446,7 @@ class LTXVAddGuide(io.ComfyNode): num_frames_to_keep = ((image.shape[0] - 1) // time_scale_factor) * time_scale_factor + 1 resolved_frame_idx = frame_idx if frame_idx < 0: - _, num_keyframes = get_keyframe_idxs(positive) + _, num_keyframes = get_keyframe_idxs(positive, latent_image.shape) resolved_frame_idx = max((latent_length - num_keyframes - 1) * time_scale_factor + 1 + frame_idx, 0) causal_fix = resolved_frame_idx == 0 or num_frames_to_keep == 1 @@ -454,7 +464,7 @@ class LTXVAddGuide(io.ComfyNode): if latent_downscale_factor > 1: t, guide_mask = cls.dilate_latent(t, latent_downscale_factor) - frame_idx, latent_idx = cls.get_latent_index(positive, latent_length, len(image), frame_idx, scale_factors) + frame_idx, latent_idx = cls.get_latent_index(positive, latent_length, len(image), frame_idx, scale_factors, latent_shape=latent_image.shape) assert latent_idx + t.shape[2] <= latent_length, "Conditioning frames exceed the length of the latent sequence." 
positive, negative, latent_image, noise_mask = cls.append_keyframe( @@ -488,7 +498,7 @@ class LTXVCropGuides(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LTXVCropGuides", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -506,7 +516,7 @@ class LTXVCropGuides(io.ComfyNode): latent_image = latent["samples"].clone() noise_mask = get_noise_mask(latent) - _, num_keyframes = get_keyframe_idxs(positive) + _, num_keyframes = get_keyframe_idxs(positive, latent_image.shape) if num_keyframes == 0: return io.NodeOutput(positive, negative, {"samples": latent_image, "noise_mask": noise_mask},) @@ -532,7 +542,7 @@ class LTXVConditioning(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LTXVConditioning", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -601,7 +611,7 @@ class LTXVScheduler(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LTXVScheduler", - category="sampling/schedulers", + category="model/sampling/schedulers", inputs=[ io.Int.Input("steps", default=20, min=1, max=10000), io.Float.Input("max_shift", default=2.05, min=0.0, max=100.0, step=0.01), @@ -736,7 +746,7 @@ class LTXVConcatAVLatent(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LTXVConcatAVLatent", - category="latent/video/ltxv", + category="model/latent/video/ltxv", inputs=[ io.Latent.Input("video_latent"), io.Latent.Input("audio_latent"), @@ -771,7 +781,7 @@ class LTXVSeparateAVLatent(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LTXVSeparateAVLatent", - category="latent/video/ltxv", + category="model/latent/video/ltxv", description="LTXV Separate AV Latent", inputs=[ io.Latent.Input("av_latent"), @@ -804,7 +814,7 @@ class LTXVReferenceAudio(io.ComfyNode): return io.Schema( 
node_id="LTXVReferenceAudio", display_name="LTXV Reference Audio (ID-LoRA)", - category="conditioning/audio", + category="model/conditioning/audio", description="Set reference audio for ID-LoRA speaker identity transfer. Encodes a reference audio clip into the conditioning and optionally patches the model with identity guidance (extra forward pass without reference, amplifying the speaker identity effect).", inputs=[ io.Model.Input("model"), diff --git a/comfy_extras/nodes_lt_audio.py b/comfy_extras/nodes_lt_audio.py index 2c1f63afb..052186083 100644 --- a/comfy_extras/nodes_lt_audio.py +++ b/comfy_extras/nodes_lt_audio.py @@ -11,8 +11,8 @@ class LTXVAudioVAELoader(io.ComfyNode): def define_schema(cls) -> io.Schema: return io.Schema( node_id="LTXVAudioVAELoader", - display_name="LTXV Audio VAE Loader", - category="audio", + display_name="Load LTXV Audio VAE", + category="model/loaders", inputs=[ io.Combo.Input( "ckpt_name", @@ -40,7 +40,7 @@ class LTXVAudioVAEEncode(VAEEncodeAudio): return io.Schema( node_id="LTXVAudioVAEEncode", display_name="LTXV Audio VAE Encode", - category="audio", + category="model/latent/audio", inputs=[ io.Audio.Input("audio", tooltip="The audio to be encoded."), io.Vae.Input( @@ -63,7 +63,7 @@ class LTXVAudioVAEDecode(io.ComfyNode): return io.Schema( node_id="LTXVAudioVAEDecode", display_name="LTXV Audio VAE Decode", - category="audio", + category="model/latent/audio", inputs=[ io.Latent.Input("samples", tooltip="The latent to be decoded."), io.Vae.Input( @@ -96,7 +96,7 @@ class LTXVEmptyLatentAudio(io.ComfyNode): return io.Schema( node_id="LTXVEmptyLatentAudio", display_name="LTXV Empty Latent Audio", - category="latent/audio", + category="model/latent/audio", inputs=[ io.Int.Input( "frames_number", diff --git a/comfy_extras/nodes_lt_upsampler.py b/comfy_extras/nodes_lt_upsampler.py index f99ba13fb..be9a36e69 100644 --- a/comfy_extras/nodes_lt_upsampler.py +++ b/comfy_extras/nodes_lt_upsampler.py @@ -1,32 +1,32 @@ from comfy import 
model_management +from comfy_api.latest import ComfyExtension, IO +from typing_extensions import override import math -class LTXVLatentUpsampler: + +class LTXVLatentUpsampler(IO.ComfyNode): """ Upsamples a video latent by a factor of 2. """ @classmethod - def INPUT_TYPES(s): - return { - "required": { - "samples": ("LATENT",), - "upscale_model": ("LATENT_UPSCALE_MODEL",), - "vae": ("VAE",), - } - } + def define_schema(cls): + return IO.Schema( + node_id="LTXVLatentUpsampler", + category="model/latent/video", + is_experimental=True, + inputs=[ + IO.Latent.Input("samples"), + IO.LatentUpscaleModel.Input("upscale_model"), + IO.Vae.Input("vae"), + ], + outputs=[ + IO.Latent.Output(), + ], + ) - RETURN_TYPES = ("LATENT",) - FUNCTION = "upsample_latent" - CATEGORY = "latent/video" - EXPERIMENTAL = True - - def upsample_latent( - self, - samples: dict, - upscale_model, - vae, - ) -> tuple: + @classmethod + def execute(cls, samples, upscale_model, vae) -> IO.NodeOutput: """ Upsample the input latent using the provided model. 
@@ -34,7 +34,6 @@ class LTXVLatentUpsampler: samples (dict): Input latent samples upscale_model (LatentUpsampler): Loaded upscale model vae: VAE model for normalization - auto_tiling (bool): Whether to automatically tile the input for processing Returns: tuple: Tuple containing the upsampled latent @@ -67,9 +66,16 @@ class LTXVLatentUpsampler: return_dict = samples.copy() return_dict["samples"] = upsampled_latents return_dict.pop("noise_mask", None) - return (return_dict,) + return IO.NodeOutput(return_dict) + + upsample_latent = execute # TODO: remove -NODE_CLASS_MAPPINGS = { - "LTXVLatentUpsampler": LTXVLatentUpsampler, -} +class LTXVLatentUpsamplerExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [LTXVLatentUpsampler] + + +async def comfy_entrypoint() -> LTXVLatentUpsamplerExtension: + return LTXVLatentUpsamplerExtension() diff --git a/comfy_extras/nodes_lumina2.py b/comfy_extras/nodes_lumina2.py index b35ab8b7d..c060a86a0 100644 --- a/comfy_extras/nodes_lumina2.py +++ b/comfy_extras/nodes_lumina2.py @@ -81,7 +81,7 @@ class CLIPTextEncodeLumina2(io.ComfyNode): node_id="CLIPTextEncodeLumina2", search_aliases=["lumina prompt"], display_name="CLIP Text Encode for Lumina2", - category="conditioning", + category="model/conditioning", description="Encodes a system prompt and a user prompt using a CLIP model into an embedding " "that can be used to guide the diffusion model towards generating specific images.", inputs=[ diff --git a/comfy_extras/nodes_mask.py b/comfy_extras/nodes_mask.py index d15f1f4e7..52484697a 100644 --- a/comfy_extras/nodes_mask.py +++ b/comfy_extras/nodes_mask.py @@ -53,7 +53,7 @@ class LatentCompositeMasked(IO.ComfyNode): return IO.Schema( node_id="LatentCompositeMasked", search_aliases=["overlay latent", "layer latent", "paste latent", "inpaint latent"], - category="latent", + category="model/latent", inputs=[ IO.Latent.Input("destination"), IO.Latent.Input("source"), diff --git 
a/comfy_extras/nodes_math.py b/comfy_extras/nodes_math.py index 6030ee9d8..873ee7b51 100644 --- a/comfy_extras/nodes_math.py +++ b/comfy_extras/nodes_math.py @@ -4,7 +4,6 @@ Provides a ComfyMathExpression node that evaluates math expressions against dynamically-grown numeric inputs. """ -from __future__ import annotations import math import string @@ -70,7 +69,7 @@ class MathExpressionNode(io.ComfyNode): return io.Schema( node_id="ComfyMathExpression", display_name="Math Expression", - category="logic", + category="utilities", search_aliases=[ "expression", "formula", "calculate", "calculator", "eval", "math", diff --git a/comfy_extras/nodes_mediapipe.py b/comfy_extras/nodes_mediapipe.py new file mode 100644 index 000000000..343d88dbb --- /dev/null +++ b/comfy_extras/nodes_mediapipe.py @@ -0,0 +1,508 @@ +"""ComfyUI nodes for the pure-PyTorch MediaPipe Face Landmarker port. + +Custom IO types: + FACE_LANDMARKER — FaceLandmarkerModel wrapper (ModelPatcher inside) + FACE_LANDMARKS — {"frames": List[List[face_dict]], "image_size": (H, W), + "connection_sets": dict[str, frozenset[(int, int)]]} + face_dict: bbox_xyxy, blendshapes, landmarks_xy, + landmarks_3d, presence, score, transformation_matrix + +MediaPipeFaceLandmarker also emits the core BOUNDING_BOX type — pair with DrawBBoxes. 
+""" + + +import numpy as np +import torch +from PIL import Image, ImageColor, ImageDraw +from tqdm.auto import tqdm +from typing_extensions import override + +import comfy.model_management +import comfy.model_patcher +import comfy.utils +import folder_paths +from comfy_api.latest import ComfyExtension, io + +from comfy_extras.mediapipe.face_landmarker import FaceLandmarker +from comfy_extras.mediapipe.face_geometry import transformation_matrix_from_detection + + +FaceDetectionType = io.Custom("FACE_DETECTION_MODEL") +FaceLandmarksType = io.Custom("FACE_LANDMARKS") + +_CANONICAL_KEYS = ("canonical_vertices", "procrustes_indices", "procrustes_weights") +_CONTOUR_PARTS = ("face_oval", "left_eye", "right_eye", "left_eyebrow", "right_eyebrow", "lips") + + +class FaceLandmarkerModel: + """Loaded FaceLandmarker variants + ModelPatcher per variant. + + Safetensors layout: `detector_short.*` / `detector_full.*` plus shared + `mesh.*`, `blendshapes.*`, `canonical_*`, and `topology.*`. + PReLU forces plain-nn / fp32 (manual_cast strands buffers across devices). + """ + + def __init__(self, state_dict: dict): + self.load_device = comfy.model_management.text_encoder_device() + offload_device = comfy.model_management.text_encoder_offload_device() + self.dtype = torch.float32 + + # FACEMESH_* connection sets, embedded as int32 (N, 2) under topology.*. 
+ base: dict[str, frozenset] = {} + for k in [k for k in state_dict if k.startswith("topology.")]: + base[k[len("topology."):]] = frozenset(map(tuple, state_dict.pop(k).tolist())) + base["contours"] = frozenset().union(*(base[p] for p in _CONTOUR_PARTS)) + base["all"] = base["contours"] | base["irises"] | base["nose"] + + self.connection_sets: dict[str, frozenset] = base + self.canonical_data: dict[str, np.ndarray] = {k: state_dict.pop(k).numpy() for k in _CANONICAL_KEYS} + + shared = {k: v for k, v in state_dict.items() if k.startswith(("mesh.", "blendshapes."))} + + self.models: dict[str, FaceLandmarker] = {} + self.patchers: dict[str, comfy.model_patcher.ModelPatcher] = {} + for variant in ("short", "full"): + prefix = f"detector_{variant}." + sub = dict(shared) + sub.update({f"detector.{k[len(prefix):]}": v for k, v in state_dict.items() if k.startswith(prefix)}) + fl = FaceLandmarker(device=offload_device, dtype=self.dtype, operations=None, detector_variant=variant).eval() + fl.load_state_dict(sub, strict=False) + + self.models[variant] = fl + self.patchers[variant] = comfy.model_patcher.CoreModelPatcher( + fl, load_device=self.load_device, offload_device=offload_device, + size=comfy.model_management.module_size(fl), + ) + + def detect_batch(self, images, num_faces: int, score_thresh: float, variant: str): + comfy.model_management.load_model_gpu(self.patchers[variant]) + return self.models[variant].detect_batch(images, num_faces=num_faces, score_thresh=score_thresh) + + +def _image_to_uint8(image: torch.Tensor) -> np.ndarray: + return image[..., :3].mul(255.0).add_(0.5).clamp_(0, 255).to(torch.uint8).cpu().numpy() + + +def _parse_color(color: str) -> tuple[int, int, int]: + try: + return ImageColor.getrgb(color)[:3] + except ValueError: + return (0, 255, 0) + + +def _copy_face(face: dict) -> dict: + """Shallow copy of a face_dict with array-fields cloned so callers can mutate.""" + return { + "bbox_xyxy": face["bbox_xyxy"].copy(), + "blendshapes": 
dict(face["blendshapes"]), + "landmarks_xy": face["landmarks_xy"].copy(), + "landmarks_3d": face["landmarks_3d"].copy(), + "presence": face["presence"], + "score": face["score"], + } + + +def _lerp_face(a: dict, b: dict, t: float) -> dict: + return { + "bbox_xyxy": (1 - t) * a["bbox_xyxy"] + t * b["bbox_xyxy"], + "blendshapes": {k: (1 - t) * a["blendshapes"][k] + t * b["blendshapes"][k] for k in a["blendshapes"]}, + "landmarks_xy": (1 - t) * a["landmarks_xy"] + t * b["landmarks_xy"], + "landmarks_3d": (1 - t) * a["landmarks_3d"] + t * b["landmarks_3d"], + "presence": (1 - t) * a["presence"] + t * b["presence"], + "score": (1 - t) * a["score"] + t * b["score"], + } + + +def _match_faces(a: list[dict], b: list[dict]) -> list[tuple[int, int]]: + """Greedy nearest-neighbour pairing of faces between two frames by bbox + centre distance. Unmatched (when counts differ) are dropped.""" + if not a or not b: + return [] + centers_a = np.array([(0.5 * (f["bbox_xyxy"][0] + f["bbox_xyxy"][2]), + 0.5 * (f["bbox_xyxy"][1] + f["bbox_xyxy"][3])) for f in a]) + centers_b = np.array([(0.5 * (f["bbox_xyxy"][0] + f["bbox_xyxy"][2]), + 0.5 * (f["bbox_xyxy"][1] + f["bbox_xyxy"][3])) for f in b]) + dists = np.linalg.norm(centers_a[:, None] - centers_b[None], axis=-1) + pairs: list[tuple[int, int]] = [] + used_a: set[int] = set() + used_b: set[int] = set() + candidates = sorted((dists[ia, ib], ia, ib) for ia in range(len(a)) for ib in range(len(b))) + for _, ia, ib in candidates: + if ia in used_a or ib in used_b: + continue + pairs.append((ia, ib)) + used_a.add(ia) + used_b.add(ib) + return pairs + + +def _fill_missing_frames(frames: list[list[dict]], mode: str) -> None: + """In-place fill empty frame slots from neighbouring detections. Multi-face + aware: pairs faces across bracketing frames by greedy bbox-centre NN. 
+ When counts differ, unmatched faces are dropped from the synthesised frame.""" + if mode == "empty": + return + valid = [i for i, fr in enumerate(frames) if fr] + if not valid: + return # nothing to fill from + if mode == "previous": + last: list[dict] = [] + for i, fr in enumerate(frames): + if fr: + last = fr + elif last: + frames[i] = [_copy_face(f) for f in last] + return + # interpolate: lerp between bracketing valid frames; clamp at ends. + for i in range(len(frames)): + if frames[i]: + continue + prev_i = max((v for v in valid if v < i), default=None) + next_i = min((v for v in valid if v > i), default=None) + if prev_i is None: + frames[i] = [_copy_face(f) for f in frames[next_i]] + elif next_i is None: + frames[i] = [_copy_face(f) for f in frames[prev_i]] + else: + t = (i - prev_i) / (next_i - prev_i) + pairs = _match_faces(frames[prev_i], frames[next_i]) + frames[i] = [_lerp_face(frames[prev_i][a], frames[next_i][b], t) for a, b in pairs] + + +def _ordered_rings(edges: frozenset[tuple[int, int]]) -> list[list[int]]: + """Walk an unordered edge set into one or more closed-loop vertex rings + (handles multi-loop sets like FACEMESH_LIPS: outer + inner).""" + adj: dict[int, set[int]] = {} + for a, b in edges: + adj.setdefault(a, set()).add(b) + adj.setdefault(b, set()).add(a) + visited: set[int] = set() + rings: list[list[int]] = [] + for start in adj: + if start in visited: + continue + ring = [start] + visited.add(start) + prev, cur = -1, start + while True: + nxt = next((v for v in adj[cur] if v != prev), None) + if nxt is None or nxt == start: + break + ring.append(nxt) + visited.add(nxt) + prev, cur = cur, nxt + rings.append(ring) + return rings + + +class LoadMediaPipeFaceLandmarker(io.ComfyNode): + """Load MediaPipe Face Landmarker v2 weights. 
Contains both detector variants + (short / full), shared mesh, blendshapes, and canonical geometry.""" + + @classmethod + def define_schema(cls): + return io.Schema( + node_id="LoadMediaPipeFaceLandmarker", + search_aliases=["face", "facial", "mediapipe", "face landmark", "face mesh", "blazeface", "face detection"], + display_name="Load Face Detection Model (MediaPipe)", + category="model/loaders", + inputs=[ + io.Combo.Input("model_name", options=folder_paths.get_filename_list("detection"), + tooltip="Face detection model from models/detection/."), + ], + outputs=[FaceDetectionType.Output()], + ) + + @classmethod + def execute(cls, model_name) -> io.NodeOutput: + sd = comfy.utils.load_torch_file(folder_paths.get_full_path_or_raise("detection", model_name), safe_load=True) + wrapper = FaceLandmarkerModel(sd) + return io.NodeOutput(wrapper) + + +# Per-frame fallback modes for detection failures in a batch. +_FALLBACK_MODES = ("empty", "previous", "interpolate") + + +class MediaPipeFaceLandmarker(io.ComfyNode): + """BlazeFace → FaceMesh v2 → ARKit-52 blendshapes, batched across the + input. Also emits a BOUNDING_BOX list (landmark-extent bbox per face) — + pair with DrawBBoxes for detector-only viz or MediaPipeFaceMeshVisualize + for the mesh overlay.""" + + @classmethod + def define_schema(cls): + return io.Schema( + node_id="MediaPipeFaceLandmarker", + search_aliases=["face", "facial", "mediapipe", "face landmark", "face mesh", "blazeface", "face detection"], + display_name="Detect Face Landmarks (MediaPipe)", + category="image/detection", + description="Detects facial landmarks using MediaPipe model.", + inputs=[ + FaceDetectionType.Input("face_detection_model"), + io.Image.Input("image"), + io.Combo.Input("detector_variant", options=["short", "full", "both"], default="short", + tooltip="Face detector range. 'short' is tuned for close-up faces " + "(within ~2 m of the camera); 'full' covers farther / smaller " + "faces (up to ~5 m) but is slower. 
'both' runs both detectors and " + "keeps whichever found more faces per frame (~2× detection cost)."), + io.Int.Input("num_faces", default=1, min=0, max=16, step=1, + tooltip="Maximum faces to return per frame. 0 = no cap (return all detected)."), + io.Float.Input("min_confidence", default=0.5, min=0.0, max=1.0, step=0.01, advanced=True, + tooltip="BlazeFace score threshold. Lower to catch small/occluded faces."), + io.Combo.Input("missing_frame_fallback", options=list(_FALLBACK_MODES), default="empty", advanced=True, + tooltip="Per-frame behaviour when detection fails in a batch. " + "'empty' leaves the frame faceless. 'previous' copies the most recent successful " + "detection. 'interpolate' lerps landmarks/bbox/blendshapes between bracketing " + "successful frames. Multi-face: pairs faces across frames by greedy bbox-centre NN."), + ], + outputs=[ + FaceLandmarksType.Output(display_name="face_landmarks"), + io.BoundingBox.Output("bboxes"), + ], + ) + + @classmethod + def execute(cls, face_detection_model, image, detector_variant, num_faces, min_confidence, + missing_frame_fallback) -> io.NodeOutput: + canonical = face_detection_model.canonical_data + img_np = _image_to_uint8(image) + B, H, W = img_np.shape[:3] + chunk = 16 + is_both = detector_variant == "both" + total_work = 2 * B if is_both else B + pbar = comfy.utils.ProgressBar(total_work) + + def _run(variant: str) -> list[list[dict]]: + res: list[list[dict]] = [] + with tqdm(total=B, desc=f"MediaPipe Face Landmarker ({variant})") as tq: + for i in range(0, B, chunk): + end = min(i + chunk, B) + res.extend(face_detection_model.detect_batch( + [img_np[bi] for bi in range(i, end)], + num_faces=int(num_faces), + score_thresh=float(min_confidence), + variant=variant, + )) + pbar.update_absolute(min(pbar.current + (end - i), total_work)) + tq.update(end - i) + return res + + if is_both: + short_res = _run("short") + full_res = _run("full") + # Per-frame keep whichever found more faces (tie → short). 
+ frames: list[list[dict]] = [ + short_res[bi] if len(short_res[bi]) >= len(full_res[bi]) else full_res[bi] + for bi in range(B) + ] + else: + frames = _run(detector_variant) + _fill_missing_frames(frames, missing_frame_fallback) + bboxes = [] + for per_frame in frames: + per_bb = [] + for f in per_frame: + f["transformation_matrix"] = transformation_matrix_from_detection(f, W, H, canonical) + x1, y1, x2, y2 = (float(v) for v in f["bbox_xyxy"]) + per_bb.append({"x": x1, "y": y1, "width": x2 - x1, "height": y2 - y1, "label": "face", "score": float(f["score"])}) + bboxes.append(per_bb) + return io.NodeOutput({"frames": frames, "image_size": (H, W), + "connection_sets": face_detection_model.connection_sets}, bboxes) + + +# Topology keys unioned by the 'all' connections preset (contour parts + irises + nose). +_ALL_CONNECTION_PARTS: tuple[str, ...] = (*_CONTOUR_PARTS, "irises", "nose") +_CUSTOM_FEATURES: tuple[tuple[str, bool], ...] = ( + ("face_oval", True), + ("lips", True), + ("left_eye", True), + ("right_eye", True), + ("left_eyebrow", True), + ("right_eyebrow", True), + ("irises", True), + ("nose", True), + ("tesselation", False), +) + + +class MediaPipeFaceMeshVisualize(io.ComfyNode): + """Draw a FACEMESH_* subset over an image. Topology travels with the + FACE_LANDMARKS payload (set at detection time).""" + + @classmethod + def define_schema(cls): + return io.Schema( + node_id="MediaPipeFaceMeshVisualize", + search_aliases=["face", "facial", "mediapipe", "face landmark", "face mesh", "blazeface", "face detection", "visualize"], + display_name="Visualize Face Landmarks (MediaPipe)", + category="image/detection", + description="Draws face landmarks mesh on the input image.", + inputs=[ + FaceLandmarksType.Input("face_landmarks"), + io.Image.Input("image", optional=True, tooltip="If not connected, a black canvas will be used."), + io.DynamicCombo.Input( + "connections", + tooltip="'all' = oval+eyes+brows+lips+irises+nose. 
'fill' = solid face_oval polygon (silhouette mask). 'custom' = toggle each feature individually (including 'tesselation', the full 2547-edge wireframe).", + options=[ + io.DynamicCombo.Option("all", []), + io.DynamicCombo.Option("fill", []), + io.DynamicCombo.Option("custom", [ + io.Boolean.Input(feat, default=default, + tooltip=f"Draw the '{feat}' connection set.") + for feat, default in _CUSTOM_FEATURES + ]), + ], + ), + io.Color.Input("color", default="#00ff00"), + io.Int.Input("thickness", default=1, min=0, max=8, step=1, + tooltip="Edge line thickness in pixels. 0 disables edge drawing."), + io.Int.Input("point_size", default=2, min=0, max=16, step=1, + tooltip="Landmark dot radius in pixels. 0 disables point drawing."), + ], + outputs=[io.Image.Output()], + ) + + @classmethod + def execute(cls, face_landmarks, connections, color, thickness, point_size, image=None) -> io.NodeOutput: + sets = face_landmarks["connection_sets"] + sel = connections["connections"] + fill_rings: list[list[int]] | None = None + if sel == "fill": + fill_rings = _ordered_rings(sets["face_oval"]) + edges = frozenset() + elif sel == "custom": + parts = [feat for feat, _ in _CUSTOM_FEATURES if connections.get(feat, False)] + edges = frozenset().union(*(sets[p] for p in parts)) + else: # "all" + edges = frozenset().union(*(sets[p] for p in _ALL_CONNECTION_PARTS)) + rgb, thick, psize = _parse_color(color), int(thickness), int(point_size) + frames = face_landmarks["frames"] + if image is None: + H, W = face_landmarks["image_size"] + img_np = np.zeros((len(frames), H, W, 3), dtype=np.uint8) + else: + img_np = _image_to_uint8(image) + B = img_np.shape[0] + n_frames = len(frames) + pbar = comfy.utils.ProgressBar(B) + out = np.empty_like(img_np) + for bi in range(B): + faces = frames[bi] if bi < n_frames else [] + out[bi] = _draw_mesh(img_np[bi], faces, edges, rgb, thick, psize, fill_rings) + pbar.update_absolute(bi + 1) + return io.NodeOutput(torch.from_numpy(out).to( + 
device=comfy.model_management.intermediate_device(), + dtype=comfy.model_management.intermediate_dtype(), + ).div_(255.0)) + + +def _draw_mesh(image_rgb: np.ndarray, faces: list, edges, + rgb: tuple[int, int, int], thickness: int, + point_size: int, fill_rings: list[list[int]] | None = None) -> np.ndarray: + draw_edges = thickness > 0 and edges + if not faces or (fill_rings is None and not draw_edges and point_size <= 0): + return image_rgb.copy() + pil = Image.fromarray(image_rgb) + draw = ImageDraw.Draw(pil) + r = point_size * 0.5 + if fill_rings is not None: + for f in faces: + lmks = f["landmarks_xy"] + for ring in fill_rings: + draw.polygon([(float(lmks[i, 0]), float(lmks[i, 1])) for i in ring], fill=rgb) + return np.asarray(pil) + for f in faces: + lmks = f["landmarks_xy"] + n = lmks.shape[0] + if draw_edges: + for a, b in edges: + if a < n and b < n: + draw.line([(float(lmks[a, 0]), float(lmks[a, 1])), + (float(lmks[b, 0]), float(lmks[b, 1]))], fill=rgb, width=thickness) + if point_size == 1: + draw.point(lmks.flatten().tolist(), fill=rgb) + elif point_size > 1: + for x, y in lmks: + draw.ellipse((float(x) - r, float(y) - r, float(x) + r, float(y) + r), fill=rgb) + return np.asarray(pil) + + +# Mask region presets — closed-loop topologies only. +_MASK_REGIONS: tuple[str, ...] = ("face_oval", "lips", "left_eye", "right_eye", "irises") +_MASK_CUSTOM_FEATURES: tuple[tuple[str, bool], ...] = ( + ("face_oval", True), + ("lips", False), + ("left_eye", False), + ("right_eye", False), + ("irises", False), +) + + +class MediaPipeFaceMask(io.ComfyNode): + """Binary mask from face landmarks, filled polygon per face. 
One mask per + frame in the batch; faces in the same frame composite (union).""" + + @classmethod + def define_schema(cls): + return io.Schema( + node_id="MediaPipeFaceMask", + search_aliases=["face", "facial", "mediapipe", "face mask", "blazeface", "face detection", "visualize"], + display_name="Draw Face Mask (MediaPipe)", + category="image/detection", + description="Draws a mask from face landmarks.", + inputs=[ + FaceLandmarksType.Input("face_landmarks"), + io.DynamicCombo.Input( + "regions", + tooltip="'all' = union of face_oval+lips+eyes+irises (which collapses to face_oval since it encloses the rest). 'custom' = toggle each region individually for combos like lips+eyes.", + options=[ + io.DynamicCombo.Option("all", []), + io.DynamicCombo.Option("custom", [ + io.Boolean.Input(reg, default=default, + tooltip=f"Include the '{reg}' region in the mask.") + for reg, default in _MASK_CUSTOM_FEATURES + ]), + ], + ), + ], + outputs=[io.Mask.Output()], + ) + + @classmethod + def execute(cls, face_landmarks, regions) -> io.NodeOutput: + sets = face_landmarks["connection_sets"] + sel = regions["regions"] + if sel == "custom": + picked = [reg for reg, _ in _MASK_CUSTOM_FEATURES if regions.get(reg, False)] + else: + picked = list(_MASK_REGIONS) + rings = [r for reg in picked for r in _ordered_rings(sets[reg])] + frames = face_landmarks["frames"] + H, W = face_landmarks["image_size"] + masks = np.zeros((len(frames), H, W), dtype=np.uint8) + pbar = comfy.utils.ProgressBar(len(frames)) + for bi, per_frame in enumerate(frames): + if per_frame: + pil = Image.new("L", (W, H), 0) + draw = ImageDraw.Draw(pil) + for f in per_frame: + lmks = f["landmarks_xy"] + for ring in rings: + draw.polygon([(float(lmks[i, 0]), float(lmks[i, 1])) for i in ring], fill=255) + masks[bi] = np.asarray(pil) + pbar.update_absolute(bi + 1) + return io.NodeOutput(torch.from_numpy(masks).to( + device=comfy.model_management.intermediate_device(), + dtype=comfy.model_management.intermediate_dtype(), + 
).div_(255.0)) + + +class MediaPipeFaceExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [LoadMediaPipeFaceLandmarker, MediaPipeFaceLandmarker, MediaPipeFaceMeshVisualize, MediaPipeFaceMask] + + +async def comfy_entrypoint() -> MediaPipeFaceExtension: + return MediaPipeFaceExtension() diff --git a/comfy_extras/nodes_mochi.py b/comfy_extras/nodes_mochi.py index d750194fc..3dcea6ab3 100644 --- a/comfy_extras/nodes_mochi.py +++ b/comfy_extras/nodes_mochi.py @@ -10,7 +10,7 @@ class EmptyMochiLatentVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="EmptyMochiLatentVideo", - category="latent/video", + category="model/latent/video", inputs=[ io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16), io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), diff --git a/comfy_extras/nodes_model_downscale.py b/comfy_extras/nodes_model_downscale.py index 24d47a903..817542452 100644 --- a/comfy_extras/nodes_model_downscale.py +++ b/comfy_extras/nodes_model_downscale.py @@ -10,7 +10,7 @@ class PatchModelAddDownscale(io.ComfyNode): return io.Schema( node_id="PatchModelAddDownscale", display_name="PatchModelAddDownscale (Kohya Deep Shrink)", - category="model_patches/unet", + category="model/patch/unet", inputs=[ io.Model.Input("model"), io.Int.Input("block_number", default=3, min=1, max=32, step=1, advanced=True), diff --git a/comfy_extras/nodes_model_patch.py b/comfy_extras/nodes_model_patch.py index 748559a6b..bdccbf8c4 100644 --- a/comfy_extras/nodes_model_patch.py +++ b/comfy_extras/nodes_model_patch.py @@ -548,7 +548,7 @@ class USOStyleReference: FUNCTION = "apply_patch" EXPERIMENTAL = True - CATEGORY = "advanced/model_patches/flux" + CATEGORY = "model/patch/flux" def apply_patch(self, model, model_patch, clip_vision_output): encoded_image = torch.stack((clip_vision_output.all_hidden_states[:, -20], clip_vision_output.all_hidden_states[:, -11], 
clip_vision_output.penultimate_hidden_states)) @@ -594,7 +594,7 @@ class SUPIRApply(io.ComfyNode): def define_schema(cls) -> io.Schema: return io.Schema( node_id="SUPIRApply", - category="model_patches/supir", + category="model/patch/supir", is_experimental=True, inputs=[ io.Model.Input("model"), diff --git a/comfy_extras/nodes_moge.py b/comfy_extras/nodes_moge.py index d9a08ebc7..422949531 100644 --- a/comfy_extras/nodes_moge.py +++ b/comfy_extras/nodes_moge.py @@ -1,6 +1,5 @@ """ComfyUI nodes for the native MoGe (Monocular Geometry Estimation) integration.""" -from __future__ import annotations import torch @@ -79,7 +78,7 @@ class LoadMoGeModel(io.ComfyNode): return io.Schema( node_id="LoadMoGeModel", display_name="Load MoGe Model", - category="loaders", + category="model/loaders", inputs=[ io.Combo.Input("model_name", options=folder_paths.get_filename_list("geometry_estimation")), ], @@ -103,8 +102,10 @@ class MoGePanoramaInference(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="MoGePanoramaInference", - display_name="MoGe Panorama Inference", - category="image/geometry_estimation", + search_aliases=["moge", "panorama", "depth", "geometry", "depth estimation", "geometry estimation"], + display_name="Run MoGe Panorama Inference", + category="image/geometry estimation", + description="Run MoGe on an equirectangular panorama by splitting it into 12 perspective views, running inference on each, and merging the results into a single depth map.", inputs=[ MoGeModelType.Input("moge_model"), io.Image.Input("image", tooltip="Equirectangular panorama (any aspect)."), @@ -222,8 +223,10 @@ class MoGeInference(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="MoGeInference", - display_name="MoGe Inference", - category="image/geometry_estimation", + search_aliases=["moge", "depth", "geometry", "depth estimation", "geometry estimation"], + display_name="Run MoGe Inference", + description="Run MoGe on a single image to estimate depth and 
geometry.", + category="image/geometry estimation", inputs=[ MoGeModelType.Input("moge_model"), io.Image.Input("image"), @@ -277,8 +280,10 @@ class MoGeRender(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="MoGeRender", - display_name="MoGe Render", - category="image/geometry_estimation", + search_aliases=["moge", "render", "geometry", "depth", "normal"], + display_name="Render MoGe Geometry", + description="Render a depth map or normal map from geometry data", + category="image/geometry estimation", inputs=[ MoGeGeometry.Input("moge_geometry"), io.Combo.Input("output", options=["depth", "depth_colored", "normal_opengl", "normal_directx", "mask"], default="depth", @@ -342,8 +347,10 @@ class MoGePointMapToMesh(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="MoGePointMapToMesh", - display_name="MoGe Point Map to Mesh", - category="image/geometry_estimation", + search_aliases=["moge", "mesh", "geometry", "point map"], + display_name="Convert MoGe Point Map to Mesh", + description="Convert a MoGe point map into a 3D mesh.", + category="image/geometry estimation", inputs=[ MoGeGeometry.Input("moge_geometry"), io.Int.Input("batch_index", default=0, min=0, max=4096, diff --git a/comfy_extras/nodes_multigpu.py b/comfy_extras/nodes_multigpu.py new file mode 100644 index 000000000..d2f6fe67a --- /dev/null +++ b/comfy_extras/nodes_multigpu.py @@ -0,0 +1,408 @@ +from __future__ import annotations + +import copy +import logging +from inspect import cleandoc +from typing import TYPE_CHECKING +from typing_extensions import override + +from comfy_api.latest import ComfyExtension, io + +if TYPE_CHECKING: + from comfy.model_patcher import ModelPatcher + from comfy.sd import CLIP, VAE +import torch + +import comfy.model_management +import comfy.multigpu + + +class MultiGPUCFGSplitNode(io.ComfyNode): + """ + Prepares model to have sampling accelerated via splitting work units. 
+ + Should be placed after nodes that modify the model object itself, such as compile or attention-switch nodes. + + Other than those exceptions, this node can be placed in any order. + """ + + @classmethod + def define_schema(cls): + return io.Schema( + node_id="MultiGPU_WorkUnits", + display_name="MultiGPU CFG Split", + category="advanced/multigpu", + description=cleandoc(cls.__doc__), + inputs=[ + io.Model.Input("model"), + io.Int.Input("max_gpus", default=2, min=1, step=1), + ], + outputs=[ + io.Model.Output(), + ], + ) + + @classmethod + def execute(cls, model: ModelPatcher, max_gpus: int) -> io.NodeOutput: + model = comfy.multigpu.create_multigpu_deepclones(model, max_gpus, reuse_loaded=True) + return io.NodeOutput(model) + + +def _force_supported_compute_dtype(patcher: ModelPatcher, device: torch.device): + """Cast compute dtype to one the device supports; no-op if already supported.""" + weight_dtype = patcher.model_dtype() + cast_dtype = comfy.model_management.unet_manual_cast(weight_dtype, device) + if cast_dtype is None: + return + logging.info(f"Select Model Device: using {cast_dtype} compute dtype on {device} (model weight dtype was {weight_dtype}).") + patcher.set_model_compute_dtype(cast_dtype) + + +def _remember_base_devices(patcher: ModelPatcher): + """Stash the original load/offload device on the underlying model. + + Stored on patcher.model (which is shared with the input patcher), so + later "default" selections can recover the loader's original routing. + Only the first Select on a given chain writes these attrs; subsequent + deepclones inherit them onto their freshly-loaded model below. 
+ """ + if not hasattr(patcher.model, "_select_base_load_device"): + patcher.model._select_base_load_device = patcher.load_device + patcher.model._select_base_offload_device = patcher.offload_device + + +def _propagate_base_devices(src_model, dst_model): + """Carry the loader-original device attrs onto the freshly-deepcloned model.""" + if hasattr(src_model, "_select_base_load_device") and not hasattr(dst_model, "_select_base_load_device"): + dst_model._select_base_load_device = src_model._select_base_load_device + dst_model._select_base_offload_device = src_model._select_base_offload_device + + +def _retarget_patcher(patcher: ModelPatcher, target_load_device, target_offload_device): + """Return a patcher whose actual model weights live on *target_load_device*. + + If *patcher* is already on *target_load_device* we just retarget the + (already-cloned) patcher's metadata in place. Otherwise we call + :meth:`ModelPatcher.deepclone_multigpu` to spawn a fresh model from + the loader's ``cached_patcher_init`` factory -- the only safe way to + move weights that may already be partially loaded onto another device. + + NOTE: reusing the input patcher's model when the requested device + matches its current load_device is a deliberate fast path. Anything + that has already mutated the original model (e.g. a prior KSampler + invocation on the same model) will be observed here. This is by + design and documented on the SelectXDeviceNode docstrings -- placing + Select X Device after a node that consumes the same model is not + recommended. + """ + if patcher.load_device == target_load_device: + # Fast path: weights already on the desired device, just update offload. 
+ patcher.offload_device = target_offload_device + return patcher + src_model = patcher.model + patcher = patcher.deepclone_multigpu(new_load_device=target_load_device) + patcher.offload_device = target_offload_device + _propagate_base_devices(src_model, patcher.model) + if hasattr(patcher, "register_load_device"): + patcher.register_load_device(patcher.load_device) + return patcher + + +def _apply_patcher_device(patcher: ModelPatcher, resolved, base_offload_override=None): + """Resolve the requested device and produce a patcher routed there. + + For "default" we restore the loader's original load/offload pair. + For CPU we pin both load and offload to CPU (and, on a dynamic + patcher, downgrade to a plain ModelPatcher so the dynamic-only + code paths are bypassed). + For an explicit GPU we keep the loader's original offload but + target the requested load device; if that differs from the current + load device the patcher is deepcloned onto the new device. + """ + _remember_base_devices(patcher) + base_load = patcher.model._select_base_load_device + base_offload = base_offload_override if base_offload_override is not None else patcher.model._select_base_offload_device + + if resolved is None: + # "default" -> route back to the loader's original devices. + return _retarget_patcher(patcher, base_load, base_offload) + if resolved.type == "cpu": + if patcher.is_dynamic(): + # clone(disable_dynamic=True) requires cached_patcher_init; let the + # exception surface to the caller (Select*DeviceNode.execute), which + # will translate it into a passthrough+log so unsupported loaders + # don't hard-fail the workflow. + patcher = patcher.clone(disable_dynamic=True) + patcher.load_device = resolved + patcher.offload_device = resolved + return patcher + return _retarget_patcher(patcher, resolved, base_offload) + + +def _prune_multigpu_collision(model: ModelPatcher, primary_device): + """Drop any multigpu clone whose load_device matches *primary_device*. 
+ + Without pruning, MultiGPU CFG Split would have stacked a clone on + the same device the primary now occupies (i.e. the workflow places + MultiGPU CFG Split before Select Model Device). Keeps the clone set + consistent with the new primary placement. + """ + multigpu_models = model.get_additional_models_with_key("multigpu") + if not multigpu_models: + return + filtered = [m for m in multigpu_models if m.load_device != primary_device] + if len(filtered) != len(multigpu_models): + logging.info(f"Select Model Device: pruning MultiGPU clone on {primary_device} that now collides with the primary model.") + model.set_additional_models("multigpu", filtered) + if hasattr(model, "match_multigpu_clones"): + model.match_multigpu_clones() + + +class SelectModelDeviceNode(io.ComfyNode): + """ + Place the diffusion model on a specific device (default / cpu / gpu:N). + + - "default" restores the device assigned by the loader (even after a + prior Select Model Device call). + - "cpu" pins both the load and offload device to CPU. + - "gpu:N" pins the load device to the Nth available GPU; the offload + device is restored to the loader's original choice. + + When the requested device differs from the device the input model is + already on, a fresh model is spawned via the loader's reload factory + (cached_patcher_init) so the new patcher owns independent weights on + the new device. Loaders that don't support multigpu (no factory) will + cause the node to pass through unchanged with a warning. + + If the workflow already has MultiGPU CFG Split applied and the chosen + GPU collides with one of the existing multigpu clones, that clone is + dropped so two patchers don't end up bound to the same device. + + When the selected device does not exist on the current machine + (e.g. a workflow built on a 2-GPU box opened on a 1-GPU box), + the node passes the model through unchanged and logs a message + instead of failing. 
# NOTE: the class docstring below is passed through cleandoc() and used as
# the node description, so its text is user-facing.
class SelectCLIPDeviceNode(io.ComfyNode):
    """
    Place the CLIP text encoder on a specific device (default / cpu / gpu:N).

    - "default" restores the device assigned by the loader.
    - "cpu" pins both the load and offload device to CPU.
    - "gpu:N" pins the load device to the Nth available GPU.

    When the selected device does not exist on the current machine
    (e.g. a workflow built on a 2-GPU box opened on a 1-GPU box),
    the node passes the CLIP through unchanged and logs a message
    instead of failing.
    """

    @classmethod
    def define_schema(cls):
        """Describe the node: a CLIP input, a device combo, and one CLIP output."""
        device_choices = comfy.model_management.get_gpu_device_options()
        return io.Schema(
            node_id="SelectCLIPDevice",
            display_name="Select CLIP Device",
            category="advanced/multigpu",
            description=cleandoc(cls.__doc__),
            inputs=[
                io.Clip.Input("clip"),
                io.Combo.Input("device", options=device_choices),
            ],
            outputs=[
                io.Clip.Output(),
            ],
        )

    @classmethod
    def validate_inputs(cls, device="default"):
        """Accept every *device* value; unresolvable ones are handled in execute()."""
        return True

    @classmethod
    def execute(cls, clip: CLIP, device: str = "default") -> io.NodeOutput:
        """Return a clone of *clip* whose patcher is routed to *device*.

        The clone is returned without retargeting when the device option
        cannot be resolved (and is not the default choice) or when
        ``_apply_patcher_device`` raises ``RuntimeError``.
        """
        routed = clip.clone()
        target = comfy.model_management.resolve_gpu_device_option(device)

        # A None result is only legitimate for the default (or missing) choice.
        unavailable = target is None and device not in (None, "default")
        if unavailable:
            logging.info(f"Select CLIP Device: requested device '{device}' not available, passing through unchanged.")
        else:
            try:
                routed.patcher = _apply_patcher_device(routed.patcher, target)
            except RuntimeError as e:
                logging.warning(f"Select CLIP Device: cannot retarget CLIP, passing through unchanged. ({e})")
        return io.NodeOutput(routed)
a workflow built on a 2-GPU box opened on a 1-GPU box), + the node passes the VAE through unchanged and logs a message + instead of failing. + """ + + @classmethod + def define_schema(cls): + return io.Schema( + node_id="SelectVAEDevice", + display_name="Select VAE Device", + category="advanced/multigpu", + description=cleandoc(cls.__doc__), + inputs=[ + io.Vae.Input("vae"), + io.Combo.Input("device", options=comfy.model_management.get_gpu_device_options_no_cpu()), + ], + outputs=[ + io.Vae.Output(), + ], + ) + + @classmethod + def validate_inputs(cls, device="default"): + return True + + @classmethod + def execute(cls, vae: VAE, device: str = "default") -> io.NodeOutput: + # VAE has no .clone(); shallow-copy the wrapper and clone the patcher + # so we can retarget load/offload device without affecting the input VAE. + vae = copy.copy(vae) + vae.patcher = vae.patcher.clone() + resolved = comfy.model_management.resolve_gpu_device_option(device) + if resolved is None and device not in (None, "default"): + logging.info(f"Select VAE Device: requested device '{device}' not available, passing through unchanged.") + return io.NodeOutput(vae) + if resolved is not None and resolved.type == "cpu": + logging.info("Select VAE Device: CPU is not a supported choice, passing through unchanged.") + return io.NodeOutput(vae) + if not hasattr(vae, "_select_base_device"): + vae._select_base_device = vae.device + try: + vae.patcher = _apply_patcher_device( + vae.patcher, resolved, + base_offload_override=comfy.model_management.vae_offload_device(), + ) + except RuntimeError as e: + logging.warning(f"Select VAE Device: cannot retarget VAE, passing through unchanged. ({e})") + return io.NodeOutput(vae) + # Keep VAE wrapper in sync with whatever model the patcher now owns; + # deepclone_multigpu may have produced a fresh first_stage_model. 
# NOTE: the class docstring below is passed through cleandoc() and used as
# the node description, so its text is user-facing.
class MultiGPUOptionsNode(io.ComfyNode):
    """
    Select the relative speed of GPUs in the special case they have significantly different performance from one another.

    NOTE (not registered yet, see MultiGPUExtension.get_node_list below):
    The output GPUOptionsGroup is plumbed through create_multigpu_deepclones() and stored on
    model.model_options['multigpu_options'] via GPUOptionsGroup.register(), but the cond
    scheduler in comfy/samplers.py (calc_cond_batch_outer_multigpu) does NOT yet consult
    relative_speed when distributing conds across devices; it uses a uniform conds_per_device
    round-robin via next_available_device(). Before re-enabling this node, wire its
    relative_speed into the scheduler (e.g. via comfy.multigpu.load_balance_devices(),
    which already implements the proportional split) so the input actually affects work
    distribution.
    """

    @classmethod
    def define_schema(cls):
        """Describe the node: device index, relative speed, optional upstream group."""
        node_inputs = [
            io.Int.Input("device_index", default=0, min=0, max=64),
            io.Float.Input("relative_speed", default=1.0, min=0.0, step=0.01),
            io.Custom("GPU_OPTIONS").Input("gpu_options", optional=True),
        ]
        node_outputs = [
            io.Custom("GPU_OPTIONS").Output(),
        ]
        return io.Schema(
            node_id="MultiGPU_Options",
            display_name="MultiGPU Options",
            category="advanced/multigpu",
            description=cleandoc(cls.__doc__),
            inputs=node_inputs,
            outputs=node_outputs,
        )

    @classmethod
    def execute(cls, device_index: int, relative_speed: float, gpu_options: comfy.multigpu.GPUOptionsGroup = None) -> io.NodeOutput:
        """Add a GPUOptions entry for *device_index* to an options group.

        A falsy ``gpu_options`` starts a fresh ``GPUOptionsGroup``; otherwise
        the incoming group is cloned first (presumably so the upstream
        group is left untouched -- confirm against GPUOptionsGroup.clone).
        """
        group = gpu_options.clone() if gpu_options else comfy.multigpu.GPUOptionsGroup()
        group.add(comfy.multigpu.GPUOptions(device_index=device_index, relative_speed=relative_speed))
        return io.NodeOutput(group)
""" -from __future__ import annotations import math @@ -21,7 +20,7 @@ class NumberConvertNode(io.ComfyNode): return io.Schema( node_id="ComfyNumberConvert", display_name="Convert Number", - category="utils", + category="utilities", search_aliases=[ "int to float", "float to int", "number convert", "int2float", "float2int", "cast", "parse number", diff --git a/comfy_extras/nodes_optimalsteps.py b/comfy_extras/nodes_optimalsteps.py index 5beeaa7db..19629790f 100644 --- a/comfy_extras/nodes_optimalsteps.py +++ b/comfy_extras/nodes_optimalsteps.py @@ -31,7 +31,7 @@ class OptimalStepsScheduler(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="OptimalStepsScheduler", - category="sampling/schedulers", + category="model/sampling/schedulers", inputs=[ io.Combo.Input("model_type", options=["FLUX", "Wan", "Chroma"]), io.Int.Input("steps", default=20, min=3, max=1000), diff --git a/comfy_extras/nodes_pag.py b/comfy_extras/nodes_pag.py index 79fea5f0c..c875e1e06 100644 --- a/comfy_extras/nodes_pag.py +++ b/comfy_extras/nodes_pag.py @@ -15,7 +15,7 @@ class PerturbedAttentionGuidance(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="PerturbedAttentionGuidance", - category="model_patches/unet", + category="model/patch/unet", inputs=[ io.Model.Input("model"), io.Float.Input("scale", default=3.0, min=0.0, max=100.0, step=0.01, round=0.01), diff --git a/comfy_extras/nodes_painter.py b/comfy_extras/nodes_painter.py index e104c8480..df7a0b76a 100644 --- a/comfy_extras/nodes_painter.py +++ b/comfy_extras/nodes_painter.py @@ -1,5 +1,3 @@ -from __future__ import annotations - import hashlib import os diff --git a/comfy_extras/nodes_pid.py b/comfy_extras/nodes_pid.py new file mode 100644 index 000000000..811b9ae8e --- /dev/null +++ b/comfy_extras/nodes_pid.py @@ -0,0 +1,55 @@ +"""PiD (Pixel Diffusion Decoder) node""" + +import torch +from typing_extensions import override + +import node_helpers +import comfy.latent_formats +from comfy_api.latest import 
# Node that attaches a processed latent and a degrade_sigma tensor to a
# CONDITIONING input for PiD decoding/upscaling.
class PiDConditioning(io.ComfyNode):
    @classmethod
    def define_schema(cls) -> io.Schema:
        """Describe the node's inputs (conditioning, latent, format, sigma) and output."""
        node_inputs = [
            io.Conditioning.Input("positive"),
            io.Latent.Input("latent", tooltip="latent (from VAEEncode or a KSampler)."),
            io.Combo.Input(
                "latent_format",
                options=["flux", "sd3"],
                default="flux",
                tooltip="Flux1 and Flux2 latents auto-detected from channel dim, sd3 has to be selected manually.",
            ),
            io.Float.Input(
                "degrade_sigma",
                default=0.0,
                min=0.0,
                max=1.0,
                step=0.01,
                tooltip="0 = clean latent. Increase to denoise corrupted latent outputs.",
            ),
        ]
        return io.Schema(
            node_id="PiDConditioning",
            display_name="PiD Conditioning",
            category="advanced/conditioning",
            description="Attaches a latent and a degrade_sigma scalar to a CONDITIONING for PiD decoding/upscaling",
            inputs=node_inputs,
            outputs=[io.Conditioning.Output()],
        )

    @classmethod
    def execute(cls, positive, latent, latent_format: str, degrade_sigma: float) -> io.NodeOutput:
        """Return *positive* with ``lq_latent`` and ``degrade_sigma`` values set.

        The latent format class is chosen from *latent_format*; for "flux"
        a size of 128 on dimension 1 of the samples selects Flux2, anything
        else selects Flux. Every other *latent_format* value selects SD3.
        """
        samples = latent["samples"]

        if latent_format != "flux":
            format_cls = comfy.latent_formats.SD3
        elif samples.shape[1] == 128:
            format_cls = comfy.latent_formats.Flux2
        else:
            format_cls = comfy.latent_formats.Flux

        extra_values = {
            "lq_latent": format_cls().process_in(samples),
            # One-element float32 tensor holding the sigma value.
            "degrade_sigma": torch.tensor([float(degrade_sigma)], dtype=torch.float32),
        }
        updated = node_helpers.conditioning_set_values(positive, extra_values)
        return io.NodeOutput(updated)
latents"], display_name="Batch Latents", - category="latent", + category="model/latent", inputs=[ io.Autogrow.Input("latents", template=autogrow_template) ], diff --git a/comfy_extras/nodes_preview_any.py b/comfy_extras/nodes_preview_any.py index 17e25d514..1070a69d0 100644 --- a/comfy_extras/nodes_preview_any.py +++ b/comfy_extras/nodes_preview_any.py @@ -16,7 +16,7 @@ class PreviewAny(): FUNCTION = "main" OUTPUT_NODE = True - CATEGORY = "utils" + CATEGORY = "utilities" SEARCH_ALIASES = ["show output", "inspect", "debug", "print value", "show text"] def main(self, source=None): diff --git a/comfy_extras/nodes_primitive.py b/comfy_extras/nodes_primitive.py index 33373266b..c44b09098 100644 --- a/comfy_extras/nodes_primitive.py +++ b/comfy_extras/nodes_primitive.py @@ -11,7 +11,7 @@ class String(io.ComfyNode): node_id="PrimitiveString", search_aliases=["text", "string", "text box", "prompt"], display_name="Text String", - category="utils/primitive", + category="utilities/primitive", inputs=[ io.String.Input("value"), ], @@ -30,7 +30,7 @@ class StringMultiline(io.ComfyNode): node_id="PrimitiveStringMultiline", search_aliases=["text", "string", "text multiline", "string multiline", "text box", "prompt"], display_name="Text String (Multiline)", - category="utils/primitive", + category="utilities/primitive", essentials_category="Basics", inputs=[ io.String.Input("value", multiline=True), @@ -49,7 +49,7 @@ class Int(io.ComfyNode): return io.Schema( node_id="PrimitiveInt", display_name="Int", - category="utils/primitive", + category="utilities/primitive", inputs=[ io.Int.Input("value", min=-sys.maxsize, max=sys.maxsize, control_after_generate=io.ControlAfterGenerate.fixed), ], @@ -67,7 +67,7 @@ class Float(io.ComfyNode): return io.Schema( node_id="PrimitiveFloat", display_name="Float", - category="utils/primitive", + category="utilities/primitive", inputs=[ io.Float.Input("value", min=-sys.maxsize, max=sys.maxsize, step=0.1), ], @@ -85,7 +85,7 @@ class 
Boolean(io.ComfyNode): return io.Schema( node_id="PrimitiveBoolean", display_name="Boolean", - category="utils/primitive", + category="utilities/primitive", inputs=[ io.Boolean.Input("value"), ], diff --git a/comfy_extras/nodes_qwen.py b/comfy_extras/nodes_qwen.py index fde8fac9a..5b92814a4 100644 --- a/comfy_extras/nodes_qwen.py +++ b/comfy_extras/nodes_qwen.py @@ -112,7 +112,7 @@ class EmptyQwenImageLayeredLatentImage(io.ComfyNode): return io.Schema( node_id="EmptyQwenImageLayeredLatentImage", display_name="Empty Qwen Image Layered Latent", - category="latent/qwen", + category="model/latent/qwen", inputs=[ io.Int.Input("width", default=640, min=16, max=nodes.MAX_RESOLUTION, step=16), io.Int.Input("height", default=640, min=16, max=nodes.MAX_RESOLUTION, step=16), diff --git a/comfy_extras/nodes_rebatch.py b/comfy_extras/nodes_rebatch.py index 5f4e82aef..2185385f0 100644 --- a/comfy_extras/nodes_rebatch.py +++ b/comfy_extras/nodes_rebatch.py @@ -10,7 +10,7 @@ class LatentRebatch(io.ComfyNode): return io.Schema( node_id="RebatchLatents", display_name="Rebatch Latents", - category="latent/batch", + category="model/latent/batch", is_input_list=True, inputs=[ io.Latent.Input("latents"), diff --git a/comfy_extras/nodes_resolution.py b/comfy_extras/nodes_resolution.py index 520b4067e..dc405291c 100644 --- a/comfy_extras/nodes_resolution.py +++ b/comfy_extras/nodes_resolution.py @@ -1,4 +1,3 @@ -from __future__ import annotations import math from enum import Enum from typing_extensions import override @@ -36,7 +35,7 @@ class ResolutionSelector(io.ComfyNode): return io.Schema( node_id="ResolutionSelector", display_name="Resolution Selector", - category="utils", + category="utilities", description="Calculate width and height from aspect ratio and megapixel target. 
Useful for setting up Empty Latent Image dimensions.", inputs=[ io.Combo.Input( diff --git a/comfy_extras/nodes_rope.py b/comfy_extras/nodes_rope.py index 918ddc02b..808eee29b 100644 --- a/comfy_extras/nodes_rope.py +++ b/comfy_extras/nodes_rope.py @@ -7,7 +7,7 @@ class ScaleROPE(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="ScaleROPE", - category="advanced/model_patches", + category="model/patch", description="Scale and shift the ROPE of the model.", is_experimental=True, inputs=[ diff --git a/comfy_extras/nodes_sd3.py b/comfy_extras/nodes_sd3.py index 6655c1ba7..38cbf117b 100644 --- a/comfy_extras/nodes_sd3.py +++ b/comfy_extras/nodes_sd3.py @@ -41,7 +41,7 @@ class EmptySD3LatentImage(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="EmptySD3LatentImage", - category="latent/sd3", + category="model/latent/sd3", inputs=[ io.Int.Input("width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16), io.Int.Input("height", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16), @@ -113,7 +113,7 @@ class ControlNetApplySD3(io.ComfyNode): return io.Schema( node_id="ControlNetApplySD3", display_name="Apply Controlnet with VAE", - category="conditioning/controlnet", + category="model/conditioning/controlnet", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), diff --git a/comfy_extras/nodes_sdupscale.py b/comfy_extras/nodes_sdupscale.py index 5877719d3..ea283e971 100644 --- a/comfy_extras/nodes_sdupscale.py +++ b/comfy_extras/nodes_sdupscale.py @@ -9,7 +9,7 @@ class SD_4XUpscale_Conditioning(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SD_4XUpscale_Conditioning", - category="conditioning/upscale_diffusion", + category="model/conditioning/upscale_diffusion", inputs=[ io.Image.Input("images"), io.Conditioning.Input("positive"), diff --git a/comfy_extras/nodes_stable3d.py b/comfy_extras/nodes_stable3d.py index 829c837a1..8a6e5b726 100644 --- a/comfy_extras/nodes_stable3d.py +++ 
b/comfy_extras/nodes_stable3d.py @@ -27,7 +27,7 @@ class StableZero123_Conditioning(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="StableZero123_Conditioning", - category="conditioning/3d_models", + category="model/conditioning/3d_models", inputs=[ io.ClipVision.Input("clip_vision"), io.Image.Input("init_image"), @@ -65,7 +65,7 @@ class StableZero123_Conditioning_Batched(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="StableZero123_Conditioning_Batched", - category="conditioning/3d_models", + category="model/conditioning/3d_models", inputs=[ io.ClipVision.Input("clip_vision"), io.Image.Input("init_image"), @@ -112,7 +112,7 @@ class SV3D_Conditioning(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SV3D_Conditioning", - category="conditioning/3d_models", + category="model/conditioning/3d_models", inputs=[ io.ClipVision.Input("clip_vision"), io.Image.Input("init_image"), diff --git a/comfy_extras/nodes_stable_cascade.py b/comfy_extras/nodes_stable_cascade.py index 0dc6c9fcd..e55f248ae 100644 --- a/comfy_extras/nodes_stable_cascade.py +++ b/comfy_extras/nodes_stable_cascade.py @@ -29,7 +29,7 @@ class StableCascade_EmptyLatentImage(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="StableCascade_EmptyLatentImage", - category="latent/stable_cascade", + category="model/latent/stable_cascade", inputs=[ io.Int.Input("width", default=1024, min=256, max=nodes.MAX_RESOLUTION, step=8), io.Int.Input("height", default=1024, min=256, max=nodes.MAX_RESOLUTION, step=8), @@ -58,7 +58,7 @@ class StableCascade_StageC_VAEEncode(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="StableCascade_StageC_VAEEncode", - category="latent/stable_cascade", + category="model/latent/stable_cascade", inputs=[ io.Image.Input("image"), io.Vae.Input("vae"), @@ -93,7 +93,7 @@ class StableCascade_StageB_Conditioning(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="StableCascade_StageB_Conditioning", - 
# Node that renders a str.format template using the connected inputs as
# keyword arguments.
class StringFormat(io.ComfyNode):
    @classmethod
    def define_schema(cls) -> io.Schema:
        """Describe the node: autogrow inputs named a..z plus the template string."""
        slot_names = list(string.ascii_lowercase)
        values_template = io.Autogrow.TemplateNames(
            input=io.AnyType.Input("value"),
            names=slot_names,
            min=0,
        )
        node_inputs = [
            io.Autogrow.Input("values", template=values_template),
            io.String.Input("f_string", default="{a}", multiline=True),
        ]
        return io.Schema(
            node_id="StringFormat",
            display_name="Format Text",
            category="text",
            search_aliases=["string", "format"],
            description="Same as Python's string format method. Supports all of Python's format options and features.",
            inputs=node_inputs,
            outputs=[
                io.String.Output(),
            ],
        )

    @classmethod
    def execute(
        cls, values: io.Autogrow.Type, f_string: str
    ) -> io.NodeOutput:
        """Return *f_string* formatted with *values* expanded as keyword arguments.

        A placeholder naming a key that is absent from *values* raises
        ``KeyError`` from ``str.format``.
        """
        # NOTE(review): str.format replacement fields can reach attributes and
        # items of the supplied objects ({a.attr}, {a[0]}); confirm this is
        # acceptable for templates coming from untrusted workflows.
        rendered = f_string.format(**values)
        return io.NodeOutput(rendered)
max=1.0, step=0.01), diff --git a/comfy_extras/nodes_toolkit.py b/comfy_extras/nodes_toolkit.py index 71faf7226..9f709bbe3 100644 --- a/comfy_extras/nodes_toolkit.py +++ b/comfy_extras/nodes_toolkit.py @@ -1,4 +1,3 @@ -from __future__ import annotations from typing_extensions import override from comfy_api.latest import ComfyExtension, io @@ -14,7 +13,7 @@ class CreateList(io.ComfyNode): return io.Schema( node_id="CreateList", display_name="Create List", - category="logic", + category="utilities", is_input_list=True, search_aliases=["Image Iterator", "Text Iterator", "Iterator"], inputs=[io.Autogrow.Input("inputs", template=template_autogrow)], diff --git a/comfy_extras/nodes_train.py b/comfy_extras/nodes_train.py index e9871369b..046eeaaf5 100644 --- a/comfy_extras/nodes_train.py +++ b/comfy_extras/nodes_train.py @@ -951,7 +951,7 @@ class TrainLoraNode(io.ComfyNode): return io.Schema( node_id="TrainLoraNode", display_name="Train LoRA", - category="training", + category="model/training", is_experimental=True, is_input_list=True, # All inputs become lists inputs=[ @@ -1309,7 +1309,7 @@ class LoraModelLoader(io.ComfyNode): return io.Schema( node_id="LoraModelLoader", display_name="Load LoRA Model", - category="loaders", + category="model/loaders", is_experimental=True, inputs=[ io.Model.Input( @@ -1405,7 +1405,7 @@ class LossGraphNode(io.ComfyNode): node_id="LossGraphNode", search_aliases=["training chart", "training visualization", "plot loss"], display_name="Plot Loss Graph", - category="training", + category="model/training", is_experimental=True, is_output_node=True, inputs=[ diff --git a/comfy_extras/nodes_upscale_model.py b/comfy_extras/nodes_upscale_model.py index d3ee3f1c1..1cf5a5d01 100644 --- a/comfy_extras/nodes_upscale_model.py +++ b/comfy_extras/nodes_upscale_model.py @@ -22,7 +22,7 @@ class UpscaleModelLoader(io.ComfyNode): return io.Schema( node_id="UpscaleModelLoader", display_name="Load Upscale Model", - category="loaders", + 
category="model/loaders", inputs=[ io.Combo.Input("model_name", options=folder_paths.get_filename_list("upscale_models")), ], diff --git a/comfy_extras/nodes_video.py b/comfy_extras/nodes_video.py index 78a2a28f8..ae1d826d5 100644 --- a/comfy_extras/nodes_video.py +++ b/comfy_extras/nodes_video.py @@ -1,5 +1,3 @@ -from __future__ import annotations - import os import av import torch diff --git a/comfy_extras/nodes_video_model.py b/comfy_extras/nodes_video_model.py index 8f19895a1..0d6cae6a8 100644 --- a/comfy_extras/nodes_video_model.py +++ b/comfy_extras/nodes_video_model.py @@ -15,7 +15,7 @@ class ImageOnlyCheckpointLoader: RETURN_TYPES = ("MODEL", "CLIP_VISION", "VAE") FUNCTION = "load_checkpoint" - CATEGORY = "loaders" + CATEGORY = "model/loaders" def load_checkpoint(self, ckpt_name, output_vae=True, output_clip=True): ckpt_path = folder_paths.get_full_path_or_raise("checkpoints", ckpt_name) @@ -41,7 +41,7 @@ class SVD_img2vid_Conditioning: FUNCTION = "encode" - CATEGORY = "conditioning/video_models" + CATEGORY = "model/conditioning/video_models" def encode(self, clip_vision, init_image, vae, width, height, video_frames, motion_bucket_id, fps, augmentation_level): output = clip_vision.encode_image(init_image) @@ -65,7 +65,7 @@ class VideoLinearCFGGuidance: RETURN_TYPES = ("MODEL",) FUNCTION = "patch" - CATEGORY = "sampling/guiders" + CATEGORY = "model/sampling/guiders" def patch(self, model, min_cfg): def linear_cfg(args): @@ -89,7 +89,7 @@ class VideoTriangleCFGGuidance: RETURN_TYPES = ("MODEL",) FUNCTION = "patch" - CATEGORY = "sampling/guiders" + CATEGORY = "model/sampling/guiders" def patch(self, model, min_cfg): def linear_cfg(args): @@ -138,7 +138,7 @@ class ConditioningSetAreaPercentageVideo: RETURN_TYPES = ("CONDITIONING",) FUNCTION = "append" - CATEGORY = "conditioning" + CATEGORY = "model/conditioning" def append(self, conditioning, width, height, temporal, x, y, z, strength): c = node_helpers.conditioning_set_values(conditioning, {"area": 
("percentage", temporal, height, width, z, y, x), diff --git a/comfy_extras/nodes_void.py b/comfy_extras/nodes_void.py index be724371a..b43154b8d 100644 --- a/comfy_extras/nodes_void.py +++ b/comfy_extras/nodes_void.py @@ -58,7 +58,7 @@ class OpticalFlowLoader(io.ComfyNode): return io.Schema( node_id="OpticalFlowLoader", display_name="Load Optical Flow Model", - category="loaders", + category="model/loaders", inputs=[ io.Combo.Input( "model_name", @@ -175,7 +175,7 @@ class VOIDInpaintConditioning(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="VOIDInpaintConditioning", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -288,7 +288,7 @@ class VOIDWarpedNoise(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="VOIDWarpedNoise", - category="latent/video", + category="model/latent/video", inputs=[ OpticalFlow.Input( "optical_flow", @@ -393,7 +393,7 @@ class VOIDWarpedNoiseSource(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="VOIDWarpedNoiseSource", - category="sampling/noise", + category="model/sampling/noise", inputs=[ io.Latent.Input("warped_noise", tooltip="Warped noise latent from VOIDWarpedNoise"), @@ -455,7 +455,7 @@ class VOIDSampler(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="VOIDSampler", - category="sampling/samplers", + category="model/sampling/samplers", inputs=[], outputs=[io.Sampler.Output()], ) diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index e50bfcd2c..67d3a8443 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -18,7 +18,7 @@ class WanImageToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanImageToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -66,7 +66,7 @@ class 
WanFunControlToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanFunControlToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -119,7 +119,7 @@ class Wan22FunControlToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="Wan22FunControlToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -184,7 +184,7 @@ class WanFirstLastFrameToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanFirstLastFrameToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -256,7 +256,7 @@ class WanFunInpaintToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanFunInpaintToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -288,7 +288,7 @@ class WanVaceToVideo(io.ComfyNode): return io.Schema( node_id="WanVaceToVideo", search_aliases=["video conditioning", "video control"], - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -375,7 +375,7 @@ class TrimVideoLatent(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="TrimVideoLatent", - category="latent/video", + category="model/latent/video", inputs=[ io.Latent.Input("samples"), io.Int.Input("trim_amount", default=0, min=0, max=99999), @@ -398,7 +398,7 @@ class WanCameraImageToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanCameraImageToVideo", - category="conditioning/video_models", + 
category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -452,7 +452,7 @@ class WanPhantomSubjectToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanPhantomSubjectToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -707,7 +707,7 @@ class WanTrackToVideo(io.ComfyNode): return io.Schema( node_id="WanTrackToVideo", search_aliases=["motion tracking", "trajectory video", "point tracking", "keypoint animation"], - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -951,7 +951,7 @@ class WanSoundImageToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanSoundImageToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -984,7 +984,7 @@ class WanSoundImageToVideoExtend(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanSoundImageToVideoExtend", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -1046,7 +1046,7 @@ class WanHuMoImageToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanHuMoImageToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -1112,7 +1112,7 @@ class WanAnimateToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanAnimateToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), 
io.Conditioning.Input("negative"), @@ -1252,7 +1252,7 @@ class Wan22ImageToVideoLatent(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="Wan22ImageToVideoLatent", - category="conditioning/inpaint", + category="model/conditioning/inpaint", inputs=[ io.Vae.Input("vae"), io.Int.Input("width", default=1280, min=32, max=nodes.MAX_RESOLUTION, step=32), @@ -1302,7 +1302,7 @@ class WanInfiniteTalkToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanInfiniteTalkToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.DynamicCombo.Input("mode", options=[ io.DynamicCombo.Option("single_speaker", []), @@ -1461,7 +1461,7 @@ class WanSCAILToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanSCAILToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), diff --git a/comfy_extras/nodes_wandancer.py b/comfy_extras/nodes_wandancer.py index fc005ed4c..a96885745 100644 --- a/comfy_extras/nodes_wandancer.py +++ b/comfy_extras/nodes_wandancer.py @@ -713,7 +713,7 @@ class WanDancerEncodeAudio(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanDancerEncodeAudio", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Audio.Input("audio"), io.Int.Input("video_frames", default=149, min=1, max=nodes.MAX_RESOLUTION, step=4), @@ -787,7 +787,7 @@ class WanDancerVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanDancerVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), diff --git a/comfy_extras/nodes_wanmove.py b/comfy_extras/nodes_wanmove.py index 5acae03eb..2db064922 100644 --- a/comfy_extras/nodes_wanmove.py +++ b/comfy_extras/nodes_wanmove.py @@ -247,7 +247,7 @@ 
class WanMoveVisualizeTracks(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanMoveVisualizeTracks", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Image.Input("images"), io.Tracks.Input("tracks", optional=True), @@ -283,7 +283,7 @@ class WanMoveTracksFromCoords(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanMoveTracksFromCoords", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.String.Input("track_coords", force_input=True, default="[]", optional=True), io.Mask.Input("track_mask", optional=True), @@ -325,7 +325,7 @@ class GenerateTracks(io.ComfyNode): return io.Schema( node_id="GenerateTracks", search_aliases=["motion paths", "camera movement", "trajectory"], - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Int.Input("width", default=832, min=16, max=4096, step=16), io.Int.Input("height", default=480, min=16, max=4096, step=16), @@ -434,7 +434,7 @@ class WanMoveConcatTrack(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanMoveConcatTrack", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Tracks.Input("tracks_1"), io.Tracks.Input("tracks_2", optional=True), @@ -463,7 +463,7 @@ class WanMoveTrackToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanMoveTrackToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), diff --git a/comfyui_version.py b/comfyui_version.py index 4c6f5eb2a..0bb0f780c 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. 
-__version__ = "0.21.1" +__version__ = "0.22.0" diff --git a/execution.py b/execution.py index 4c7de2e84..5246d651c 100644 --- a/execution.py +++ b/execution.py @@ -2,6 +2,7 @@ import copy import heapq import inspect import logging +import psutil import sys import threading import time @@ -727,6 +728,7 @@ class PromptExecutor: self._notify_prompt_lifecycle("start", prompt_id) ram_headroom = int(self.cache_args["ram"] * (1024 ** 3)) + ram_inactive_headroom = int(self.cache_args["ram_inactive"] * (1024 ** 3)) ram_release_callback = self.caches.outputs.ram_release if self.cache_type == CacheType.RAM_PRESSURE else None comfy.memory_management.set_ram_cache_release_state(ram_release_callback, ram_headroom) @@ -780,8 +782,14 @@ class PromptExecutor: execution_list.complete_node_execution() if self.cache_type == CacheType.RAM_PRESSURE: - comfy.model_management.free_memory(0, None, pins_required=ram_headroom, ram_required=ram_headroom) - ram_release_callback(ram_headroom, free_active=True) + ram_release_callback(ram_inactive_headroom) + ram_shortfall = ram_headroom - psutil.virtual_memory().available + freed = comfy.model_management.free_pins(ram_shortfall + 512 * (1024 ** 2)) + if freed < ram_shortfall: + if freed > 64 * (1024 ** 2): + # AIMDO MEM_DECOMMIT can outrun psutil.available catching up. 
+ time.sleep(0.05) + ram_release_callback(ram_headroom, free_active=True) else: # Only execute when the while-loop ends without break # Send cached UI for intermediate output nodes that weren't executed diff --git a/folder_paths.py b/folder_paths.py index ad7f0f4fc..7304e1b73 100644 --- a/folder_paths.py +++ b/folder_paths.py @@ -1,5 +1,3 @@ -from __future__ import annotations - import os import time import mimetypes @@ -60,6 +58,8 @@ folder_names_and_paths["geometry_estimation"] = ([os.path.join(models_dir, "geom folder_names_and_paths["optical_flow"] = ([os.path.join(models_dir, "optical_flow")], supported_pt_extensions) +folder_names_and_paths["detection"] = ([os.path.join(models_dir, "detection")], supported_pt_extensions) + output_directory = os.path.join(base_path, "output") temp_directory = os.path.join(base_path, "temp") input_directory = os.path.join(base_path, "input") diff --git a/main.py b/main.py index a6fdaf43c..bce451a83 100644 --- a/main.py +++ b/main.py @@ -218,7 +218,7 @@ import comfy.model_patcher if args.enable_dynamic_vram or (enables_dynamic_vram() and comfy.model_management.is_nvidia() and not comfy.model_management.is_wsl()): if (not args.enable_dynamic_vram) and (comfy.model_management.torch_version_numeric < (2, 8)): logging.warning("Unsupported Pytorch detected. DynamicVRAM support requires Pytorch version 2.8 or later. Falling back to legacy ModelPatcher. 
VRAM estimates may be unreliable especially on Windows") - elif comfy_aimdo.control.init_device(comfy.model_management.get_torch_device().index): + elif comfy_aimdo.control.init_devices(d.index for d in comfy.model_management.get_all_torch_devices()): if args.verbose == 'DEBUG': comfy_aimdo.control.set_log_debug() elif args.verbose == 'CRITICAL': @@ -283,19 +283,25 @@ def _collect_output_absolute_paths(history_result: dict) -> list[str]: def prompt_worker(q, server_instance): current_time: float = 0.0 - cache_ram = args.cache_ram - if cache_ram < 0: - cache_ram = min(32.0, max(4.0, comfy.model_management.total_ram * 0.25 / 1024.0)) + cache_ram = 0 + cache_ram_inactive = 0 + if not args.cache_classic and not args.cache_none and args.cache_lru <= 0: + cache_ram = min(10.0, max(2.0, comfy.model_management.total_ram * 0.10 / 1024.0)) + cache_ram_inactive = min(96.0, comfy.model_management.total_ram / 1024.0) + if len(args.cache_ram) > 0: + cache_ram = args.cache_ram[0] + if len(args.cache_ram) > 1: + cache_ram_inactive = args.cache_ram[1] - cache_type = execution.CacheType.CLASSIC - if args.cache_lru > 0: + cache_type = execution.CacheType.RAM_PRESSURE + if args.cache_classic: + cache_type = execution.CacheType.CLASSIC + elif args.cache_lru > 0: cache_type = execution.CacheType.LRU - elif cache_ram > 0: - cache_type = execution.CacheType.RAM_PRESSURE elif args.cache_none: cache_type = execution.CacheType.NONE - e = execution.PromptExecutor(server_instance, cache_type=cache_type, cache_args={ "lru" : args.cache_lru, "ram" : cache_ram } ) + e = execution.PromptExecutor(server_instance, cache_type=cache_type, cache_args={ "lru" : args.cache_lru, "ram" : cache_ram, "ram_inactive" : cache_ram_inactive } ) last_gc_collect = 0 need_gc = False gc_collect_interval = 10.0 @@ -338,9 +344,9 @@ def prompt_worker(q, server_instance): # Log Time in a more readable way after 10 minutes if execution_time > 600: execution_time = time.strftime("%H:%M:%S", time.gmtime(execution_time)) - 
logging.info(f"Prompt executed in {execution_time}") + logging.info(f"Prompt executed in {execution_time}", extra={'color': 'green'}) else: - logging.info("Prompt executed in {:.2f} seconds".format(execution_time)) + logging.info("Prompt executed in {:.2f} seconds".format(execution_time), extra={'color': 'green'}) if not asset_seeder.is_disabled(): paths = _collect_output_absolute_paths(e.history_result) diff --git a/models/detection/put_detection_models_here b/models/detection/put_detection_models_here new file mode 100644 index 000000000..e69de29bb diff --git a/nodes.py b/nodes.py index fdd6eeb5f..528bf316f 100644 --- a/nodes.py +++ b/nodes.py @@ -1,4 +1,3 @@ -from __future__ import annotations import torch @@ -69,7 +68,7 @@ class CLIPTextEncode(ComfyNodeABC): OUTPUT_TOOLTIPS = ("A conditioning containing the embedded text used to guide the diffusion model.",) FUNCTION = "encode" - CATEGORY = "conditioning" + CATEGORY = "model/conditioning" DESCRIPTION = "Encodes a text prompt using a CLIP model into an embedding that can be used to guide the diffusion model towards generating specific images." 
SEARCH_ALIASES = ["text", "prompt", "text prompt", "positive prompt", "negative prompt", "encode text", "text encoder", "encode prompt"] @@ -88,7 +87,7 @@ class ConditioningCombine: RETURN_TYPES = ("CONDITIONING",) FUNCTION = "combine" - CATEGORY = "conditioning" + CATEGORY = "model/conditioning" SEARCH_ALIASES = ["combine", "merge conditioning", "combine prompts", "merge prompts", "mix prompts", "add prompt"] def combine(self, conditioning_1, conditioning_2): @@ -105,7 +104,7 @@ class ConditioningAverage : RETURN_TYPES = ("CONDITIONING",) FUNCTION = "addWeighted" - CATEGORY = "conditioning" + CATEGORY = "model/conditioning" def addWeighted(self, conditioning_to, conditioning_from, conditioning_to_strength): out = [] @@ -144,7 +143,7 @@ class ConditioningConcat: RETURN_TYPES = ("CONDITIONING",) FUNCTION = "concat" - CATEGORY = "conditioning" + CATEGORY = "model/conditioning" def concat(self, conditioning_to, conditioning_from): out = [] @@ -177,7 +176,7 @@ class ConditioningSetArea: RETURN_TYPES = ("CONDITIONING",) FUNCTION = "append" - CATEGORY = "conditioning" + CATEGORY = "model/conditioning" def append(self, conditioning, width, height, x, y, strength): c = node_helpers.conditioning_set_values(conditioning, {"area": (height // 8, width // 8, y // 8, x // 8), @@ -198,7 +197,7 @@ class ConditioningSetAreaPercentage: RETURN_TYPES = ("CONDITIONING",) FUNCTION = "append" - CATEGORY = "conditioning" + CATEGORY = "model/conditioning" def append(self, conditioning, width, height, x, y, strength): c = node_helpers.conditioning_set_values(conditioning, {"area": ("percentage", height, width, y, x), @@ -215,7 +214,7 @@ class ConditioningSetAreaStrength: RETURN_TYPES = ("CONDITIONING",) FUNCTION = "append" - CATEGORY = "conditioning" + CATEGORY = "model/conditioning" def append(self, conditioning, strength): c = node_helpers.conditioning_set_values(conditioning, {"strength": strength}) @@ -235,7 +234,7 @@ class ConditioningSetMask: RETURN_TYPES = ("CONDITIONING",) FUNCTION 
= "append" - CATEGORY = "conditioning" + CATEGORY = "model/conditioning" def append(self, conditioning, mask, set_cond_area, strength): set_area_to_bounds = False @@ -304,7 +303,7 @@ class VAEDecode: OUTPUT_TOOLTIPS = ("The decoded image.",) FUNCTION = "decode" - CATEGORY = "latent" + CATEGORY = "model/latent" DESCRIPTION = "Decodes latent images back into pixel space images." SEARCH_ALIASES = ["decode", "decode latent", "latent to image", "render latent"] @@ -358,7 +357,7 @@ class VAEEncode: RETURN_TYPES = ("LATENT",) FUNCTION = "encode" - CATEGORY = "latent" + CATEGORY = "model/latent" SEARCH_ALIASES = ["encode", "encode image", "image to latent"] def encode(self, vae, pixels): @@ -390,7 +389,7 @@ class VAEEncodeForInpaint: RETURN_TYPES = ("LATENT",) FUNCTION = "encode" - CATEGORY = "latent/inpaint" + CATEGORY = "model/latent/inpaint" def encode(self, vae, pixels, mask, grow_mask_by=6): downscale_ratio = vae.spacial_compression_encode() @@ -439,7 +438,7 @@ class InpaintModelConditioning: RETURN_NAMES = ("positive", "negative", "latent") FUNCTION = "encode" - CATEGORY = "conditioning/inpaint" + CATEGORY = "model/conditioning/inpaint" def encode(self, positive, negative, pixels, vae, mask, noise_mask=True): x = (pixels.shape[1] // 8) * 8 @@ -599,7 +598,7 @@ class CheckpointLoaderSimple: "The VAE model used for encoding and decoding images to and from latent space.") FUNCTION = "load_checkpoint" - CATEGORY = "loaders" + CATEGORY = "model/loaders" DESCRIPTION = "Loads a diffusion model checkpoint, diffusion models are used to denoise latents." 
SEARCH_ALIASES = ["load model", "checkpoint", "model loader", "load checkpoint", "ckpt", "model"] @@ -645,7 +644,7 @@ class unCLIPCheckpointLoader: RETURN_TYPES = ("MODEL", "CLIP", "VAE", "CLIP_VISION") FUNCTION = "load_checkpoint" - CATEGORY = "loaders" + CATEGORY = "model/loaders" def load_checkpoint(self, ckpt_name, output_vae=True, output_clip=True): ckpt_path = folder_paths.get_full_path_or_raise("checkpoints", ckpt_name) @@ -661,7 +660,7 @@ class CLIPSetLastLayer: RETURN_TYPES = ("CLIP",) FUNCTION = "set_last_layer" - CATEGORY = "conditioning" + CATEGORY = "model/conditioning" def set_last_layer(self, clip, stop_at_clip_layer): clip = clip.clone() @@ -690,7 +689,7 @@ class LoraLoader: OUTPUT_TOOLTIPS = ("The modified diffusion model.", "The modified CLIP model.") FUNCTION = "load_lora" - CATEGORY = "loaders" + CATEGORY = "model/loaders" DESCRIPTION = "This LoRA loader is used to modify both diffusion and CLIP models, altering the way in which latents are denoised such as applying styles. Multiple LoRA nodes can be linked together." SEARCH_ALIASES = ["lora", "load lora", "apply lora", "lora loader", "lora model"] @@ -790,11 +789,12 @@ class VAELoader: RETURN_TYPES = ("VAE",) FUNCTION = "load_vae" - CATEGORY = "loaders" + CATEGORY = "model/loaders" #TODO: scale factor? def load_vae(self, vae_name): metadata = None + vae_path = None if vae_name == "pixel_space": sd = {} sd["pixel_space_vae"] = torch.tensor(1.0) @@ -813,6 +813,14 @@ class VAELoader: metadata["tae_latent_channels"] = 128 vae = comfy.sd.VAE(sd=sd, metadata=metadata) vae.throw_exception_if_invalid() + # Register a reload factory on the patcher so multigpu deepclones + # (Select VAE Device, future MultiGPU VAE work-units) can produce + # per-device clones from the same loader context. Only set when we + # actually have a single backing file -- pixel_space and the + # image TAESDs (composed from separate encoder/decoder files via + # load_taesd) are not addressable by a single vae_path. 
+ if vae_path is not None: + vae.patcher.cached_patcher_init = (comfy.sd.load_vae_patcher, (vae_path, metadata, None)) return (vae,) class ControlNetLoader: @@ -823,7 +831,7 @@ class ControlNetLoader: RETURN_TYPES = ("CONTROL_NET",) FUNCTION = "load_controlnet" - CATEGORY = "loaders" + CATEGORY = "model/loaders" SEARCH_ALIASES = ["controlnet", "control net", "cn", "load controlnet", "controlnet loader"] def load_controlnet(self, control_net_name): @@ -842,7 +850,7 @@ class DiffControlNetLoader: RETURN_TYPES = ("CONTROL_NET",) FUNCTION = "load_controlnet" - CATEGORY = "loaders" + CATEGORY = "model/loaders" def load_controlnet(self, model, control_net_name): controlnet_path = folder_paths.get_full_path_or_raise("controlnet", control_net_name) @@ -862,7 +870,7 @@ class ControlNetApply: FUNCTION = "apply_controlnet" DEPRECATED = True - CATEGORY = "conditioning/controlnet" + CATEGORY = "model/conditioning/controlnet" def apply_controlnet(self, conditioning, control_net, image, strength): if strength == 0: @@ -900,7 +908,7 @@ class ControlNetApplyAdvanced: RETURN_NAMES = ("positive", "negative") FUNCTION = "apply_controlnet" - CATEGORY = "conditioning/controlnet" + CATEGORY = "model/conditioning/controlnet" SEARCH_ALIASES = ["controlnet", "apply controlnet", "use controlnet", "control net"] def apply_controlnet(self, positive, negative, control_net, image, strength, start_percent, end_percent, vae=None, extra_concat=[]): @@ -961,7 +969,7 @@ class CLIPLoader: @classmethod def INPUT_TYPES(s): return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), ), - "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image", "cogvideox"], ), + "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", 
"ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image", "cogvideox", "lens", "pixeldit"], ), }, "optional": { "device": (["default", "cpu"], {"advanced": True}), @@ -971,7 +979,7 @@ class CLIPLoader: CATEGORY = "advanced/loaders" - DESCRIPTION = "[Recipes]\n\nstable_diffusion: clip-l\nstable_cascade: clip-g\nsd3: t5 xxl/ clip-g / clip-l\nstable_audio: t5 base\nmochi: t5 xxl\ncogvideox: t5 xxl (226-token padding)\ncosmos: old t5 xxl\nlumina2: gemma 2 2B\nwan: umt5 xxl\n hidream: llama-3.1 (Recommend) or t5\nomnigen2: qwen vl 2.5 3B" + DESCRIPTION = "[Recipes]\n\nstable_diffusion: clip-l\nstable_cascade: clip-g\nsd3: t5 xxl/ clip-g / clip-l\nstable_audio: t5 base\nmochi: t5 xxl\ncogvideox: t5 xxl (226-token padding)\ncosmos: old t5 xxl\nlumina2: gemma 2 2B\nwan: umt5 xxl\n hidream: llama-3.1 (Recommend) or t5\nomnigen2: qwen vl 2.5 3B\nlens: gpt-oss-20b\n pixeldit: gemma 2 2B elm" def load_clip(self, clip_name, type="stable_diffusion", device="default"): clip_type = getattr(comfy.sd.CLIPType, type.upper(), comfy.sd.CLIPType.STABLE_DIFFUSION) @@ -1022,7 +1030,7 @@ class CLIPVisionLoader: RETURN_TYPES = ("CLIP_VISION",) FUNCTION = "load_clip" - CATEGORY = "loaders" + CATEGORY = "model/loaders" def load_clip(self, clip_name): clip_path = folder_paths.get_full_path_or_raise("clip_vision", clip_name) @@ -1041,7 +1049,7 @@ class CLIPVisionEncode: RETURN_TYPES = ("CLIP_VISION_OUTPUT",) FUNCTION = "encode" - CATEGORY = "conditioning" + CATEGORY = "model/conditioning" def encode(self, clip_vision, image, crop): crop_image = True @@ -1058,7 +1066,7 @@ class StyleModelLoader: RETURN_TYPES = ("STYLE_MODEL",) FUNCTION = "load_style_model" - CATEGORY = "loaders" + CATEGORY = "model/loaders" def load_style_model(self, style_model_name): style_model_path = folder_paths.get_full_path_or_raise("style_models", style_model_name) @@ -1080,7 +1088,7 @@ class StyleModelApply: RETURN_TYPES = ("CONDITIONING",) FUNCTION = "apply_stylemodel" - CATEGORY = 
"conditioning/style_model" + CATEGORY = "model/conditioning/style_model" def apply_stylemodel(self, conditioning, style_model, clip_vision_output, strength, strength_type): cond = style_model.get_cond(clip_vision_output).flatten(start_dim=0, end_dim=1).unsqueeze(dim=0) @@ -1140,7 +1148,7 @@ class unCLIPConditioning: RETURN_TYPES = ("CONDITIONING",) FUNCTION = "apply_adm" - CATEGORY = "conditioning" + CATEGORY = "model/conditioning" def apply_adm(self, conditioning, clip_vision_output, strength, noise_augmentation): if strength == 0: @@ -1157,7 +1165,7 @@ class GLIGENLoader: RETURN_TYPES = ("GLIGEN",) FUNCTION = "load_gligen" - CATEGORY = "loaders" + CATEGORY = "model/loaders" def load_gligen(self, gligen_name): gligen_path = folder_paths.get_full_path_or_raise("gligen", gligen_name) @@ -1179,7 +1187,7 @@ class GLIGENTextBoxApply: RETURN_TYPES = ("CONDITIONING",) FUNCTION = "append" - CATEGORY = "conditioning/gligen" + CATEGORY = "model/conditioning/gligen" def append(self, conditioning_to, clip, gligen_textbox_model, text, width, height, x, y): c = [] @@ -1209,7 +1217,7 @@ class EmptyLatentImage: OUTPUT_TOOLTIPS = ("The empty latent image batch.",) FUNCTION = "generate" - CATEGORY = "latent" + CATEGORY = "model/latent" DESCRIPTION = "Create a new batch of empty latent images to be denoised via sampling." 
SEARCH_ALIASES = ["empty", "empty latent", "new latent", "create latent", "blank latent", "blank"] @@ -1230,7 +1238,7 @@ class LatentFromBatch: RETURN_TYPES = ("LATENT",) FUNCTION = "frombatch" - CATEGORY = "latent/batch" + CATEGORY = "model/latent/batch" def frombatch(self, samples, batch_index, length): s = samples.copy() @@ -1265,7 +1273,7 @@ class RepeatLatentBatch: RETURN_TYPES = ("LATENT",) FUNCTION = "repeat" - CATEGORY = "latent/batch" + CATEGORY = "model/latent/batch" def repeat(self, samples, amount): s = samples.copy() @@ -1297,7 +1305,7 @@ class LatentUpscale: RETURN_TYPES = ("LATENT",) FUNCTION = "upscale" - CATEGORY = "latent" + CATEGORY = "model/latent" def upscale(self, samples, upscale_method, width, height, crop): if width == 0 and height == 0: @@ -1330,7 +1338,7 @@ class LatentUpscaleBy: RETURN_TYPES = ("LATENT",) FUNCTION = "upscale" - CATEGORY = "latent" + CATEGORY = "model/latent" def upscale(self, samples, upscale_method, scale_by): s = samples.copy() @@ -1348,7 +1356,7 @@ class LatentRotate: RETURN_TYPES = ("LATENT",) FUNCTION = "rotate" - CATEGORY = "latent/transform" + CATEGORY = "model/latent/transform" def rotate(self, samples, rotation): s = samples.copy() @@ -1374,7 +1382,7 @@ class LatentFlip: RETURN_TYPES = ("LATENT",) FUNCTION = "flip" - CATEGORY = "latent/transform" + CATEGORY = "model/latent/transform" def flip(self, samples, flip_method): s = samples.copy() @@ -1399,7 +1407,7 @@ class LatentComposite: RETURN_TYPES = ("LATENT",) FUNCTION = "composite" - CATEGORY = "latent" + CATEGORY = "model/latent" def composite(self, samples_to, samples_from, x, y, composite_method="normal", feather=0): x = x // 8 @@ -1486,7 +1494,7 @@ class LatentCrop: RETURN_TYPES = ("LATENT",) FUNCTION = "crop" - CATEGORY = "latent/transform" + CATEGORY = "model/latent/transform" def crop(self, samples, width, height, x, y): s = samples.copy() @@ -1516,7 +1524,7 @@ class SetLatentNoiseMask: RETURN_TYPES = ("LATENT",) FUNCTION = "set_mask" - CATEGORY = 
"latent/inpaint" + CATEGORY = "model/latent/inpaint" def set_mask(self, samples, mask): s = samples.copy() @@ -1570,7 +1578,7 @@ class KSampler: OUTPUT_TOOLTIPS = ("The denoised latent.",) FUNCTION = "sample" - CATEGORY = "sampling" + CATEGORY = "model/sampling" DESCRIPTION = "Uses the provided model, positive and negative conditioning to denoise the latent image." SEARCH_ALIASES = ["sampler", "sample", "generate", "denoise", "diffuse", "txt2img", "img2img"] @@ -1600,7 +1608,7 @@ class KSamplerAdvanced: RETURN_TYPES = ("LATENT",) FUNCTION = "sample" - CATEGORY = "sampling" + CATEGORY = "model/sampling" def sample(self, model, add_noise, noise_seed, steps, cfg, sampler_name, scheduler, positive, negative, latent_image, start_at_step, end_at_step, return_with_leftover_noise, denoise=1.0): force_full_denoise = True @@ -2389,6 +2397,7 @@ async def init_builtin_extra_nodes(): "nodes_lt_audio.py", "nodes_lt.py", "nodes_hooks.py", + "nodes_multigpu.py", "nodes_load_3d.py", "nodes_cosmos.py", "nodes_video.py", @@ -2411,6 +2420,7 @@ async def init_builtin_extra_nodes(): "nodes_context_windows.py", "nodes_qwen.py", "nodes_chroma_radiance.py", + "nodes_pid.py", "nodes_model_patch.py", "nodes_easycache.py", "nodes_audio_encoder.py", @@ -2444,6 +2454,7 @@ async def init_builtin_extra_nodes(): "nodes_hidream_o1.py", "nodes_save_3d.py", "nodes_moge.py", + "nodes_mediapipe.py", ] import_failed = [] diff --git a/openapi.yaml b/openapi.yaml index 9a3117e22..f801a39d9 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -104,6 +104,8 @@ paths: responses: "101": description: WebSocket upgrade successful + '401': + description: Unauthorized x-websocket-messages: - type: status schema: @@ -170,6 +172,18 @@ paths: application/json: schema: $ref: "#/components/schemas/PromptInfo" + '401': + description: Unauthorized + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: 
'#/components/schemas/ErrorResponse' post: operationId: executePrompt tags: [prompt] @@ -195,12 +209,36 @@ paths: schema: $ref: "#/components/schemas/PromptErrorResponse" + '402': + description: Payment required - Insufficient credits + content: + application/json: + schema: + $ref: '#/components/schemas/PromptErrorResponse' + '429': + description: Payment required - User has not paid + content: + application/json: + schema: + $ref: '#/components/schemas/PromptErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/PromptErrorResponse' + '503': + description: Service unavailable + content: + application/json: + schema: + $ref: '#/components/schemas/PromptErrorResponse' # --------------------------------------------------------------------------- # Queue # --------------------------------------------------------------------------- /api/queue: get: - operationId: getQueue + operationId: getQueueInfo tags: [queue] summary: Get running and pending queue items description: Returns the server's current execution queue, split into the currently-running prompt and the list of pending prompts. 
@@ -211,6 +249,18 @@ paths: application/json: schema: $ref: "#/components/schemas/QueueInfo" + '400': + description: Invalid request parameters + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' post: operationId: manageQueue tags: [queue] @@ -225,10 +275,31 @@ paths: responses: "200": description: Queue updated - + content: + application/json: + schema: + $ref: "#/components/schemas/QueueManageResponse" + '400': + description: Invalid request parameters + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '401': + description: Unauthorized + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/interrupt: post: - operationId: interruptExecution + operationId: interruptJob tags: [queue] summary: Interrupt current execution description: Interrupts the prompt that is currently executing. The next queued prompt (if any) will start immediately after.
@@ -247,6 +318,18 @@ paths: "200": description: Interrupt signal sent + '401': + description: Unauthorized - Authentication required + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/free: post: operationId: freeMemory @@ -327,9 +410,21 @@ paths: pagination: $ref: "#/components/schemas/PaginationInfo" + '401': + description: Unauthorized - Authentication required + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/jobs/{job_id}: get: - operationId: getJob + operationId: getJobDetail tags: [queue] summary: Get a single job by ID description: Returns the full record for a single completed prompt execution, including its outputs, status, and metadata. @@ -351,12 +446,30 @@ paths: "404": description: Job not found + '401': + description: Unauthorized - Authentication required + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '403': + description: Forbidden - Job does not belong to user + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' # --------------------------------------------------------------------------- # History # --------------------------------------------------------------------------- /api/history: get: - operationId: getHistory + operationId: getPromptHistory tags: [history] summary: Get execution history deprecated: true @@ -388,6 +501,8 @@ paths: type: object additionalProperties: $ref: "#/components/schemas/HistoryEntry" + '404': + description: "Not Found \u2014 use /api/history_v2 instead" post: operationId: 
manageHistory tags: [history] @@ -409,6 +524,24 @@ paths: "200": description: History updated + '400': + description: Invalid request parameters + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '401': + description: Unauthorized - Authentication required + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/history/{prompt_id}: get: operationId: getHistoryByPromptId @@ -438,6 +571,8 @@ paths: additionalProperties: $ref: "#/components/schemas/HistoryEntry" + '404': + description: "Not Found \u2014 use /api/jobs/{prompt_id} instead" # --------------------------------------------------------------------------- # Upload # --------------------------------------------------------------------------- @@ -481,6 +616,18 @@ paths: "400": description: No file provided or invalid request + '401': + description: Unauthorized + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/upload/mask: post: operationId: uploadMask @@ -539,6 +686,18 @@ paths: "400": description: No file provided or invalid request + '401': + description: Unauthorized + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' # --------------------------------------------------------------------------- # View # --------------------------------------------------------------------------- @@ -601,6 +760,33 @@ paths: "404": description: File not found + '302': + description: Redirect to GCS signed URL + headers: + Location: + description: Signed URL to access the file in GCS + schema: + 
type: string + Cache-Control: + description: Cache directive for the redirect response + schema: + type: string + Vary: + description: Headers that affect response caching + schema: + type: string + '400': + description: Invalid request parameters + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/view_metadata/{folder_name}: get: operationId: viewMetadata @@ -648,6 +834,12 @@ paths: schema: $ref: "#/components/schemas/SystemStatsResponse" + '401': + description: Unauthorized + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/features: get: operationId: getFeatures @@ -706,7 +898,7 @@ paths: # --------------------------------------------------------------------------- /api/object_info: get: - operationId: getObjectInfo + operationId: getNodeInfo tags: [node] summary: Get all node definitions description: | @@ -782,6 +974,8 @@ paths: items: type: string + '404': + description: "Not Found \u2014 use /api/experiment/models instead" /api/models/{folder}: get: operationId: getModelsByFolder @@ -809,7 +1003,7 @@ paths: /api/experiment/models: get: - operationId: getExperimentModels + operationId: getModelFolders tags: [model] summary: List model folders with paths description: Returns an array of model folder objects with name and folder paths. 
@@ -823,9 +1017,15 @@ paths: items: $ref: "#/components/schemas/ModelFolder" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/experiment/models/{folder}: get: - operationId: getExperimentModelsByFolder + operationId: getModelsInFolder tags: [model] summary: List model files with metadata description: Returns the model files in the given folder with richer metadata (path index, mtime, size) than the legacy `/api/models/{folder}` endpoint. @@ -848,6 +1048,12 @@ paths: "404": description: Unknown folder type + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/experiment/models/preview/{folder}/{path_index}/{filename}: get: operationId: getModelPreview @@ -884,12 +1090,18 @@ paths: "404": description: Preview not found + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' # --------------------------------------------------------------------------- # Users # --------------------------------------------------------------------------- /api/users: get: - operationId: getUsers + operationId: getUsersInfo tags: [user] summary: Get user storage info description: | @@ -917,6 +1129,12 @@ paths: additionalProperties: type: string description: Map of user_id to directory name (multi-user) + '401': + description: Unauthorized + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' post: operationId: createUser tags: [user] @@ -952,7 +1170,7 @@ paths: # --------------------------------------------------------------------------- /api/userdata: get: - operationId: listUserdata + operationId: getUserdata tags: [userdata] summary: List files in a userdata directory description: Lists files in the authenticated user's data directory. 
Returns either filename strings or full objects depending on the `full_info` query parameter. @@ -985,10 +1203,28 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/ListUserdataResponse" + $ref: "#/components/schemas/GetUserDataResponseFull" "404": description: Directory not found + '400': + description: Bad request (e.g., invalid filename). + content: + text/plain: + schema: + type: string + '401': + description: Unauthorized. + content: + text/plain: + schema: + type: string + '500': + description: General error + content: + text/plain: + schema: + type: string /api/v2/userdata: get: operationId: listUserdataV2 @@ -1025,6 +1261,8 @@ paths: type: number description: Unix timestamp + '404': + description: "Not Found \u2014 use /api/userdata instead" /api/userdata/{file}: get: operationId: getUserdataFile @@ -1049,8 +1287,26 @@ paths: format: binary "404": description: File not found + '400': + description: Bad request (e.g., invalid filename). + content: + text/plain: + schema: + type: string + '401': + description: Unauthorized. + content: + text/plain: + schema: + type: string + '500': + description: General error + content: + text/plain: + schema: + type: string post: - operationId: writeUserdataFile + operationId: postUserdataFile tags: [userdata] summary: Write or create a userdata file description: Writes (creates or replaces) a file in the authenticated user's data directory. @@ -1087,9 +1343,33 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/UserDataResponse" + $ref: "#/components/schemas/UserDataResponseFull" "409": description: File exists and overwrite not set + '400': + description: Missing or invalid 'file' parameter. + content: + text/plain: + schema: + type: string + '401': + description: Unauthorized. + content: + text/plain: + schema: + type: string + '403': + description: The requested path is not allowed. 
+ content: + text/plain: + schema: + type: string + '500': + description: General error + content: + text/plain: + schema: + type: string delete: operationId: deleteUserdataFile tags: [userdata] @@ -1109,6 +1389,18 @@ paths: "404": description: File not found + '401': + description: Unauthorized. + content: + text/plain: + schema: + type: string + '500': + description: Internal server error. + content: + text/plain: + schema: + type: string /api/userdata/{file}/move/{dest}: post: operationId: moveUserdataFile @@ -1145,18 +1437,36 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/UserDataResponse" + $ref: "#/components/schemas/UserDataResponseFull" "404": description: Source file not found "409": description: Destination exists and overwrite not set + '400': + description: Missing or invalid parameters. + content: + text/plain: + schema: + type: string + '401': + description: Unauthorized. + content: + text/plain: + schema: + type: string + '500': + description: General error + content: + text/plain: + schema: + type: string # --------------------------------------------------------------------------- # Settings # --------------------------------------------------------------------------- /api/settings: get: - operationId: getSettings + operationId: getAllSettings tags: [settings] summary: Get all user settings description: Returns all settings for the authenticated user. @@ -1170,8 +1480,14 @@ paths: schema: type: object additionalProperties: true + '401': + description: Unauthorized + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' post: - operationId: updateSettings + operationId: updateMultipleSettings tags: [settings] summary: Update user settings (partial merge) description: Replaces the authenticated user's settings with the provided object. 
@@ -1189,9 +1505,21 @@ paths: "200": description: Settings updated + '400': + description: Invalid request + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '401': + description: Unauthorized + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/settings/{id}: get: - operationId: getSetting + operationId: getSettingById tags: [settings] summary: Get a single setting by key description: Returns the value of a single setting, identified by key. @@ -1211,8 +1539,20 @@ paths: schema: nullable: true description: The setting value (any JSON type), or null if not set + '401': + description: Unauthorized + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '404': + description: Setting not found + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' post: - operationId: updateSetting + operationId: updateSettingById tags: [settings] summary: Set a single setting value description: Sets the value of a single setting, identified by key. 
@@ -1234,6 +1574,18 @@ paths: "200": description: Setting updated + '400': + description: Invalid request + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '401': + description: Unauthorized + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' # --------------------------------------------------------------------------- # Extensions / Templates / i18n # --------------------------------------------------------------------------- @@ -1308,6 +1660,12 @@ paths: additionalProperties: $ref: "#/components/schemas/GlobalSubgraphInfo" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/global_subgraphs/{id}: get: operationId: getGlobalSubgraph @@ -1331,6 +1689,12 @@ paths: "404": description: Subgraph not found + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' # --------------------------------------------------------------------------- # Node Replacements # --------------------------------------------------------------------------- @@ -1351,6 +1715,12 @@ paths: type: object additionalProperties: true + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' # --------------------------------------------------------------------------- # Internal (x-internal: true) # --------------------------------------------------------------------------- @@ -1454,7 +1824,7 @@ paths: /internal/files/{directory_type}: get: - operationId: getInternalFiles + operationId: getFiles tags: [internal] summary: List files in a directory type description: Lists the files present in one of ComfyUI's known directories (input, output, or temp). 
@@ -1476,6 +1846,12 @@ paths: items: type: string + '400': + description: Invalid directory type + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' # --------------------------------------------------------------------------- # Assets (x-feature-gate: enable-assets) # --------------------------------------------------------------------------- @@ -1499,6 +1875,24 @@ paths: "404": description: No asset with this hash + '400': + description: Invalid hash format + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '401': + description: Unauthorized + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/assets: get: operationId: listAssets @@ -1556,12 +1950,6 @@ paths: type: string enum: [asc, desc] description: Sort direction - - name: job_ids - in: query - schema: - type: string - x-runtime: [cloud] - description: "[cloud-only] Comma-separated UUIDs to filter assets by associated job." - name: include_public in: query schema: @@ -1581,8 +1969,26 @@ paths: application/json: schema: $ref: "#/components/schemas/ListAssetsResponse" + '400': + description: Invalid request parameters + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '401': + description: Unauthorized + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' post: - operationId: createAsset + operationId: uploadAsset tags: [assets] summary: Upload a new asset description: Uploads a new asset (binary content plus metadata) and registers it in the asset database. 
@@ -1670,6 +2076,60 @@ paths: schema: $ref: "#/components/schemas/AssetCreated" + '200': + description: Asset already exists (returned existing asset) + content: + application/json: + schema: + $ref: '#/components/schemas/AssetCreated' + '400': + description: Invalid request (bad file, invalid URL, invalid content type, etc.) + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '401': + description: Unauthorized + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '403': + description: Source URL requires authentication or access denied + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '404': + description: Source URL not found + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '413': + description: File too large + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '415': + description: Unsupported media type + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '422': + description: Download failed due to network error or timeout + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/assets/from-hash: post: operationId: createAssetFromHash @@ -1713,9 +2173,39 @@ paths: schema: $ref: "#/components/schemas/AssetCreated" + '200': + description: Asset reference already exists (returned existing) + content: + application/json: + schema: + $ref: '#/components/schemas/AssetCreated' + '400': + description: Invalid request (bad hash format, invalid tags, etc.) 
+ content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '401': + description: Unauthorized + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '404': + description: Source asset with given hash not found + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/assets/{id}: get: - operationId: getAsset + operationId: getAssetById tags: [assets] summary: Get asset metadata description: Returns the metadata for a single asset. @@ -1737,6 +2227,18 @@ paths: $ref: "#/components/schemas/Asset" "404": description: Asset not found + '401': + description: Unauthorized + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' put: operationId: updateAsset tags: [assets] @@ -1781,6 +2283,30 @@ paths: application/json: schema: $ref: "#/components/schemas/AssetUpdated" + '400': + description: Invalid request (no fields provided) + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '401': + description: Unauthorized + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '404': + description: Asset not found + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' delete: operationId: deleteAsset tags: [assets] @@ -1804,6 +2330,30 @@ paths: "204": description: Asset deleted + '401': + description: Unauthorized + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '404': + description: Asset not found + content: + application/json: + 
schema: + $ref: '#/components/schemas/ErrorResponse' + '409': + description: Asset cannot be deleted because it is referenced by another resource (e.g., workflow version) + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/assets/{id}/content: get: operationId: getAssetContent @@ -1865,6 +2415,36 @@ paths: application/json: schema: $ref: "#/components/schemas/TagsModificationResponse" + '400': + description: Invalid request + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '401': + description: Unauthorized + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '404': + description: Asset not found + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '422': + description: Validation error (e.g., reserved tag) + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' delete: operationId: removeAssetTags tags: [assets] @@ -1900,6 +2480,36 @@ paths: schema: $ref: "#/components/schemas/TagsModificationResponse" + '400': + description: Invalid request + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '401': + description: Unauthorized + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '404': + description: Asset not found + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '422': + description: Validation error (e.g., reserved tag) + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: 
'#/components/schemas/ErrorResponse' /api/tags: get: operationId: listTags @@ -1929,9 +2539,27 @@ paths: schema: $ref: "#/components/schemas/ListTagsResponse" + '400': + description: Invalid request parameters + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '401': + description: Unauthorized + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/assets/tags/refine: get: - operationId: refineAssetTags + operationId: getAssetTagHistogram tags: [assets] summary: Get tag counts for assets matching current filters description: Returns suggested additional tags that would refine a filtered asset query, together with the count of assets each tag would select. @@ -1992,6 +2620,24 @@ paths: schema: $ref: "#/components/schemas/AssetTagHistogramResponse" + '400': + description: Invalid request parameters + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '401': + description: Unauthorized + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/assets/seed: post: operationId: seedAssets @@ -2109,7 +2755,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/CloudJobStatus" + $ref: "#/components/schemas/JobCancelResponse" "401": description: Unauthorized content: @@ -2123,9 +2769,21 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '400': + description: Bad Request - job_id is not a valid UUID (emitted by request validation before the handler runs) + content: + application/json: + schema: + $ref: '#/components/schemas/BindingErrorResponse' + '500': + description: Internal server error - cancellation failed + content: + application/json: + 
schema: + $ref: '#/components/schemas/ErrorResponse' /api/job/{job_id}/status: get: - operationId: getCloudJobStatus + operationId: getJobStatus tags: [queue] summary: Get status of a cloud job deprecated: true @@ -2148,7 +2806,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/CloudJobStatus" + $ref: "#/components/schemas/JobStatusResponse" "401": description: Unauthorized content: @@ -2162,6 +2820,18 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '403': + description: Forbidden - job belongs to another user + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/prompt/{prompt_id}: get: operationId: getCloudPrompt @@ -2199,7 +2869,7 @@ paths: /api/history_v2: get: - operationId: getHistoryV2 + operationId: getHistory tags: [history] summary: Get paginated execution history (v2) deprecated: true @@ -2232,7 +2902,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/HistoryV2Response" + $ref: "#/components/schemas/HistoryResponse" "401": description: Unauthorized content: @@ -2240,9 +2910,15 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/history_v2/{prompt_id}: get: - operationId: getHistoryV2ByPromptId + operationId: getHistoryForPrompt tags: [history] summary: Get v2 history for a specific prompt deprecated: true @@ -2265,7 +2941,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/HistoryV2Entry" + $ref: "#/components/schemas/HistoryDetailResponse" "401": description: Unauthorized content: @@ -2279,9 +2955,15 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: 
'#/components/schemas/ErrorResponse' /api/logs: get: - operationId: getCloudLogs + operationId: getLogs tags: [system] summary: Get cloud execution logs deprecated: true @@ -2315,7 +2997,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/CloudLogsResponse" + $ref: "#/components/schemas/LogsResponse" "401": description: Unauthorized content: @@ -2328,7 +3010,7 @@ paths: # --------------------------------------------------------------------------- /api/assets/download: post: - operationId: downloadAssets + operationId: createAssetDownload tags: [assets] summary: Download assets to cloud runtime description: "[cloud-only] Initiates a download of one or more assets to the cloud runtime environment. Returns a task ID for tracking download progress via WebSocket." @@ -2348,16 +3030,27 @@ paths: $ref: "#/components/schemas/AssetDownloadRequest" description: Assets to download responses: - "200": - description: Download initiated + "202": + description: Download task accepted content: application/json: schema: type: object + required: + - task_id + - status properties: task_id: type: string - description: Task ID for tracking progress via WebSocket + format: uuid + description: ID of the download task; use to poll status. + status: + type: string + enum: [created, running, completed, failed] + description: Current task status (typically `created` on initial creation). + message: + type: string + description: Human-readable task message. 
"400": description: Bad request content: @@ -2371,9 +3064,27 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '200': + description: File already exists in storage - asset created/returned immediately + content: + application/json: + schema: + $ref: '#/components/schemas/AssetCreated' + '422': + description: Validation errors + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/assets/export: post: - operationId: exportAssets + operationId: createAssetExport tags: [assets] summary: Export assets as a downloadable archive description: "[cloud-only] Initiates a bulk export of assets. Returns a task ID for tracking progress via WebSocket. When complete, the export can be downloaded via the exports endpoint." @@ -2384,30 +3095,56 @@ paths: application/json: schema: type: object - required: - - asset_ids properties: + job_ids: + type: array + items: + type: string + description: Job IDs whose associated assets should all be included in the ZIP bundle. asset_ids: type: array items: type: string format: uuid - description: IDs of assets to export + description: Asset IDs to include in the ZIP bundle. Additive to assets associated with provided job IDs. export_name: type: string description: Name for the export archive + naming_strategy: + type: string + enum: [group_by_job_id, preserve, asset_id, group_by_job_time] + default: group_by_job_time + description: "Strategy for naming files in the ZIP: group by job ID, preserve original names, use the asset ID, or group by job creation time." + job_asset_name_filters: + type: object + additionalProperties: + type: array + minItems: 1 + items: + type: string + description: Optional per-job asset name filters. When provided for a job ID, only assets whose name matches one of the listed names are included. 
responses: - "200": - description: Export initiated + "202": + description: Export task accepted content: application/json: schema: type: object + required: + - task_id + - status properties: task_id: type: string - export_name: + format: uuid + description: ID of the export task; use to poll status. + status: type: string + enum: [created, running, completed, failed] + description: Current task status (typically `created` on initial creation). + message: + type: string + description: Human-readable task message. "400": description: Bad request content: @@ -2421,6 +3158,12 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/assets/exports/{exportName}: get: operationId: getAssetExport @@ -2456,9 +3199,21 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '400': + description: Invalid export name + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/assets/from-workflow: post: - operationId: createAssetsFromWorkflow + operationId: postAssetsFromWorkflow tags: [assets] summary: Create asset records from a workflow execution description: "[cloud-only] Registers output files from a workflow execution as assets in the asset database." 
@@ -2482,8 +3237,8 @@ paths: type: string description: Tags to apply to the created assets responses: - "201": - description: Assets created + "200": + description: Assets created or referenced content: application/json: schema: @@ -2512,39 +3267,33 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/assets/import: post: - operationId: importAssets + operationId: importPublishedAssets tags: [assets] - summary: Import assets from external URLs - description: "[cloud-only] Imports one or more assets from external URLs into the cloud asset store." + summary: "[cloud-only] Import published assets into the caller's library" + description: | + [cloud-only] Imports the specified published assets into the caller's asset library. New DB records reference the same storage objects; no file copying occurs. Assets the caller already owns (by hash) are deduplicated. The `id` field on each returned `AssetInfo` is the caller's newly-created private asset ID, not the published asset ID supplied in the request. 
x-runtime: [cloud] requestBody: required: true content: application/json: schema: - type: object - required: - - imports - properties: - imports: - type: array - items: - $ref: "#/components/schemas/AssetImportRequest" - description: Assets to import + $ref: "#/components/schemas/ImportPublishedAssetsRequest" responses: "200": - description: Import initiated + description: Successfully imported assets content: application/json: schema: - type: object - properties: - assets: - type: array - items: - $ref: "#/components/schemas/Asset" + $ref: "#/components/schemas/ImportPublishedAssetsResponse" "400": description: Bad request content: @@ -2558,9 +3307,15 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/assets/remote-metadata: get: - operationId: getAssetRemoteMetadata + operationId: getRemoteAssetMetadata tags: [assets] summary: Fetch metadata for a remote asset URL description: "[cloud-only] Fetches and returns metadata (content type, size, filename) for a remote URL without downloading the full content." 
@@ -2579,7 +3334,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/RemoteAssetMetadata" + $ref: "#/components/schemas/AssetMetadataResponse" "400": description: Bad request content: @@ -2593,6 +3348,18 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '422': + description: Failed to retrieve metadata from source + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' # --------------------------------------------------------------------------- # Custom nodes / hub (cloud) # --------------------------------------------------------------------------- @@ -2748,7 +3515,7 @@ paths: /api/hub/assets/upload-url: post: - operationId: getHubAssetUploadUrl + operationId: createHubAssetUploadUrl tags: [hub] summary: Get a pre-signed upload URL for a hub asset description: "[cloud-only] Returns a pre-signed URL that can be used to upload an asset file directly to storage." @@ -2802,6 +3569,18 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '404': + description: Not found + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/hub/labels: get: operationId: listHubLabels @@ -2815,10 +3594,19 @@ paths: content: application/json: schema: - type: array - items: - $ref: "#/components/schemas/HubLabel" - + $ref: "#/components/schemas/HubLabelListResponse" + '400': + description: Bad request (e.g. 
invalid type parameter) + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/hub/profiles: get: operationId: listHubProfiles @@ -2902,6 +3690,12 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/hub/profiles/{username}: get: operationId: getHubProfile @@ -2930,9 +3724,15 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/hub/profiles/check: get: - operationId: checkHubProfileUsername + operationId: checkHubUsername tags: [hub] summary: Check if a hub username is available description: "[cloud-only] Returns whether the given username is available for registration." 
@@ -2957,6 +3757,24 @@ paths: username: type: string + '401': + description: Unauthorized + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '404': + description: Not found + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/hub/profiles/me: get: operationId: getMyHubProfile @@ -2977,6 +3795,18 @@ paths: application/json: schema: $ref: "#/components/schemas/CloudError" + '404': + description: No hub profile exists + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' put: operationId: updateMyHubProfile tags: [hub] @@ -3075,7 +3905,25 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/HubWorkflowList" + $ref: "#/components/schemas/HubWorkflowListResponse" + '400': + description: Bad request (e.g. 
malformed pagination cursor) + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '404': + description: Profile not found (when filtering by username) + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' post: operationId: publishHubWorkflow tags: [hub] @@ -3114,6 +3962,12 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/hub/workflows/{share_id}: get: operationId: getHubWorkflow @@ -3134,13 +3988,25 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/HubWorkflow" + $ref: "#/components/schemas/HubWorkflowDetail" "404": description: Not found content: application/json: schema: $ref: "#/components/schemas/CloudError" + '413': + description: Workflow JSON too large + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' delete: operationId: deleteHubWorkflow tags: [hub] @@ -3170,9 +4036,15 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/hub/workflows/index: get: - operationId: getHubWorkflowIndex + operationId: listHubWorkflowIndex tags: [hub] summary: Get the hub workflow index description: "[cloud-only] Returns the lightweight index of all hub workflows for client-side search and navigation." 
@@ -3187,12 +4059,18 @@ paths: items: $ref: "#/components/schemas/HubWorkflowIndexEntry" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' # --------------------------------------------------------------------------- # Workflows (cloud) # --------------------------------------------------------------------------- /api/workflows: get: - operationId: listCloudWorkflows + operationId: listWorkflows tags: [workflows] summary: List cloud workflows description: "[cloud-only] Returns a paginated list of the authenticated user's cloud workflows." @@ -3230,15 +4108,21 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/CloudWorkflowList" + $ref: "#/components/schemas/WorkflowListResponse" "401": description: Unauthorized content: application/json: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' post: - operationId: createCloudWorkflow + operationId: createWorkflow tags: [workflows] summary: Create a new cloud workflow description: "[cloud-only] Creates a new cloud workflow with the provided name and optional initial content." 
@@ -3268,7 +4152,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/CloudWorkflow" + $ref: "#/components/schemas/WorkflowResponse" "400": description: Bad request content: @@ -3282,9 +4166,21 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '422': + description: Validation error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/workflows/{workflow_id}: get: - operationId: getCloudWorkflow + operationId: getWorkflow tags: [workflows] summary: Get a cloud workflow by ID description: "[cloud-only] Returns the metadata for a cloud workflow." @@ -3303,7 +4199,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/CloudWorkflow" + $ref: "#/components/schemas/WorkflowResponse" "401": description: Unauthorized content: @@ -3316,8 +4212,20 @@ paths: application/json: schema: $ref: "#/components/schemas/CloudError" + '403': + description: Forbidden + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' patch: - operationId: updateCloudWorkflow + operationId: updateWorkflow tags: [workflows] summary: Update a cloud workflow description: "[cloud-only] Updates the metadata (name, description) of an existing cloud workflow." 
@@ -3347,7 +4255,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/CloudWorkflow" + $ref: "#/components/schemas/WorkflowResponse" "400": description: Bad request content: @@ -3366,8 +4274,20 @@ paths: application/json: schema: $ref: "#/components/schemas/CloudError" + '422': + description: Validation error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' delete: - operationId: deleteCloudWorkflow + operationId: deleteWorkflow tags: [workflows] summary: Delete a cloud workflow description: "[cloud-only] Deletes a cloud workflow and all its versions." @@ -3396,9 +4316,15 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/workflows/{workflow_id}/content: get: - operationId: getCloudWorkflowContent + operationId: getWorkflowContent tags: [workflows] summary: Get the content of a cloud workflow description: "[cloud-only] Returns the full workflow graph JSON for the latest version of a cloud workflow." @@ -3437,6 +4363,18 @@ paths: application/json: schema: $ref: "#/components/schemas/CloudError" + '403': + description: Forbidden + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' put: operationId: updateCloudWorkflowContent tags: [workflows] @@ -3487,7 +4425,7 @@ paths: /api/workflows/{workflow_id}/fork: post: - operationId: forkCloudWorkflow + operationId: forkWorkflow tags: [workflows] summary: Fork a cloud workflow description: "[cloud-only] Creates a copy of a cloud workflow under the authenticated user's account." 
@@ -3516,7 +4454,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/CloudWorkflow" + $ref: "#/components/schemas/WorkflowResponse" "401": description: Unauthorized content: @@ -3530,6 +4468,24 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '403': + description: Forbidden + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '422': + description: Validation error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/workflows/{workflow_id}/versions: get: operationId: listCloudWorkflowVersions @@ -3584,7 +4540,7 @@ paths: schema: $ref: "#/components/schemas/CloudError" post: - operationId: createCloudWorkflowVersion + operationId: createWorkflowVersion tags: [workflows] summary: Create a new cloud workflow version description: "[cloud-only] Creates a new workflow version with updated workflow JSON. Uses optimistic concurrency via base_version." 
@@ -3635,6 +4591,18 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '422': + description: Validation error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/workflows/published/{share_id}: get: operationId: getPublishedWorkflow @@ -3655,7 +4623,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/CloudWorkflow" + $ref: "#/components/schemas/PublishedWorkflowDetail" "404": description: Not found content: @@ -3663,6 +4631,24 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '401': + description: Unauthorized + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '413': + description: Workflow JSON too large + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' # --------------------------------------------------------------------------- # Auth / session (cloud) # --------------------------------------------------------------------------- @@ -3687,7 +4673,7 @@ paths: schema: $ref: "#/components/schemas/CloudError" post: - operationId: createAuthSession + operationId: createSession tags: [auth] summary: Create a session cookie description: "[cloud-only] Creates a session cookie from the bearer token in the Authorization header. Returns a Set-Cookie header with a secure HttpOnly session cookie. Cookie authentication is not allowed for this endpoint." 
@@ -3711,8 +4697,14 @@ paths: application/json: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' delete: - operationId: deleteAuthSession + operationId: deleteSession tags: [auth] summary: Delete session cookie (logout) description: "[cloud-only] Clears the session cookie and optionally revokes the session on the server." @@ -3725,9 +4717,15 @@ paths: schema: $ref: "#/components/schemas/DeleteSessionResponse" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/auth/token: post: - operationId: createAuthToken + operationId: exchangeToken tags: [auth] summary: Exchange credentials for an access token description: "[cloud-only] Exchanges authentication credentials (e.g. an authorization code) for an access token." @@ -3761,7 +4759,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/AuthTokenResponse" + $ref: "#/components/schemas/ExchangeTokenResponse" "400": description: Bad request content: @@ -3775,6 +4773,18 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '404': + description: Workspace not found or user not a member + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /.well-known/jwks.json: get: operationId: getJwks @@ -3790,6 +4800,295 @@ paths: schema: $ref: "#/components/schemas/JwksResponse" + # --------------------------------------------------------------------------- + # OAuth 2.1 / RFC 7591 Dynamic Client Registration (cloud) + # --------------------------------------------------------------------------- + /.well-known/oauth-authorization-server: + get: + operationId: getOAuthAuthorizationServer + tags: [auth] + summary: "[cloud-only] OAuth 2.1 
authorization-server metadata (RFC 8414)" + description: "[cloud-only] Public metadata document for OAuth 2.1 clients. Cached 5 minutes." + x-runtime: [cloud] + security: [] + responses: + "200": + description: Authorization-server metadata + content: + application/json: + schema: + $ref: "#/components/schemas/OAuthAuthorizationServerMetadata" + "404": + description: OAuth disabled + content: + application/json: + schema: + $ref: "#/components/schemas/CloudError" + + /.well-known/oauth-protected-resource: + get: + operationId: getOAuthProtectedResource + tags: [auth] + summary: "[cloud-only] OAuth 2.1 protected-resource metadata (RFC 9728)" + description: "[cloud-only] Public metadata describing the currently advertised protected resource. Cached 5 minutes." + x-runtime: [cloud] + security: [] + responses: + "200": + description: Protected-resource metadata + content: + application/json: + schema: + $ref: "#/components/schemas/OAuthProtectedResourceMetadata" + "404": + description: OAuth disabled or no active resource configured + content: + application/json: + schema: + $ref: "#/components/schemas/CloudError" + + /oauth/authorize: + get: + operationId: getOAuthAuthorize + tags: [auth] + summary: "[cloud-only] Begin or resume an OAuth 2.1 authorization request" + description: | + [cloud-only] Two modes: + - **Initial entry** (OAuth params present): validates client/redirect/resource/scopes, persists a server-side authorization-request row, and either redirects (no session / unverified email) to the configured frontend login URL carrying only the opaque `oauth_request_id`, or returns the JSON consent challenge for the frontend to render. + - **Resume** (`oauth_request_id` present): loads the server-side row, fails closed if expired/consumed/unknown, returns the JSON consent challenge. Browser-replayed OAuth params are intentionally ignored. + + The frontend renders the consent UI from the JSON payload and POSTs the user's decision back to this endpoint. 
+ x-runtime: [cloud] + security: [] + parameters: + - { name: response_type, in: query, required: false, schema: { type: string } } + - { name: client_id, in: query, required: false, schema: { type: string } } + - { name: redirect_uri, in: query, required: false, schema: { type: string } } + - { name: scope, in: query, required: false, schema: { type: string } } + - name: state + in: query + required: false + schema: { type: string } + description: | + RFC 6749 §10.12 marks `state` as RECOMMENDED. Cloud hardening makes it REQUIRED on the initial-entry path (omitted only on the resume path where `oauth_request_id` is supplied instead). This parameter is `required: false` at the spec level only because the operation is dual-mode (initial entry vs. resume); the runtime rejects empty `state` on the initial-entry path with a stable `invalid_request` 400. + - { name: code_challenge, in: query, required: false, schema: { type: string } } + - { name: code_challenge_method, in: query, required: false, schema: { type: string } } + - { name: resource, in: query, required: false, schema: { type: string } } + - { name: oauth_request_id, in: query, required: false, schema: { type: string } } + responses: + "200": + description: Consent challenge payload (session present, email verified). Frontend renders the consent UI from this payload and POSTs back to /oauth/authorize. 
+ content: + application/json: + schema: + $ref: "#/components/schemas/OAuthConsentChallenge" + "302": + description: Redirect to login (no session / unverified email) or to registered redirect_uri (pre-validated client error) + headers: + Location: + schema: + type: string + "400": + description: Invalid authorize request (pre-redirect failure — unknown client, redirect mismatch, malformed params) + content: + application/json: + schema: + $ref: "#/components/schemas/CloudError" + "404": + description: OAuth disabled + content: + application/json: + schema: + $ref: "#/components/schemas/CloudError" + post: + operationId: postOAuthAuthorize + tags: [auth] + summary: "[cloud-only] Submit OAuth consent decision" + description: | + [cloud-only] JSON-only consent submission. The handler verifies the per-row CSRF token, atomically marks the authorization request consumed (single-use covers both allow and deny paths), then returns the redirect URL the browser must navigate to. The URL contains either `code` + original `state` for allow, or the RFC 6749 §5.2 error and `state` for deny. + + Workspace membership is re-checked at submission time. Consent is persisted keyed by `(user_id, client_id, resource_id, workspace_id)`; broadening the previously approved scope set requires a fresh consent flow. 
+ x-runtime: [cloud] + security: [] + requestBody: + required: true + content: + application/json: + schema: + type: object + required: [oauth_request_id, csrf_token, decision, workspace_id] + properties: + oauth_request_id: { type: string, format: uuid } + csrf_token: { type: string } + decision: { type: string, enum: [allow, deny] } + workspace_id: { type: string } + responses: + "200": + description: Redirect URL for the frontend to navigate to (allow → with code+state; deny → with error+state) + content: + application/json: + schema: + $ref: "#/components/schemas/OAuthAuthorizeRedirectResponse" + "400": + description: Bad request (CSRF mismatch, expired/consumed request, inaccessible workspace) + content: + application/json: + schema: + $ref: "#/components/schemas/CloudError" + "403": + description: Scope broadening on consent re-grant — fresh consent flow required + content: + application/json: + schema: + $ref: "#/components/schemas/CloudError" + "404": + description: OAuth disabled + content: + application/json: + schema: + $ref: "#/components/schemas/CloudError" + + /oauth/token: + post: + operationId: postOAuthToken + tags: [auth] + summary: "[cloud-only] Exchange authorization code or refresh token for a resource-bound access token" + description: | + [cloud-only] OAuth 2.1 token endpoint (RFC 6749 §3.2). Public clients only — `client_secret` is rejected. + + Two grant types are supported: + - `authorization_code` — exchanges the code minted by `/oauth/authorize` (with PKCE verifier) for an access token + first refresh token. Single-use; reuse fails closed. + - `refresh_token` — rotates the refresh token. Old token immediately invalid; presenting an already-rotated token revokes the entire token family and emits a security metric. + + Both grant types re-validate canonical user state, current workspace membership, and the resource's active flag at every mint. A code or refresh token bound to a deactivated resource fails closed. 
+ + Errors follow RFC 6749 §5.2. Logs never contain raw codes, refresh tokens, or minted tokens. + + Per RFC 6749 §5.1, every 200 and 400 response carries `Cache-Control: no-store` and `Pragma: no-cache` so intermediaries cannot cache token-bearing or state-change-reason responses. + x-runtime: [cloud] + security: [] + requestBody: + required: true + content: + application/x-www-form-urlencoded: + schema: + type: object + required: [grant_type, client_id] + properties: + grant_type: { type: string, enum: [authorization_code, refresh_token] } + client_id: { type: string } + code: { type: string } + redirect_uri: { type: string } + code_verifier: { type: string } + refresh_token: { type: string } + scope: { type: string } + client_secret: { type: string } + responses: + "200": + description: New token pair + headers: + Cache-Control: + schema: + type: string + description: 'Always "no-store" per RFC 6749 §5.1' + Pragma: + schema: + type: string + description: 'Always "no-cache" per RFC 6749 §5.1' + content: + application/json: + schema: + $ref: "#/components/schemas/OAuthTokenResponse" + "400": + description: RFC 6749 §5.2 error + headers: + Cache-Control: + schema: + type: string + description: 'Always "no-store" per RFC 6749 §5.1' + Pragma: + schema: + type: string + description: 'Always "no-cache" per RFC 6749 §5.1' + content: + application/json: + schema: + $ref: "#/components/schemas/OAuthTokenError" + "404": + description: OAuth disabled + content: + application/json: + schema: + $ref: "#/components/schemas/CloudError" + + /oauth/register: + post: + operationId: postOAuthRegister + tags: [auth] + summary: "[cloud-only] Dynamic Client Registration (RFC 7591)" + description: | + [cloud-only] Public, unauthenticated, insert-only RFC 7591 §3.1 client registration. Used by MCP-spec-compliant clients to self-register a public OAuth client without operator involvement. + + Policy: + + - Public clients only — `token_endpoint_auth_method` is forced to `none`. 
Confidential-client registration is out of scope this phase. + - Server-owned `resource_grants`. Caller-supplied `scope` or `resource_grants` is rejected as `invalid_client_metadata` (would be a privilege-escalation surface). Dynamic clients receive the same scopes the active resource publishes. + - Application-type-aware redirect URI policy. `application_type=native` accepts loopback (`127.0.0.1`, `::1`, `localhost`) and reverse-DNS-shaped custom schemes; `application_type=web` accepts HTTPS to hosts in an operator-controlled allowlist only. `application_type` is REQUIRED on the request — missing or empty rejects with `invalid_client_metadata`. + - Anti-impersonation: reserved client names are rejected from third parties via NFKC-folded compare. + - Generated `client_id` carries a stable prefix to distinguish dynamic from seeded clients in audit logs. + - Cache-Control: `no-store` on every 201 and 400 response (the response carries fresh credentials and rejection reasons). + x-runtime: [cloud] + security: [] + requestBody: + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/OAuthRegisterRequest" + responses: + "201": + description: Registered. Body echoes the metadata RFC 7591 §3.2.1 requires. 
+ headers: + Cache-Control: + schema: + type: string + description: 'Always "no-store"' + Pragma: + schema: + type: string + description: 'Always "no-cache"' + content: + application/json: + schema: + $ref: "#/components/schemas/OAuthRegisterResponse" + "400": + description: RFC 7591 §3.2.2 invalid client metadata + headers: + Cache-Control: + schema: + type: string + description: 'Always "no-store"' + Pragma: + schema: + type: string + description: 'Always "no-cache"' + content: + application/json: + schema: + $ref: "#/components/schemas/OAuthRegisterError" + "404": + description: OAuth disabled + content: + application/json: + schema: + $ref: "#/components/schemas/CloudError" + "503": + description: No active resource is configured — DCR cannot mint a usable client until an active resource row is seeded. + content: + application/json: + schema: + $ref: "#/components/schemas/CloudError" + # --------------------------------------------------------------------------- # Billing (cloud) # --------------------------------------------------------------------------- @@ -3806,7 +5105,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/BillingBalance" + $ref: "#/components/schemas/BillingBalanceResponse" "401": description: Unauthorized content: @@ -3814,9 +5113,15 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/billing/events: get: - operationId: listBillingEvents + operationId: getBillingEvents tags: [billing] summary: List billing events description: "[cloud-only] Returns a paginated list of billing events (charges, credits, refunds) for the authenticated user." 
@@ -3843,7 +5148,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/BillingEventList" + $ref: "#/components/schemas/BillingEventsResponse" "401": description: Unauthorized content: @@ -3851,9 +5156,15 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/billing/ops/{id}: get: - operationId: getBillingOp + operationId: getBillingOpStatus tags: [billing] summary: Get a billing operation by ID description: "[cloud-only] Returns details of a specific billing operation." @@ -3871,7 +5182,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/BillingOp" + $ref: "#/components/schemas/BillingOpStatusResponse" "401": description: Unauthorized content: @@ -3885,9 +5196,15 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/billing/payment-portal: post: - operationId: createPaymentPortalSession + operationId: getPaymentPortal tags: [billing] summary: Create a payment portal session description: "[cloud-only] Creates a Stripe customer portal session for managing payment methods and invoices. Returns a URL to redirect the user to." @@ -3911,9 +5228,21 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '400': + description: Bad request (e.g., missing return_url) + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/billing/plans: get: - operationId: listBillingPlans + operationId: getBillingPlans tags: [billing] summary: List available billing plans description: "[cloud-only] Returns the list of available subscription plans and their pricing." 
@@ -3928,9 +5257,21 @@ paths: items: $ref: "#/components/schemas/BillingPlan" + '401': + description: Unauthorized + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/billing/preview-subscribe: post: - operationId: previewSubscription + operationId: previewSubscribe tags: [billing] summary: Preview a subscription change description: "[cloud-only] Returns a preview of what a subscription change would cost, including prorations." @@ -3953,7 +5294,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/SubscriptionPreview" + $ref: "#/components/schemas/PreviewSubscribeResponse" "400": description: Bad request content: @@ -3967,6 +5308,12 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/billing/status: get: operationId: getBillingStatus @@ -3980,7 +5327,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/BillingStatus" + $ref: "#/components/schemas/BillingStatusResponse" "401": description: Unauthorized content: @@ -3988,9 +5335,21 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '404': + description: Workspace not found + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/billing/subscribe: post: - operationId: createSubscription + operationId: subscribe tags: [billing] summary: Subscribe to a billing plan description: "[cloud-only] Creates a new subscription to the specified billing plan." 
@@ -4016,7 +5375,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/BillingSubscription" + $ref: "#/components/schemas/SubscribeResponse" "400": description: Bad request content: @@ -4030,6 +5389,12 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/billing/subscription/cancel: post: operationId: cancelSubscription @@ -4043,7 +5408,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/BillingSubscription" + $ref: "#/components/schemas/CancelSubscriptionResponse" "401": description: Unauthorized content: @@ -4051,6 +5416,18 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '400': + description: Invalid request (e.g., no active subscription) + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/billing/subscription/resubscribe: post: operationId: resubscribe @@ -4064,7 +5441,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/BillingSubscription" + $ref: "#/components/schemas/ResubscribeResponse" "401": description: Unauthorized content: @@ -4072,9 +5449,21 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '400': + description: Invalid request (e.g., no active subscription, not in cancellation grace period) + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/billing/topup: post: - operationId: topUpCredits + operationId: createTopup tags: [billing] summary: Purchase additional credits description: "[cloud-only] Purchases a one-time credit top-up using the user's payment method on file." 
@@ -4097,7 +5486,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/BillingBalance" + $ref: "#/components/schemas/CreateTopupResponse" "400": description: Bad request content: @@ -4111,12 +5500,18 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' # --------------------------------------------------------------------------- # Workspace (cloud) # --------------------------------------------------------------------------- /api/workspace/api-keys: get: - operationId: listWorkspaceApiKeys + operationId: listWorkspaceAPIKeys tags: [workspace] summary: List workspace API keys description: "[cloud-only] Returns the list of API keys for the current workspace." @@ -4142,8 +5537,14 @@ paths: application/json: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' post: - operationId: createWorkspaceApiKey + operationId: createWorkspaceAPIKey tags: [workspace] summary: Create a workspace API key description: "[cloud-only] Creates a new API key for the current workspace." 
@@ -4160,13 +5561,17 @@ paths: name: type: string description: Display name for the API key + description: + type: string + description: User-provided description of the key's purpose + maxLength: 5000 responses: "201": description: API key created content: application/json: schema: - $ref: "#/components/schemas/WorkspaceApiKeyCreated" + $ref: "#/components/schemas/CreateWorkspaceAPIKeyResponse" "400": description: Bad request content: @@ -4186,9 +5591,33 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '404': + description: Workspace not found + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '422': + description: Validation error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '429': + description: Key limit reached + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/workspace/api-keys/{id}: delete: - operationId: deleteWorkspaceApiKey + operationId: revokeWorkspaceAPIKey tags: [workspace] summary: Delete a workspace API key description: "[cloud-only] Revokes and deletes a workspace API key." 
@@ -4222,6 +5651,12 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/workspace/invites: get: operationId: listWorkspaceInvites @@ -4250,6 +5685,12 @@ paths: application/json: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' post: operationId: createWorkspaceInvite tags: [workspace] @@ -4279,7 +5720,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/WorkspaceInvite" + $ref: "#/components/schemas/PendingInvite" "400": description: Bad request content: @@ -4305,9 +5746,27 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '404': + description: Workspace not found + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '422': + description: Validation error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/workspace/invites/{inviteId}: delete: - operationId: deleteWorkspaceInvite + operationId: revokeWorkspaceInvite tags: [workspace] summary: Cancel a workspace invite description: "[cloud-only] Cancels a pending workspace invitation." 
@@ -4341,6 +5800,12 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/workspace/leave: post: operationId: leaveWorkspace @@ -4364,6 +5829,18 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '404': + description: Workspace not found or not a member + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/workspace/members: get: operationId: listWorkspaceMembers @@ -4393,6 +5870,24 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '404': + description: Workspace not found + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '422': + description: Validation error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/workspace/members/{user_id}/api-keys: get: operationId: listMemberApiKeys @@ -4435,7 +5930,7 @@ paths: schema: $ref: "#/components/schemas/CloudError" delete: - operationId: bulkRevokeMemberApiKeys + operationId: bulkRevokeWorkspaceMemberAPIKeys tags: [workspace] summary: Bulk revoke a member's API keys description: "[cloud-only] Revokes all active API keys for a specific workspace member. Only workspace owners can perform this action." @@ -4468,6 +5963,18 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '422': + description: Validation error (e.g. 
empty user_id) + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/workspace/members/{userId}: patch: operationId: updateWorkspaceMember @@ -4561,6 +6068,12 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/workspaces: get: operationId: listWorkspaces @@ -4583,6 +6096,18 @@ paths: application/json: schema: $ref: "#/components/schemas/CloudError" + '404': + description: Feature not enabled for user + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' post: operationId: createWorkspace tags: [workspace] @@ -4621,6 +6146,24 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '404': + description: Feature not enabled for user + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '422': + description: Validation error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/workspaces/{id}: get: operationId: getWorkspace @@ -4660,6 +6203,12 @@ paths: application/json: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' patch: operationId: updateWorkspace tags: [workspace] @@ -4714,6 +6263,18 @@ paths: application/json: schema: $ref: "#/components/schemas/CloudError" + '422': + description: Validation error + content: + application/json: + schema: + $ref: 
'#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' delete: operationId: deleteWorkspace tags: [workspace] @@ -4749,6 +6310,12 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' # --------------------------------------------------------------------------- # User / settings / misc (cloud) # --------------------------------------------------------------------------- @@ -4764,24 +6331,9 @@ paths: content: application/json: schema: - type: object - required: - - message - properties: - message: - type: string - description: Feedback message - rating: - type: integer - minimum: 1 - maximum: 5 - description: Optional satisfaction rating - context: - type: object - additionalProperties: true - description: Additional context metadata + $ref: "#/components/schemas/FeedbackRequest" responses: - "200": + "201": description: Feedback submitted content: application/json: @@ -4805,6 +6357,12 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/files/mask-layers: get: operationId: getMaskLayers @@ -4903,9 +6461,15 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/invites/{token}/accept: post: - operationId: acceptInvite + operationId: acceptWorkspaceInvite tags: [workspace] summary: Accept a workspace invitation description: "[cloud-only] Accepts a workspace invitation using the invite token. The authenticated user is added to the workspace." 
@@ -4923,7 +6487,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/Workspace" + $ref: "#/components/schemas/AcceptInviteResponse" "400": description: Bad request content: @@ -4943,6 +6507,24 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '403': + description: Email does not match invite + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '409': + description: Already a member of this workspace + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/secrets: get: operationId: listSecrets @@ -4965,6 +6547,18 @@ paths: application/json: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '503': + description: Service unavailable - feature is disabled + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' post: operationId: createSecret tags: [settings] @@ -4993,7 +6587,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/SecretMeta" + $ref: "#/components/schemas/SecretResponse" "400": description: Bad request content: @@ -5007,6 +6601,30 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '409': + description: Conflict - secret with this name or provider already exists + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '422': + description: Validation error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '503': + description: Service unavailable - secrets feature disabled + content: + application/json: + schema: + $ref: 
'#/components/schemas/ErrorResponse' /api/secrets/{id}: get: operationId: getSecret @@ -5028,7 +6646,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/SecretMeta" + $ref: "#/components/schemas/SecretResponse" "401": description: Unauthorized content: @@ -5041,6 +6659,24 @@ paths: application/json: schema: $ref: "#/components/schemas/CloudError" + '403': + description: Forbidden - user does not own this secret + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '503': + description: Service unavailable - secrets feature disabled + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' patch: operationId: updateSecret tags: [settings] @@ -5067,7 +6703,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/SecretMeta" + $ref: "#/components/schemas/SecretResponse" "400": description: Bad request content: @@ -5092,6 +6728,24 @@ paths: application/json: schema: $ref: "#/components/schemas/CloudError" + '403': + description: Forbidden - user does not own this secret + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '503': + description: Service unavailable - secrets feature disabled + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' delete: operationId: deleteSecret tags: [settings] @@ -5121,9 +6775,27 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '403': + description: Forbidden - user does not own this secret + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: 
'#/components/schemas/ErrorResponse' + '503': + description: Service unavailable - secrets feature disabled + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/user: get: - operationId: getCloudUser + operationId: getUser tags: [user] summary: Get the authenticated cloud user description: "[cloud-only] Returns the profile and account information for the currently authenticated user." @@ -5134,7 +6806,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/CloudUser" + $ref: "#/components/schemas/UserResponse" "401": description: Unauthorized content: @@ -5212,8 +6884,14 @@ paths: application/json: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' post: - operationId: publishUserdataFile + operationId: postUserdataFilePublish tags: [userdata] summary: Publish a userdata file to the cloud description: "[cloud-only] Makes a userdata file available via a public URL for sharing or embedding." @@ -5250,9 +6928,21 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '400': + description: Bad request + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/vhs/queryvideo: get: - operationId: queryVhsVideo + operationId: getVhsQueryVideo tags: [view] summary: Query VHS video metadata description: "[cloud-only] Returns metadata about a video file processed by the VHS (Video Helper Suite) integration." @@ -5296,6 +6986,15 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '400': + description: 'Missing required query parameter. Produced by the oapi-codegen + wrapper via echo.NewHTTPError, so the body shape matches Echo''s + default HTTPError serialization rather than ErrorResponse. 
+ ' + content: + application/json: + schema: + $ref: '#/components/schemas/BindingErrorResponse' /api/vhs/viewaudio: get: operationId: viewVhsAudio @@ -5516,6 +7215,12 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /api/tasks/{task_id}: get: operationId: getTask @@ -5551,6 +7256,12 @@ paths: schema: $ref: "#/components/schemas/CloudError" + '500': + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' components: parameters: ComfyUserHeader: @@ -5771,6 +7482,25 @@ components: type: string description: Array of prompt IDs to delete from queue + QueueManageResponse: + type: object + x-runtime: [cloud] + description: >- + [cloud-only] Result of a queue mutation. The Cloud runtime returns which + items were deleted and whether the queue was cleared; local ComfyUI + returns an empty 200 body. + properties: + deleted: + type: array + nullable: true + items: + type: string + description: Prompt IDs that were deleted from the queue. + cleared: + type: boolean + nullable: true + description: Whether the queue was cleared. + # ------------------------------------------------------------------- # History # ------------------------------------------------------------------- @@ -5825,16 +7555,25 @@ components: description: Unique job identifier (same as prompt_id) status: type: string + enum: + - pending + - in_progress + - completed + - failed + - cancelled description: Current job status create_time: - type: number - description: Job creation timestamp + type: integer + format: int64 + description: Job creation timestamp (Unix milliseconds). execution_start_time: - type: number - description: Workflow execution start timestamp + type: integer + format: int64 + description: Workflow execution start timestamp (Unix milliseconds, terminal states only). 
execution_end_time: - type: number - description: Workflow execution end timestamp + type: integer + format: int64 + description: Workflow execution end timestamp (Unix milliseconds, terminal states only). preview_output: type: object additionalProperties: true @@ -5842,6 +7581,16 @@ components: outputs_count: type: integer description: Total number of output files + workflow_id: + type: string + nullable: true + x-runtime: [cloud] + description: "[cloud-only] UUID of the Cloud workflow entity this job is associated with. Local ComfyUI returns null." + execution_error: + x-runtime: [cloud] + description: "[cloud-only] Detailed execution error from ComfyUI for failed jobs. Absent on local ComfyUI." + allOf: + - $ref: "#/components/schemas/ExecutionError" JobDetailResponse: type: object @@ -5855,6 +7604,12 @@ components: format: uuid status: type: string + enum: + - pending + - in_progress + - completed + - failed + - cancelled workflow: type: object additionalProperties: true @@ -5866,13 +7621,21 @@ components: execution_error: $ref: "#/components/schemas/ExecutionError" create_time: - type: number + type: integer + format: int64 + description: Job creation timestamp (Unix milliseconds). update_time: - type: number + type: integer + format: int64 + description: Last state-change timestamp (Unix milliseconds). execution_start_time: - type: number + type: integer + format: int64 + description: Workflow execution start timestamp (Unix milliseconds, terminal states only). execution_end_time: - type: number + type: integer + format: int64 + description: Workflow execution end timestamp (Unix milliseconds, terminal states only). preview_output: type: object additionalProperties: true @@ -6351,14 +8114,6 @@ components: type: integer format: int64 description: Size of the asset in bytes - width: - type: integer - nullable: true - description: "Original image width in pixels. Null for non-image assets or assets ingested before dimension extraction." 
- height: - type: integer - nullable: true - description: "Original image height in pixels. Null for non-image assets or assets ingested before dimension extraction." mime_type: type: string description: MIME type of the asset @@ -7094,24 +8849,35 @@ components: type: string description: Target path on the runtime filesystem - AssetImportRequest: + ImportPublishedAssetsRequest: type: object x-runtime: [cloud] - description: "[cloud-only] A single asset to import from an external URL." + description: "[cloud-only] Request body for importing published assets into the caller's library." required: - - url + - published_asset_ids properties: - url: - type: string - format: uri - description: URL of the asset to import - name: - type: string - description: Display name for the imported asset - tags: + published_asset_ids: type: array + description: IDs of published assets (inputs and models) to import. items: type: string + share_id: + type: string + nullable: true + description: | + Optional. Share ID of the published workflow these assets belong to. When provided (non-null, non-empty): all `published_asset_ids` must belong to this share's workflow version; returns 400 if the share is not found or any asset does not belong to it. When omitted, null, or empty string: no share-scoped validation is performed and the assets are validated only against global rules (preserved for clients that have not yet adopted `share_id`). + + ImportPublishedAssetsResponse: + type: object + x-runtime: [cloud] + description: "[cloud-only] Response after importing published assets. Each returned `AssetInfo.id` is the caller's newly-created private asset ID, not the published asset ID supplied in the request." 
+ required: + - assets + properties: + assets: + type: array + items: + $ref: "#/components/schemas/AssetInfo" RemoteAssetMetadata: type: object @@ -7428,6 +9194,325 @@ components: description: RSA exponent (base64url) additionalProperties: true + OAuthAuthorizationServerMetadata: + type: object + x-runtime: [cloud] + description: "[cloud-only] OAuth 2.1 authorization-server metadata (RFC 8414)." + required: + - issuer + - authorization_endpoint + - token_endpoint + - jwks_uri + - response_types_supported + - grant_types_supported + - code_challenge_methods_supported + - token_endpoint_auth_methods_supported + properties: + issuer: + type: string + format: uri + authorization_endpoint: + type: string + format: uri + token_endpoint: + type: string + format: uri + jwks_uri: + type: string + format: uri + registration_endpoint: + type: string + format: uri + description: "[cloud-only] RFC 7591 §3.1 Dynamic Client Registration endpoint. Advertised so MCP-spec-compliant clients can auto-discover and self-register without operator involvement. Present only when DCR is enabled." + response_types_supported: + type: array + items: + type: string + grant_types_supported: + type: array + items: + type: string + code_challenge_methods_supported: + type: array + items: + type: string + token_endpoint_auth_methods_supported: + type: array + items: + type: string + scopes_supported: + type: array + items: + type: string + + OAuthProtectedResourceMetadata: + type: object + x-runtime: [cloud] + description: "[cloud-only] OAuth 2.1 protected-resource metadata (RFC 9728)." 
+ required: + - resource + - authorization_servers + - scopes_supported + properties: + resource: + type: string + format: uri + authorization_servers: + type: array + items: + type: string + format: uri + scopes_supported: + type: array + items: + type: string + bearer_methods_supported: + type: array + items: + type: string + + OAuthConsentChallenge: + type: object + x-runtime: [cloud] + description: "[cloud-only] Server-side state describing the OAuth consent decision the user is being asked to make. Returned by GET /oauth/authorize when a valid session exists; the frontend renders the consent UI from this payload and POSTs the decision back. Browser never sees the original OAuth params on resume." + required: + - oauth_request_id + - csrf_token + - client_display_name + - resource_display_name + - scopes + - workspaces + properties: + oauth_request_id: + type: string + format: uuid + description: Opaque server-side identifier for the authorization-request row. Carried back unchanged in the consent submission. + csrf_token: + type: string + description: Per-row CSRF token bound to this authorization request (not to the session). Must be echoed back on POST. + client_display_name: + type: string + description: Human-readable name of the OAuth client requesting authorization. + resource_display_name: + type: string + description: Human-readable name of the protected resource. + scopes: + type: array + description: Scopes the client is requesting for this resource. The frontend should present these for the user to approve. + items: + type: string + workspaces: + type: array + description: Workspaces the user can select from. Membership is re-checked on POST. + items: + $ref: "#/components/schemas/OAuthConsentChallengeWorkspace" + + OAuthConsentChallengeWorkspace: + type: object + x-runtime: [cloud] + description: "[cloud-only] One workspace option presented in the OAuth consent challenge." 
+ required: [id, name, type, role] + properties: + id: { type: string } + name: { type: string } + type: { type: string, enum: [personal, team] } + role: { type: string, enum: [owner, member] } + + OAuthAuthorizeRedirectResponse: + type: object + x-runtime: [cloud] + description: "[cloud-only] Redirect target produced after a JSON consent submission. The frontend must navigate the browser to this URL so custom-scheme client callbacks work without relying on fetch-visible 302 headers." + required: + - redirect_url + properties: + redirect_url: + type: string + format: uri + description: OAuth client redirect URI with either code+state for allow, or error+state for deny. + + OAuthTokenResponse: + type: object + x-runtime: [cloud] + description: "[cloud-only] RFC 6749 §5.1 successful token response." + required: [access_token, token_type, expires_in, refresh_token, scope] + properties: + access_token: + type: string + description: Resource-bound access token (audience matches the protected resource). + token_type: + type: string + enum: [Bearer] + expires_in: + type: integer + description: Access token lifetime in seconds. + refresh_token: + type: string + description: Opaque refresh token. Rotates on every successful refresh; presenting an already-rotated token revokes the entire family. + scope: + type: string + description: Space-delimited scopes granted with this token. + + OAuthTokenError: + type: object + x-runtime: [cloud] + description: "[cloud-only] RFC 6749 §5.2 error response." + required: [error] + properties: + error: + type: string + description: 'RFC 6749 §5.2 error code: invalid_request, invalid_client, invalid_grant, unauthorized_client, unsupported_grant_type, invalid_scope.' + error_description: + type: string + description: Human-readable, no leak of internal storage state. + + OAuthRegisterRequest: + type: object + x-runtime: [cloud] + additionalProperties: false + description: "[cloud-only] RFC 7591 §2 client metadata document. 
Only the fields the server honors are listed; presence of `scope` or `resource_grants` in the request is rejected (`invalid_client_metadata`) because those are server-owned for dynamic clients." + required: + - redirect_uris + - application_type + properties: + redirect_uris: + type: array + items: + type: string + minItems: 1 + maxItems: 5 + description: 1–5 redirect URIs. Validated against `application_type` policy. + client_name: + type: string + maxLength: 100 + description: Human-readable name shown in the consent UI. Reserved-name list rejects impersonation of major clients. + application_type: + type: string + enum: [native, web] + description: | + RFC 7591 §2 application_type. **REQUIRED** — clients MUST declare intent; the server does not default this field. `native` for desktop / CLI / MCP-spec-strict clients (loopback redirects); `web` for hosted clients (HTTPS only, host must be allowlisted). A missing or explicitly empty `application_type` rejects with `invalid_client_metadata`. + token_endpoint_auth_method: + type: string + enum: [none] + description: 'Public clients only this phase — must be `none` if present. The server forces `none` regardless.' + grant_types: + type: array + items: + type: string + enum: [authorization_code, refresh_token] + description: Optional. Defaults to `["authorization_code","refresh_token"]`. + response_types: + type: array + items: + type: string + enum: [code] + description: Optional. Defaults to `["code"]`. + scope: + type: string + nullable: true + description: "**REJECTED IF PRESENT.** Dynamic clients do not pick scopes — the server assigns scopes from the active resource's published list. Sending `scope` in the registration body is treated as a privilege-escalation attempt and returns `invalid_client_metadata`." + resource_grants: + type: object + nullable: true + additionalProperties: + type: array + items: + type: string + description: "**REJECTED IF PRESENT.** Same reason as `scope`. 
The set of resources and scopes a dynamic client may request is server-policy, not request-driven." + client_uri: + type: string + nullable: true + description: "**REJECTED IF PRESENT.** Unsupported RFC 7591 metadata for this public-client phase." + logo_uri: + type: string + nullable: true + description: "**REJECTED IF PRESENT.** Unsupported RFC 7591 metadata for this public-client phase." + tos_uri: + type: string + nullable: true + description: "**REJECTED IF PRESENT.** Unsupported RFC 7591 metadata for this public-client phase." + policy_uri: + type: string + nullable: true + description: "**REJECTED IF PRESENT.** Unsupported RFC 7591 metadata for this public-client phase." + software_id: + type: string + nullable: true + description: "**REJECTED IF PRESENT.** Unsupported RFC 7591 metadata for this public-client phase." + software_version: + type: string + nullable: true + description: "**REJECTED IF PRESENT.** Unsupported RFC 7591 metadata for this public-client phase." + contacts: + type: array + nullable: true + items: + type: string + description: "**REJECTED IF PRESENT.** Unsupported RFC 7591 metadata for this public-client phase." + jwks: + type: object + nullable: true + additionalProperties: true + description: "**REJECTED IF PRESENT.** Unsupported RFC 7591 metadata for this public-client phase." + jwks_uri: + type: string + nullable: true + description: "**REJECTED IF PRESENT.** Unsupported RFC 7591 metadata for this public-client phase." + + OAuthRegisterResponse: + type: object + x-runtime: [cloud] + description: "[cloud-only] RFC 7591 §3.2.1 successful registration response." + required: + - client_id + - client_id_issued_at + - redirect_uris + - grant_types + - response_types + - token_endpoint_auth_method + - application_type + properties: + client_id: + type: string + description: Server-generated client_id. + client_id_issued_at: + type: integer + format: int64 + description: Unix timestamp (seconds) when the client was registered. 
+ client_name: + type: string + redirect_uris: + type: array + items: + type: string + grant_types: + type: array + items: + type: string + response_types: + type: array + items: + type: string + token_endpoint_auth_method: + type: string + enum: [none] + application_type: + type: string + enum: [native, web] + + OAuthRegisterError: + type: object + x-runtime: [cloud] + description: "[cloud-only] RFC 7591 §3.2.2 error response." + required: + - error + properties: + error: + type: string + enum: [invalid_redirect_uri, invalid_client_metadata] + error_description: + type: string + nullable: true + BillingBalance: type: object x-runtime: [cloud] @@ -7545,16 +9630,15 @@ components: description: List of plan features BillingStatus: - type: object + type: string x-runtime: [cloud] - description: "[cloud-only] Overall billing and subscription status." - properties: - subscription: - $ref: "#/components/schemas/BillingSubscription" - balance: - $ref: "#/components/schemas/BillingBalance" - has_payment_method: - type: boolean + description: "[cloud-only] Overall billing/payment lifecycle status." + enum: + - awaiting_payment_method + - pending_payment + - paid + - payment_failed + - inactive BillingSubscription: type: object @@ -7616,6 +9700,12 @@ components: type: string name: type: string + type: + type: string + enum: + - personal + - team + description: Workspace type (personal vs. team). owner_id: type: string member_count: @@ -7685,11 +9775,16 @@ components: required: - id - name + - description properties: id: type: string name: type: string + description: + type: string + maxLength: 5000 + description: User-provided description of the key's purpose. Always present in responses; empty string when no description was supplied on create. 
prefix: type: string description: First few characters of the key for identification @@ -7710,12 +9805,17 @@ components: required: - id - name + - description - key properties: id: type: string name: type: string + description: + type: string + maxLength: 5000 + description: User-provided description of the key's purpose. Always present in responses; empty string when no description was supplied on create. key: type: string description: Full API key value (only returned on creation) @@ -8116,4 +10216,1534 @@ components: items: $ref: "#/components/schemas/TaskEntry" pagination: - $ref: "#/components/schemas/PaginationInfo" \ No newline at end of file + $ref: "#/components/schemas/PaginationInfo" + + # ===== Cloud-only schemas (Comfy-Org/cloud runtime, BE-1106) ===== + AssetDownloadResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] Acknowledgement of an async asset download task; clients poll GET /api/tasks/{task_id} for status.' + required: + - task_id + - status + properties: + task_id: + type: string + format: uuid + description: Task ID for tracking download progress via GET /api/tasks/{task_id} + status: + type: string + enum: + - created + - running + - completed + - failed + description: Current task status + message: + type: string + description: Human-readable message + example: Download task created. Use task_id to track progress. + + AssetMetadataResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] Metadata for a remotely hosted asset resolved by URL.' 
+ required: + - content_length + properties: + content_length: + type: integer + format: int64 + description: Size of the asset in bytes (-1 if unknown) + example: 4294967296 + content_type: + type: string + description: MIME type of the asset + example: application/octet-stream + filename: + type: string + description: Suggested filename for the asset from source + example: realistic-vision-v5.safetensors + name: + type: string + description: Display name or title for the asset from source + example: Realistic Vision v5.0 + tags: + type: array + items: + type: string + description: Tags for categorization from source + example: + - models + - checkpoint + preview_image: + type: string + description: Preview image as base64-encoded data URL + example: data:image/jpeg;base64,/9j/4AAQSkZJRg... + validation: + description: Validation results for the file + allOf: + - $ref: '#/components/schemas/ValidationResult' + + BillingBalanceResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] Current credit balance and usage details for a workspace.' + required: + - amount_micros + - currency + properties: + amount_micros: + type: number + format: double + description: The total remaining balance in microamount (1/1,000,000 of the currency unit) + prepaid_balance_micros: + type: number + format: double + description: The remaining balance from prepaid commits in microamount + cloud_credit_balance_micros: + type: number + format: double + description: The remaining balance from cloud credits in microamount + pending_charges_micros: + type: number + format: double + description: The total amount of pending/unbilled charges from draft invoices in microamount + effective_balance_micros: + type: number + format: double + description: The effective balance (total balance minus pending charges). Can be negative if pending charges exceed + the balance. 
+ currency: + type: string + example: usd + description: Currency code + + BillingPlansResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] List of available billing plans for subscription.' + required: + - plans + properties: + current_plan_slug: + type: string + description: Current plan slug if subscribed + plans: + type: array + items: + $ref: '#/components/schemas/Plan' + + BillingStatusResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] Current billing and subscription status for a workspace.' + required: + - is_active + - has_funds + properties: + is_active: + type: boolean + description: Whether the workspace has an active subscription + subscription_status: + type: string + enum: + - active + - ended + - canceled + description: Subscription activity status (scheduled subscriptions are not returned) + subscription_tier: + $ref: '#/components/schemas/SubscriptionTier' + subscription_duration: + $ref: '#/components/schemas/SubscriptionDuration' + plan_slug: + type: string + description: Plan identifier (e.g., standard-monthly, team-pro-annual) + billing_status: + $ref: '#/components/schemas/BillingStatus' + has_funds: + type: boolean + description: Whether the workspace has available credits + cancel_at: + type: string + format: date-time + description: When the subscription will become inactive (if canceled) + renewal_date: + type: string + format: date-time + description: When the current billing period ends and the next one begins + + GetUserDataResponseFull: + type: array + x-runtime: [cloud] + description: '[cloud-only] List of user data file entries (each with path, size, and modification time) returned when full_info=true.' 
+ items: + $ref: '#/components/schemas/GetUserDataResponseFullFile' + + HistoryDetailEntry: + type: object + x-runtime: [cloud] + description: '[cloud-only] History entry with full prompt data' + properties: + prompt: + type: object + description: Full prompt execution data + properties: + priority: + type: number + format: double + description: Execution priority + prompt_id: + type: string + description: The prompt ID + prompt: + type: object + description: The workflow nodes + additionalProperties: true + extra_data: + type: object + description: Additional execution data + additionalProperties: true + outputs_to_execute: + type: array + items: + type: string + description: Output nodes to execute + outputs: + type: object + description: Output data from execution (generated images, files, etc.) + additionalProperties: true + status: + type: object + description: Execution status and timeline information + additionalProperties: true + meta: + type: object + description: Metadata about the execution and nodes + additionalProperties: true + + HistoryDetailResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] Detailed execution history response for a specific prompt. + + Returns a dictionary with prompt_id as key and full history data as value. + + ' + additionalProperties: + $ref: '#/components/schemas/HistoryDetailEntry' + + HistoryResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] Execution history response with history array. + + Returns an object with a "history" key containing an array of history entries. + + Each entry includes prompt_id as a property along with execution data. + + ' + required: + - history + properties: + history: + type: array + description: Array of history entries ordered by creation time (newest first) + items: + $ref: '#/components/schemas/HistoryEntry' + + HubLabelInfo: + type: object + x-runtime: [cloud] + description: '[cloud-only] Metadata for a single Hub label.' 
+ required: + - name + - display_name + - type + properties: + name: + type: string + description: Slug identifier. + display_name: + type: string + description: Human-readable display name. + description: + type: string + description: Optional description of the label. + type: + type: string + enum: + - tag + - model + - custom_node + description: Label category. + + HubLabelListResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] Response wrapper for the available Hub label catalog.' + required: + - labels + properties: + labels: + type: array + items: + $ref: '#/components/schemas/HubLabelInfo' + description: Available labels, optionally filtered by type. + + HubProfileSummary: + type: object + x-runtime: [cloud] + description: '[cloud-only] Abbreviated Hub profile used in workflow listings.' + required: + - username + properties: + username: + type: string + display_name: + type: string + avatar_url: + type: string + description: Public URL of the profile avatar image. + + HubWorkflowListResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] Paginated list of Hub workflows matching search criteria.' + required: + - workflows + properties: + workflows: + type: array + items: + anyOf: + - $ref: '#/components/schemas/HubWorkflowSummary' + - $ref: '#/components/schemas/HubWorkflowDetail' + description: Array of HubWorkflowSummary (default) or HubWorkflowDetail (when detail=true). + next_cursor: + type: string + description: Cursor for the next page, empty if no more results. + + HubWorkflowStatus: + type: string + x-runtime: [cloud] + description: '[cloud-only] Public workflow status. NULL in the database is represented as pending in API responses.' + enum: + - pending + - approved + - rejected + - deprecated + + HubWorkflowSummary: + type: object + x-runtime: [cloud] + description: '[cloud-only] Abbreviated Hub workflow metadata used in search and listing results.' 
+ required: + - share_id + - name + - profile + - status + properties: + share_id: + type: string + name: + type: string + status: + $ref: '#/components/schemas/HubWorkflowStatus' + description: + type: string + tags: + type: array + items: + $ref: '#/components/schemas/LabelRef' + models: + type: array + items: + $ref: '#/components/schemas/LabelRef' + custom_nodes: + type: array + items: + $ref: '#/components/schemas/LabelRef' + thumbnail_type: + type: string + enum: + - image + - video + - image_comparison + thumbnail_url: + type: string + thumbnail_comparison_url: + type: string + publish_time: + type: string + format: date-time + nullable: true + profile: + $ref: '#/components/schemas/HubProfileSummary' + metadata: + type: object + additionalProperties: true + tutorial_url: + type: string + sample_image_urls: + type: array + items: + type: string + + HubWorkflowTemplateEntry: + type: object + x-runtime: [cloud] + description: '[cloud-only] Entry in the curated workflow template gallery shown on the home page.' + required: + - name + - title + - status + properties: + name: + type: string + description: Slug identifier for the template + title: + type: string + status: + $ref: '#/components/schemas/HubWorkflowStatus' + description: + type: string + tags: + type: array + items: + type: string + models: + type: array + items: + type: string + requiresCustomNodes: + type: array + items: + type: string + thumbnailVariant: + type: string + mediaType: + type: string + mediaSubtype: + type: string + size: + type: integer + format: int64 + description: Workflow asset size in bytes. + vram: + type: integer + format: int64 + description: Approximate VRAM requirement in bytes. + usage: + type: integer + format: int64 + description: Usage count reported upstream. + searchRank: + type: integer + format: int64 + description: Search ranking score reported upstream. + isEssential: + type: boolean + description: Whether the template belongs to a module marked as essential. 
+ openSource: + type: boolean + profile: + $ref: '#/components/schemas/HubProfileSummary' + tutorialUrl: + type: string + logos: + type: array + items: + type: object + additionalProperties: true + date: + type: string + description: Publication date in YYYY-MM-DD format + io: + type: object + properties: + inputs: + type: array + items: + type: object + additionalProperties: true + outputs: + type: array + items: + type: object + additionalProperties: true + includeOnDistributions: + type: array + items: + type: string + thumbnailUrl: + type: string + description: Public URL of the primary thumbnail + thumbnailComparisonUrl: + type: string + description: Public URL of the comparison thumbnail + shareId: + type: string + description: Share ID for linking to the hub workflow detail + extendedDescription: + type: string + description: AI-generated extended description of the workflow + metaDescription: + type: string + description: AI-generated SEO meta description (under 160 chars) + howToUse: + type: array + items: + type: string + description: AI-generated step-by-step usage instructions + suggestedUseCases: + type: array + items: + type: string + description: AI-generated suggested use cases + faqItems: + type: array + items: + type: object + required: + - question + - answer + properties: + question: + type: string + answer: + type: string + description: AI-generated FAQ items + contentTemplate: + type: string + description: Content template used for generation (tutorial, showcase, comparison, breakthrough) + + JobStatusResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] Job status information' + properties: + id: + type: string + format: uuid + description: The job ID + status: + type: string + enum: + - waiting_to_dispatch + - pending + - in_progress + - completed + - error + - cancelled + description: Current job status + created_at: + type: string + format: date-time + description: When the job was created + updated_at: + type: string 
+ format: date-time + description: When the job was last updated + last_state_update: + type: string + format: date-time + description: When the job status was last changed + assigned_inference: + type: string + nullable: true + description: The inference instance assigned to this job (if any) + error_message: + type: string + nullable: true + description: Error message if the job failed + required: + - id + - status + - created_at + - updated_at + + JobsListResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] Paginated list of jobs for the authenticated user.' + required: + - jobs + - pagination + properties: + jobs: + type: array + description: Array of jobs ordered by specified sort field + items: + $ref: '#/components/schemas/JobEntry' + pagination: + $ref: '#/components/schemas/PaginationInfo' + + LabelRef: + type: object + x-runtime: [cloud] + description: '[cloud-only] Reference to a Hub label by ID.' + required: + - name + - display_name + properties: + name: + type: string + description: Slug identifier (e.g. "video-generation", "flux"). + display_name: + type: string + description: Human-readable display name (e.g. "Video Generation", "Flux"). + + LogsResponse: + type: array + x-runtime: [cloud] + description: '[cloud-only] System logs response' + items: + type: object + properties: + timestamp: + type: string + format: date-time + description: When the log entry was created + level: + type: string + enum: + - debug + - info + - warn + - error + description: Log level + message: + type: string + description: Log message + source: + type: string + description: Source of the log entry + metadata: + type: object + additionalProperties: true + description: Additional log metadata + + Member: + type: object + x-runtime: [cloud] + description: '[cloud-only] Workspace member with profile and role information.' 
+ required: + - id + - name + - email + - role + - joined_at + properties: + id: + type: string + description: User ID + name: + type: string + description: User's display name + email: + type: string + format: email + description: User's email address + role: + type: string + enum: + - owner + - member + description: User's role in the workspace + joined_at: + type: string + format: date-time + description: When the user joined the workspace + + OAuthRegisterBadRequestResponse: + x-runtime: [cloud] + description: "[cloud-only] Union of the two 400 shapes /oauth/register can emit. `OAuthRegisterError` is the handler-shaped\ + \ RFC 7591 \xA73.2.2 error; `BindingErrorResponse` is the strict-server binding-layer error fired when the request body\ + \ fails OpenAPI-schema validation before the handler runs.\n" + oneOf: + - $ref: '#/components/schemas/OAuthRegisterError' + - $ref: '#/components/schemas/BindingErrorResponse' + + PendingInvite: + type: object + x-runtime: [cloud] + description: '[cloud-only] An outstanding workspace invitation that has not yet been accepted.' + required: + - id + - email + - invited_at + - expires_at + properties: + id: + type: string + description: Invite ID + email: + type: string + format: email + description: Email address of the invited user + token: + type: string + description: Invite token for constructing invite links. Empty for expired invites. + invited_at: + type: string + format: date-time + description: When the invite was created + expires_at: + type: string + format: date-time + description: When the invite expires + + Plan: + type: object + x-runtime: [cloud] + description: '[cloud-only] Billing plan details including pricing, limits, and features.' 
+ required: + - slug + - tier + - duration + - price_cents + - credits_cents + - max_seats + - availability + - seat_summary + properties: + slug: + type: string + description: Plan identifier (e.g., "pro-monthly", "team-standard-annual") + example: pro-monthly + tier: + $ref: '#/components/schemas/SubscriptionTier' + duration: + $ref: '#/components/schemas/SubscriptionDuration' + price_cents: + type: integer + format: int64 + description: Per-member price in cents (base + one seat) + example: 10000 + credits_cents: + type: integer + format: int64 + description: Per-member credits in cents (base + one seat) + example: 10000 + max_seats: + type: integer + format: int64 + description: Maximum number of seats allowed for this plan + example: 20 + availability: + $ref: '#/components/schemas/PlanAvailability' + seat_summary: + $ref: '#/components/schemas/PlanSeatSummary' + + PlanAvailability: + type: object + x-runtime: [cloud] + description: '[cloud-only] Availability and eligibility information for a billing plan.' 
+ required: + - available + properties: + available: + type: boolean + description: Whether the workspace can subscribe to this plan + reason: + $ref: '#/components/schemas/PlanAvailabilityReason' + + PlanAvailabilityReason: + type: string + x-runtime: [cloud] + enum: + - same_plan + - incompatible_transition + - requires_team + - requires_personal + - exceeds_max_seats + description: '[cloud-only] Reason why a plan is unavailable' + + PlanSeatSummary: + type: object + x-runtime: [cloud] + description: '[cloud-only] Summary of seat costs based on current workspace members' + required: + - seat_count + - total_cost_cents + - total_credits_cents + properties: + seat_count: + type: integer + description: Total number of seats (owner + members) that would be charged + example: 5 + total_cost_cents: + type: integer + format: int64 + description: Total cost for all seats in cents + example: 50000 + total_credits_cents: + type: integer + format: int64 + description: Total credits granted for all seats in cents + example: 50000 + + PreviewPlanInfo: + type: object + x-runtime: [cloud] + description: '[cloud-only] Plan information for preview display' + required: + - slug + - tier + - duration + - price_cents + - credits_cents + - seat_summary + properties: + slug: + type: string + description: Plan slug + example: team-pro-monthly + tier: + $ref: '#/components/schemas/SubscriptionTier' + duration: + $ref: '#/components/schemas/SubscriptionDuration' + price_cents: + type: integer + format: int64 + description: Per-seat price in cents + example: 10000 + credits_cents: + type: integer + format: int64 + description: Per-seat credits in cents + example: 10000 + seat_summary: + $ref: '#/components/schemas/PlanSeatSummary' + period_start: + type: string + format: date-time + description: Current billing period start (only for current_plan) + period_end: + type: string + format: date-time + description: Current billing period end (only for current_plan) + + 
PreviewSubscribeResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] Itemized cost preview for a pending subscription change.' + required: + - allowed + - transition_type + - effective_at + - is_immediate + - cost_today_cents + - cost_next_period_cents + - credits_today_cents + - credits_next_period_cents + - new_plan + properties: + allowed: + type: boolean + description: Whether this subscription change is allowed + reason: + type: string + description: Reason why the change is not allowed (only present if allowed=false) + transition_type: + type: string + enum: + - new_subscription + - upgrade + - downgrade + - duration_change + description: Type of subscription transition + effective_at: + type: string + format: date-time + description: When the change takes effect + is_immediate: + type: boolean + description: Whether the change takes effect immediately (true) or at period end (false) + cost_today_cents: + type: integer + format: int64 + description: Amount to charge today in cents (0 for downgrades) + example: 5000 + cost_next_period_cents: + type: integer + format: int64 + description: Amount that will be charged at next billing period in cents + example: 10000 + credits_today_cents: + type: integer + format: int64 + description: Credits granted today in cents (prorated for mid-period upgrades) + example: 5000 + credits_next_period_cents: + type: integer + format: int64 + description: Credits that will be granted at next billing period in cents + example: 10000 + current_plan: + $ref: '#/components/schemas/PreviewPlanInfo' + new_plan: + $ref: '#/components/schemas/PreviewPlanInfo' + + PublishedWorkflowDetail: + type: object + x-runtime: [cloud] + description: '[cloud-only] Full detail of a publicly published workflow on the Hub.' 
+ required: + - share_id + - workflow_id + - name + - listed + - workflow_json + - assets + properties: + share_id: + type: string + workflow_id: + type: string + name: + type: string + description: Human-readable workflow name. + listed: + type: boolean + publish_time: + type: string + format: date-time + nullable: true + workflow_json: + type: object + additionalProperties: true + description: The workflow JSON content at publish time. + assets: + type: array + description: Published assets with their library status for the caller. + items: + $ref: '#/components/schemas/AssetInfo' + + SecretResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] User secret metadata (the secret value itself is never returned after creation).' + required: + - id + - name + - created_at + - updated_at + properties: + id: + type: string + format: uuid + description: Unique identifier for the secret + name: + type: string + description: User-provided label for the secret + provider: + type: string + description: Provider identifier (e.g., huggingface, civitai) + last_used_at: + type: string + format: date-time + description: When the secret was last used for decryption + created_at: + type: string + format: date-time + description: When the secret was created + updated_at: + type: string + format: date-time + description: When the secret was last updated + + SubscriptionDuration: + type: string + x-runtime: [cloud] + enum: + - MONTHLY + - ANNUAL + description: '[cloud-only] Billing period (uppercase to match comfy-api)' + + SubscriptionTier: + type: string + x-runtime: [cloud] + enum: + - FREE + - STANDARD + - CREATOR + - PRO + - FOUNDERS_EDITION + description: '[cloud-only] Subscription tier (uppercase to match comfy-api)' + + UserDataResponseFull: + type: object + x-runtime: [cloud] + description: '[cloud-only] User data listing entry with file metadata (path, size, modification time).' 
+ properties: + path: + type: string + size: + type: integer + modified: + type: integer + format: int64 + description: UNIX timestamp of the last modification in milliseconds. + + ValidationError: + type: object + x-runtime: [cloud] + description: '[cloud-only] Details of a single validation error encountered during asset operations.' + required: + - code + - message + - field + properties: + code: + type: string + description: Machine-readable error code + example: FORMAT_NOT_ALLOWED + message: + type: string + description: Human-readable error message + example: 'File format "PickleTensor" is not allowed. Allowed formats: [SafeTensor]' + field: + type: string + description: Field that failed validation + example: format + + ValidationResult: + type: object + x-runtime: [cloud] + description: '[cloud-only] Result of validating a set of asset operations.' + required: + - is_valid + properties: + is_valid: + type: boolean + description: Overall validation status (true if all checks passed) + example: true + errors: + type: array + items: + $ref: '#/components/schemas/ValidationError' + description: Blocking validation errors that prevent download + warnings: + type: array + items: + $ref: '#/components/schemas/ValidationError' + description: Non-blocking validation warnings (informational only) + + WorkflowForkedFrom: + type: object + x-runtime: [cloud] + description: '[cloud-only] Reference to the parent workflow from which this workflow was forked.' + properties: + workflow_id: + type: string + workflow_version_id: + type: string + + WorkflowResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] Full workflow entity including metadata and version history.' 
+ required: + - id + - latest_version + - created_by + - created_at + - updated_at + properties: + id: + type: string + name: + type: string + description: + type: string + default_view: + type: string + enum: + - workflow + - app + latest_version: + type: integer + forked_from: + $ref: '#/components/schemas/WorkflowForkedFrom' + created_by: + type: string + created_at: + type: string + format: date-time + updated_at: + type: string + format: date-time + + WorkflowVersionContentResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] Full workflow version including the serialized workflow JSON.' + required: + - id + - version + - workflow_json + - created_by + - created_at + properties: + id: + type: string + version: + type: integer + workflow_json: + type: object + additionalProperties: true + created_by: + type: string + created_at: + type: string + format: date-time + dependency_asset_ids: + type: array + items: + type: string + + WorkspaceAPIKeyInfo: + type: object + x-runtime: [cloud] + description: '[cloud-only] Metadata for a workspace-scoped API key (secret is never returned).' + required: + - id + - workspace_id + - user_id + - name + - description + - key_prefix + - created_at + properties: + id: + type: string + format: uuid + description: API key ID + workspace_id: + type: string + description: Workspace this key belongs to + user_id: + type: string + description: User who created this key + name: + type: string + description: User-provided label + description: + type: string + description: User-provided description of the key's purpose. Limit is byte-based (UTF-8 encoding); 5000 bytes equals + 5000 ASCII characters or fewer multi-byte characters. 
+ maxLength: 5000 + key_prefix: + type: string + description: First 8 chars after prefix for display + expires_at: + type: string + format: date-time + description: When the key expires (if set) + last_used_at: + type: string + format: date-time + description: Last time the key was used + revoked_at: + type: string + format: date-time + description: When the key was revoked (if revoked) + created_at: + type: string + format: date-time + description: When the key was created + + WorkspaceSummary: + type: object + x-runtime: [cloud] + description: '[cloud-only] Abbreviated workspace metadata used in list responses.' + required: + - id + - name + - type + properties: + id: + type: string + example: w-a1b2c3d4-5678-90ab-cdef-1234567890ab + name: + type: string + example: My Team + type: + type: string + enum: + - personal + - team + + WorkspaceWithRole: + type: object + x-runtime: [cloud] + description: '[cloud-only] Workspace entity annotated with the requesting user''s role.' + required: + - id + - name + - type + - role + - created_at + - joined_at + properties: + id: + type: string + example: w-a1b2c3d4-5678-90ab-cdef-1234567890ab + name: + type: string + example: My Team + type: + type: string + enum: + - personal + - team + role: + type: string + enum: + - owner + - member + created_at: + type: string + format: date-time + description: When the workspace was created + joined_at: + type: string + format: date-time + description: When the user joined the workspace (same as created_at for the workspace creator) + subscription_tier: + $ref: '#/components/schemas/SubscriptionTier' + + BindingErrorResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] Error shape returned when request binding or validation fails before the handler runs.' 
+ required: + - message + properties: + message: + type: string + + ErrorResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] Standard error response from cloud endpoints with a machine-readable code and human-readable message.' + required: + - code + - message + properties: + code: + type: string + description: Machine-readable error code + message: + type: string + description: Human-readable error message + + AcceptInviteResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] Response returned after successfully accepting a workspace invitation.' + required: + - workspace_id + - workspace_name + properties: + workspace_id: + type: string + description: ID of the workspace joined + workspace_name: + type: string + description: Name of the workspace joined + + BillingEventsResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] Paginated list of billing events for a workspace.' + required: + - total + - events + - page + - limit + - totalPages + properties: + total: + type: integer + description: Total number of events + events: + type: array + items: + $ref: '#/components/schemas/BillingEvent' + page: + type: integer + description: Current page number (1-indexed) + limit: + type: integer + description: Items per page + totalPages: + type: integer + description: Total number of pages + + BillingOpStatusResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] Status of an asynchronous billing operation.' 
+ required: + - id + - status + - started_at + properties: + id: + type: string + description: Unique identifier for the billing operation + status: + type: string + enum: + - pending + - succeeded + - failed + description: Current status of the operation + error_message: + type: string + description: Error message if status is failed + started_at: + type: string + format: date-time + description: When the operation was initiated + completed_at: + type: string + format: date-time + description: When the operation completed (success or failure) + + CancelSubscriptionResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] Response after successfully cancelling a subscription.' + required: + - cancel_at + - billing_op_id + properties: + billing_op_id: + type: string + description: Billing operation ID to poll for status via GET /api/billing/ops/{id} + cancel_at: + type: string + format: date-time + description: The date when the subscription will end (end of current billing period) + + CreateTopupResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] Response after successfully purchasing a credit top-up.' + required: + - topup_id + - status + - amount_cents + - billing_op_id + properties: + billing_op_id: + type: string + description: Billing operation ID to poll for status via GET /api/billing/ops/{id} + topup_id: + type: string + description: Unique identifier for the top-up request (same as billing_op_id, deprecated) + status: + type: string + enum: + - pending + - completed + - failed + description: Current status of the top-up + amount_cents: + type: integer + format: int64 + description: Amount being charged in cents + + CreateWorkspaceAPIKeyResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] Response containing the newly created workspace API key.' 
+ required: + - id + - name + - description + - key + - key_prefix + - created_at + properties: + id: + type: string + format: uuid + description: API key ID + name: + type: string + description: User-provided label + description: + type: string + description: User-provided description of the key's purpose. Limit is byte-based (UTF-8 encoding); 5000 bytes equals + 5000 ASCII characters or fewer multi-byte characters. + maxLength: 5000 + key: + type: string + description: The full plaintext API key (only shown once) + key_prefix: + type: string + description: First 8 chars after prefix for display + expires_at: + type: string + format: date-time + description: When the key expires (if set) + created_at: + type: string + format: date-time + description: When the key was created + + ExchangeTokenResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] Response containing the issued Cloud JWT and its expiry.' + required: + - token + - expires_at + - workspace + - role + - permissions + properties: + token: + type: string + description: Cloud JWT token + expires_at: + type: string + format: date-time + description: Token expiration time (RFC 3339) + workspace: + $ref: '#/components/schemas/WorkspaceSummary' + role: + type: string + enum: + - owner + - member + description: User's role in the workspace + permissions: + type: array + items: + type: string + description: Permission strings for the role + example: + - owner:* + + JobCancelResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] Response for POST /api/jobs/{job_id}/cancel. Returned on both fresh cancels and idempotent no-ops.' 
+ required: + - cancelled + properties: + cancelled: + type: boolean + description: "True when a cancel event was successfully dispatched by this call.\nFalse when the job was already in\ + \ a terminal or cancelling state,\nin which case the call is a no-op (still 200 \u2014 idempotent).\n" + + ResubscribeResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] Response after successfully resubscribing to a billing plan.' + required: + - status + - billing_op_id + properties: + billing_op_id: + type: string + description: Billing operation ID to poll for status via GET /api/billing/ops/{id} + status: + type: string + enum: + - active + description: The subscription status after resubscribing + message: + type: string + description: Human-readable confirmation message + + SubscribeResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] Response after successfully subscribing to a billing plan.' + required: + - status + - billing_op_id + properties: + billing_op_id: + type: string + description: Billing operation ID to poll for status via GET /api/billing/ops/{id} + status: + type: string + enum: + - subscribed + - needs_payment_method + - pending_payment + description: 'Status of the subscription operation: + + - subscribed: Subscription is active immediately + + - needs_payment_method: User must add payment method via payment_method_url + + - pending_payment: Upgrade initiated, waiting for payment to complete + + ' + effective_at: + type: string + format: date-time + description: When the subscription became/becomes active (present when status=subscribed or pending_payment) + payment_method_url: + type: string + description: URL to redirect user to add payment method (present when status=needs_payment_method) + + UserResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] User information response' + required: + - id + - status + properties: + id: + type: string + description: Firebase UID of the 
authenticated user + status: + type: string + description: User status (always "active" for authenticated users) + + WorkflowListResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] Paginated list of saved workflows.' + required: + - data + - pagination + properties: + data: + type: array + items: + $ref: '#/components/schemas/WorkflowResponse' + pagination: + $ref: '#/components/schemas/PaginationInfo' + + FeedbackRequest: + type: object + x-runtime: [cloud] + description: "[cloud-only] User feedback submission body." + required: + - message + properties: + type: + type: string + enum: + - missing_nodes + - general + - missing_models + description: Feedback category + category: + type: string + description: Additional category metadata + message: + type: string + description: User-provided feedback message diff --git a/pyproject.toml b/pyproject.toml index 0a1554428..1e449b4a3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.21.1" +version = "0.22.0" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.10" diff --git a/requirements.txt b/requirements.txt index f499a10ae..651315cb2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -comfyui-frontend-package==1.43.18 -comfyui-workflow-templates==0.9.77 -comfyui-embedded-docs==0.5.0 +comfyui-frontend-package==1.44.19 +comfyui-workflow-templates==0.9.85 +comfyui-embedded-docs==0.5.1 torch torchsde torchvision @@ -21,9 +21,9 @@ psutil alembic SQLAlchemy>=2.0.0 filelock -av>=14.2.0 -comfy-kitchen>=0.2.8 -comfy-aimdo==0.3.0 +av>=16.0.0 +comfy-kitchen==0.2.9 +comfy-aimdo==0.4.5 requests simpleeval>=1.0.0 blake3 diff --git a/server.py b/server.py index 44470b904..268441bd1 100644 --- a/server.py +++ b/server.py @@ -646,18 +646,37 @@ class PromptServer(): @routes.get("/system_stats") async def system_stats(request): - device = comfy.model_management.get_torch_device() - device_name = 
comfy.model_management.get_torch_device_name(device) + primary_device = comfy.model_management.get_torch_device() cpu_device = comfy.model_management.torch.device("cpu") ram_total = comfy.model_management.get_total_memory(cpu_device) ram_free = comfy.model_management.get_free_memory(cpu_device) - vram_total, torch_vram_total = comfy.model_management.get_total_memory(device, torch_total_too=True) - vram_free, torch_vram_free = comfy.model_management.get_free_memory(device, torch_free_too=True) required_frontend_version = FrontendManager.get_required_frontend_version() installed_templates_version = FrontendManager.get_installed_templates_version() required_templates_version = FrontendManager.get_required_templates_version() comfy_package_versions = FrontendManager.get_comfy_package_versions() + # Report every torch device visible to multigpu, with the primary + # device first so existing clients that read devices[0] keep working. + torch_devices = comfy.model_management.get_all_torch_devices() + if primary_device in torch_devices: + torch_devices = [primary_device] + [d for d in torch_devices if d != primary_device] + else: + torch_devices = [primary_device] + list(torch_devices) + + device_entries = [] + for d in torch_devices: + vram_total, torch_vram_total = comfy.model_management.get_total_memory(d, torch_total_too=True) + vram_free, torch_vram_free = comfy.model_management.get_free_memory(d, torch_free_too=True) + device_entries.append({ + "name": comfy.model_management.get_torch_device_name(d), + "type": d.type, + "index": d.index, + "vram_total": vram_total, + "vram_free": vram_free, + "torch_vram_total": torch_vram_total, + "torch_vram_free": torch_vram_free, + }) + system_stats = { "system": { "os": sys.platform, @@ -673,17 +692,7 @@ class PromptServer(): "embedded_python": os.path.split(os.path.split(sys.executable)[0])[1] == "python_embeded", "argv": sys.argv }, - "devices": [ - { - "name": device_name, - "type": device.type, - "index": device.index, - 
"vram_total": vram_total, - "vram_free": vram_free, - "torch_vram_total": torch_vram_total, - "torch_vram_free": torch_vram_free, - } - ] + "devices": device_entries } return web.json_response(system_stats) diff --git a/tests-unit/assets_test/conftest.py b/tests-unit/assets_test/conftest.py index 6c5c56113..9867b4e14 100644 --- a/tests-unit/assets_test/conftest.py +++ b/tests-unit/assets_test/conftest.py @@ -236,6 +236,8 @@ def seeded_asset(request: pytest.FixtureRequest, http: requests.Session, api_bas r = http.post(api_base + "/api/assets", files=files, data=form_data, timeout=120) body = r.json() assert r.status_code == 201, body + from helpers import assert_hash_fields_consistent + assert_hash_fields_consistent(body) return body diff --git a/tests-unit/assets_test/helpers.py b/tests-unit/assets_test/helpers.py index 770e011f4..ae3de6dc3 100644 --- a/tests-unit/assets_test/helpers.py +++ b/tests-unit/assets_test/helpers.py @@ -26,3 +26,26 @@ def trigger_sync_seed_assets(session: requests.Session, base_url: str) -> None: def get_asset_filename(asset_hash: str, extension: str) -> str: return asset_hash.removeprefix("blake3:") + extension + + +def assert_hash_fields_consistent(body: dict, expected_hash: str | None = None) -> None: + """Assert hash and asset_hash invariants on an Asset response. + + Both must be present or both absent (so a regression that drops only one + is caught). When present, they must equal each other and, if expected_hash + is provided, must equal that value. 
+ """ + hash_present = "hash" in body + asset_hash_present = "asset_hash" in body + assert hash_present == asset_hash_present, ( + f"hash and asset_hash must both be present or both absent: " + f"hash present={hash_present}, asset_hash present={asset_hash_present}" + ) + if hash_present: + h = body["hash"] + ah = body["asset_hash"] + assert h == ah, f"hash and asset_hash must match: hash={h!r}, asset_hash={ah!r}" + if expected_hash is not None: + assert h == expected_hash, ( + f"hash must equal expected: got {h!r}, expected {expected_hash!r}" + ) diff --git a/tests-unit/assets_test/test_assets_missing_sync.py b/tests-unit/assets_test/test_assets_missing_sync.py index 47dc130cb..29ec1d09d 100644 --- a/tests-unit/assets_test/test_assets_missing_sync.py +++ b/tests-unit/assets_test/test_assets_missing_sync.py @@ -40,7 +40,9 @@ def test_seed_asset_removed_when_file_is_deleted( # there should be exactly one with that name matches = [a for a in body1.get("assets", []) if a.get("name") == name] assert matches - assert matches[0].get("asset_hash") is None + # Seed assets have no hash; exclude_none drops both keys from the response + assert "asset_hash" not in matches[0] + assert "hash" not in matches[0] asset_info_id = matches[0]["id"] # Remove the underlying file and sync again diff --git a/tests-unit/assets_test/test_crud.py b/tests-unit/assets_test/test_crud.py index 07310223e..fd2e9a098 100644 --- a/tests-unit/assets_test/test_crud.py +++ b/tests-unit/assets_test/test_crud.py @@ -21,6 +21,8 @@ def test_create_from_hash_success( b1 = r1.json() assert r1.status_code == 201, b1 assert b1["asset_hash"] == h + assert b1["hash"] == h + assert b1["hash"] == b1["asset_hash"] assert b1["created_new"] is False aid = b1["id"] @@ -39,6 +41,7 @@ def test_get_and_delete_asset(http: requests.Session, api_base: str, seeded_asse detail = rg.json() assert rg.status_code == 200, detail assert detail["id"] == aid + assert detail["hash"] == detail["asset_hash"] assert "user_metadata" in 
detail assert "filename" in detail["user_metadata"] @@ -97,6 +100,7 @@ def test_delete_upon_reference_count( copy = r2.json() assert r2.status_code == 201, copy assert copy["asset_hash"] == src_hash + assert copy["hash"] == src_hash assert copy["created_new"] is False # Soft-delete original reference (default) -> asset identity must remain @@ -139,6 +143,7 @@ def test_update_asset_fields(http: requests.Session, api_base: str, seeded_asset body = ru.json() assert ru.status_code == 200, body assert body["name"] == payload["name"] + assert body["hash"] == body["asset_hash"] assert body["tags"] == original_tags # tags unchanged assert body["user_metadata"]["purpose"] == "updated" # filename should still be present and normalized by server @@ -289,7 +294,9 @@ def test_metadata_filename_is_set_for_seed_asset_without_hash( assert r1.status_code == 200, body matches = [a for a in body.get("assets", []) if a.get("name") == name] assert matches, "Seed asset should be visible after sync" - assert matches[0].get("asset_hash") is None # still a seed + # Seed assets have no hash; exclude_none drops both keys from the response + assert "asset_hash" not in matches[0] + assert "hash" not in matches[0] aid = matches[0]["id"] r2 = http.get(f"{api_base}/api/assets/{aid}", timeout=120) diff --git a/tests-unit/assets_test/test_list_filter.py b/tests-unit/assets_test/test_list_filter.py index dcb7a73ca..17bbea5c6 100644 --- a/tests-unit/assets_test/test_list_filter.py +++ b/tests-unit/assets_test/test_list_filter.py @@ -3,6 +3,7 @@ import uuid import pytest import requests +from helpers import assert_hash_fields_consistent def test_list_assets_paging_and_sort(http: requests.Session, api_base: str, asset_factory, make_asset_bytes): @@ -26,6 +27,10 @@ def test_list_assets_paging_and_sort(http: requests.Session, api_base: str, asse got1 = [a["name"] for a in b1["assets"]] assert got1 == sorted(names)[:2] assert b1["has_more"] is True + # Populated assets in list responses must carry both 
`hash` and `asset_hash` consistently + for asset in b1["assets"]: + assert_hash_fields_consistent(asset) + assert "hash" in asset, "populated asset must emit hash on list endpoint" r2 = http.get( api_base + "/api/assets", diff --git a/tests-unit/assets_test/test_uploads.py b/tests-unit/assets_test/test_uploads.py index 0f2b124a3..427a417cc 100644 --- a/tests-unit/assets_test/test_uploads.py +++ b/tests-unit/assets_test/test_uploads.py @@ -5,6 +5,20 @@ from concurrent.futures import ThreadPoolExecutor import requests import pytest +from app.assets.api.schemas_out import Asset, AssetCreated + + +def test_asset_created_inherits_hash_field(): + """AssetCreated must inherit `hash` from Asset so POST /api/assets responses emit it. + + Schema-level guard: integration tests cover the wire shape, but inheritance + drift (e.g. AssetCreated ever being redefined to no longer extend Asset) + would silently drop `hash` from a major endpoint without this check. + """ + assert "hash" in Asset.model_fields + assert "hash" in AssetCreated.model_fields + assert AssetCreated.model_fields["hash"].annotation == Asset.model_fields["hash"].annotation + def test_upload_ok_duplicate_reference(http: requests.Session, api_base: str, make_asset_bytes): name = "dup_a.safetensors" @@ -17,6 +31,7 @@ def test_upload_ok_duplicate_reference(http: requests.Session, api_base: str, ma a1 = r1.json() assert r1.status_code == 201, a1 assert a1["created_new"] is True + assert a1["hash"] == a1["asset_hash"] # Second upload with the same data and name creates a new AssetReference (duplicates allowed) # Returns 200 because Asset already exists, but a new AssetReference is created @@ -26,6 +41,7 @@ def test_upload_ok_duplicate_reference(http: requests.Session, api_base: str, ma a2 = r2.json() assert r2.status_code in (200, 201), a2 assert a2["asset_hash"] == a1["asset_hash"] + assert a2["hash"] == a1["hash"] assert a2["id"] != a1["id"] # new reference with same content # Third upload with the same data but 
different name also creates new AssetReference @@ -50,6 +66,7 @@ def test_upload_fastpath_from_existing_hash_no_file(http: requests.Session, api_ b1 = r1.json() assert r1.status_code == 201, b1 h = b1["asset_hash"] + assert b1["hash"] == h # Now POST /api/assets with only hash and no file files = [ @@ -63,6 +80,7 @@ def test_upload_fastpath_from_existing_hash_no_file(http: requests.Session, api_ assert r2.status_code == 200, b2 # fast path returns 200 with created_new == False assert b2["created_new"] is False assert b2["asset_hash"] == h + assert b2["hash"] == h def test_upload_fastpath_with_known_hash_and_file( @@ -75,6 +93,7 @@ def test_upload_fastpath_with_known_hash_and_file( b1 = r1.json() assert r1.status_code == 201, b1 h = b1["asset_hash"] + assert b1["hash"] == h # Send both file and hash of existing content -> server must drain file and create from hash (200) files = {"file": ("ignored.bin", b"ignored" * 10, "application/octet-stream")} @@ -84,6 +103,7 @@ def test_upload_fastpath_with_known_hash_and_file( assert r2.status_code == 200, b2 assert b2["created_new"] is False assert b2["asset_hash"] == h + assert b2["hash"] == h def test_upload_multiple_tags_fields_are_merged(http: requests.Session, api_base: str): @@ -142,6 +162,8 @@ def test_concurrent_upload_identical_bytes_different_names( assert r1.status_code in (200, 201), b1 assert r2.status_code in (200, 201), b2 assert b1["asset_hash"] == b2["asset_hash"] + assert b1["hash"] == b2["hash"] + assert b1["hash"] == b1["asset_hash"] assert b1["id"] != b2["id"] created_flags = sorted([bool(b1.get("created_new")), bool(b2.get("created_new"))]) diff --git a/tests/execution/test_async_nodes.py b/tests/execution/test_async_nodes.py index c771b4b36..54660c112 100644 --- a/tests/execution/test_async_nodes.py +++ b/tests/execution/test_async_nodes.py @@ -14,7 +14,6 @@ from tests.execution.test_execution import ComfyClient, run_warmup class TestAsyncNodes: @fixture(scope="class", autouse=True, params=[ (False, 
0), - (True, 0), (True, 100), ]) def _server(self, args_pytest, request): @@ -29,6 +28,8 @@ class TestAsyncNodes: use_lru, lru_size = request.param if use_lru: pargs += ['--cache-lru', str(lru_size)] + else: + pargs += ['--cache-classic'] # Running server with args: pargs p = subprocess.Popen(pargs) yield diff --git a/tests/execution/test_execution.py b/tests/execution/test_execution.py index f73ca7e3c..15e2304fc 100644 --- a/tests/execution/test_execution.py +++ b/tests/execution/test_execution.py @@ -183,8 +183,7 @@ class TestExecution: # Initialize server and client # @fixture(scope="class", autouse=True, params=[ - { "extra_args" : [], "should_cache_results" : True }, - { "extra_args" : ["--cache-lru", 0], "should_cache_results" : True }, + { "extra_args" : ["--cache-classic"], "should_cache_results" : True }, { "extra_args" : ["--cache-lru", 100], "should_cache_results" : True }, { "extra_args" : ["--cache-none"], "should_cache_results" : False }, ])