mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-07-03 21:20:49 +08:00
Merge remote-tracking branch 'origin/master' into pixal3d
# Conflicts: # comfy/clip_vision.py # comfy/image_encoders/dino3.py # comfy/supported_models.py # comfy_extras/nodes_save_3d.py
This commit is contained in:
commit
3af63b8961
@ -1,5 +1,4 @@
|
||||
As of the time of writing this you need this driver for best results:
|
||||
https://www.amd.com/en/resources/support-articles/release-notes/RN-AMDGPU-WINDOWS-PYTORCH-7-1-1.html
|
||||
As of the time of writing this you need a recent driver. Updating to the latest driver is recommended.
|
||||
|
||||
HOW TO RUN:
|
||||
|
||||
@ -7,9 +6,9 @@ If you have a AMD gpu:
|
||||
|
||||
run_amd_gpu.bat
|
||||
|
||||
If you have memory issues you can try disabling the smart memory management by running comfyui with:
|
||||
If you have memory issues you can try enabling the new dynamic memory management by running comfyui with:
|
||||
|
||||
run_amd_gpu_disable_smart_memory.bat
|
||||
run_amd_gpu_enable_dynamic_vram.bat
|
||||
|
||||
IF YOU GET A RED ERROR IN THE UI MAKE SURE YOU HAVE A MODEL/CHECKPOINT IN: ComfyUI\models\checkpoints
|
||||
|
||||
|
||||
519
.github/workflows/backport_release.yaml
vendored
Normal file
519
.github/workflows/backport_release.yaml
vendored
Normal file
@ -0,0 +1,519 @@
|
||||
name: Backport Release
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
commit:
|
||||
description: 'Full 40-char SHA of the tip commit of the backport source branch (the PR head commit that passed tests). The branch is resolved from this SHA and must be unique.'
|
||||
required: true
|
||||
type: string
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: read
|
||||
checks: read
|
||||
|
||||
jobs:
|
||||
backport-release:
|
||||
name: Create backport release
|
||||
runs-on: ubuntu-latest
|
||||
environment: backport release
|
||||
|
||||
steps:
|
||||
- name: Generate GitHub App token
|
||||
id: app-token
|
||||
uses: actions/create-github-app-token@bcd2ba49218906704ab6c1aa796996da409d3eb1
|
||||
with:
|
||||
app-id: ${{ secrets.FEN_RELEASE_APP_ID }}
|
||||
private-key: ${{ secrets.FEN_RELEASE_PRIVATE_KEY }}
|
||||
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
|
||||
with:
|
||||
token: ${{ steps.app-token.outputs.token }}
|
||||
fetch-depth: 0
|
||||
fetch-tags: true
|
||||
|
||||
- name: Configure git
|
||||
run: |
|
||||
git config user.name "fen-release[bot]"
|
||||
git config user.email "fen-release[bot]@users.noreply.github.com"
|
||||
|
||||
- name: Resolve source branch from commit SHA
|
||||
id: resolve
|
||||
env:
|
||||
SOURCE_COMMIT: ${{ inputs.commit }}
|
||||
DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
# Require a full 40-char lowercase-hex SHA. Short SHAs are ambiguous
|
||||
# and we will be comparing this value against API responses (PR head
|
||||
# SHA, ref tips) that always return the full form.
|
||||
if [[ ! "${SOURCE_COMMIT}" =~ ^[0-9a-f]{40}$ ]]; then
|
||||
echo "::error::Input commit '${SOURCE_COMMIT}' is not a full 40-char lowercase hex SHA."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Fetch all remote branches so we can search for which one(s) point
|
||||
# at this SHA. `actions/checkout` with fetch-depth: 0 fetches full
|
||||
# history of the checked-out ref but does not necessarily populate
|
||||
# every refs/remotes/origin/*, so do it explicitly.
|
||||
git fetch --prune origin '+refs/heads/*:refs/remotes/origin/*'
|
||||
|
||||
# Verify the commit actually exists in this repo's object DB.
|
||||
if ! git cat-file -e "${SOURCE_COMMIT}^{commit}" 2>/dev/null; then
|
||||
echo "::error::Commit ${SOURCE_COMMIT} was not found in the repository."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Find every remote branch whose tip == SOURCE_COMMIT. Exactly one
|
||||
# branch must point at it. If zero, the commit isn't anyone's tip
|
||||
# (likely stale, force-pushed past, or never the PR head). If more
|
||||
# than one, the (branch -> SHA) mapping is ambiguous and we refuse
|
||||
# to guess — the operator must give us a unique branch to release.
|
||||
mapfile -t matching_branches < <(
|
||||
git for-each-ref \
|
||||
--format='%(refname:strip=3)' \
|
||||
--points-at="${SOURCE_COMMIT}" \
|
||||
refs/remotes/origin/ \
|
||||
| grep -vx 'HEAD' || true
|
||||
)
|
||||
|
||||
if [[ "${#matching_branches[@]}" -eq 0 ]]; then
|
||||
echo "::error::No branch on origin has ${SOURCE_COMMIT} as its tip."
|
||||
echo "::error::Either the branch was updated after you copied this SHA, or this commit was never the head of a branch."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ "${#matching_branches[@]}" -gt 1 ]]; then
|
||||
echo "::error::More than one branch on origin has ${SOURCE_COMMIT} as its tip; cannot pick one:"
|
||||
for b in "${matching_branches[@]}"; do
|
||||
echo "::error:: - ${b}"
|
||||
done
|
||||
echo "::error::Refusing to proceed with an ambiguous source branch."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
source_branch="${matching_branches[0]}"
|
||||
|
||||
if [[ "${source_branch}" == "${DEFAULT_BRANCH}" ]]; then
|
||||
echo "::error::Source branch must not be the default branch ('${DEFAULT_BRANCH}')."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Resolved commit ${SOURCE_COMMIT} to branch '${source_branch}'."
|
||||
echo "source_branch=${source_branch}" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Determine latest stable release
|
||||
id: latest
|
||||
env:
|
||||
GH_TOKEN: ${{ steps.app-token.outputs.token }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
# List all tags matching vMAJOR.MINOR.PATCH and pick the highest by numeric
|
||||
# comparison of each component. We DO NOT use `sort -V` because it treats
|
||||
# v0.19.99 as higher than v0.20.1.
|
||||
latest_tag="$(
|
||||
git tag --list 'v[0-9]*.[0-9]*.[0-9]*' \
|
||||
| grep -E '^v[0-9]+\.[0-9]+\.[0-9]+$' \
|
||||
| awk -F'[v.]' '{ printf "%010d %010d %010d %s\n", $2, $3, $4, $0 }' \
|
||||
| sort -k1,1n -k2,2n -k3,3n \
|
||||
| tail -n1 \
|
||||
| awk '{print $4}'
|
||||
)"
|
||||
|
||||
if [[ -z "${latest_tag}" ]]; then
|
||||
echo "::error::No stable release tags (vMAJOR.MINOR.PATCH) were found."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Parse components
|
||||
ver="${latest_tag#v}"
|
||||
major="${ver%%.*}"
|
||||
rest="${ver#*.}"
|
||||
minor="${rest%%.*}"
|
||||
patch="${rest#*.}"
|
||||
|
||||
new_patch=$((patch + 1))
|
||||
new_version="v${major}.${minor}.${new_patch}"
|
||||
release_branch="release/v${major}.${minor}"
|
||||
|
||||
latest_sha="$(git rev-list -n 1 "refs/tags/${latest_tag}")"
|
||||
|
||||
echo "latest_tag=${latest_tag}" >> "$GITHUB_OUTPUT"
|
||||
echo "latest_sha=${latest_sha}" >> "$GITHUB_OUTPUT"
|
||||
echo "major=${major}" >> "$GITHUB_OUTPUT"
|
||||
echo "minor=${minor}" >> "$GITHUB_OUTPUT"
|
||||
echo "patch=${patch}" >> "$GITHUB_OUTPUT"
|
||||
echo "new_version=${new_version}" >> "$GITHUB_OUTPUT"
|
||||
echo "new_version_no_v=${major}.${minor}.${new_patch}" >> "$GITHUB_OUTPUT"
|
||||
echo "release_branch=${release_branch}" >> "$GITHUB_OUTPUT"
|
||||
|
||||
echo "Latest stable release: ${latest_tag} (${latest_sha})"
|
||||
echo "New version will be: ${new_version}"
|
||||
echo "Release branch: ${release_branch}"
|
||||
|
||||
- name: Validate source branch is cut directly from the latest stable release
|
||||
env:
|
||||
SOURCE_BRANCH: ${{ steps.resolve.outputs.source_branch }}
|
||||
SOURCE_COMMIT: ${{ inputs.commit }}
|
||||
LATEST_TAG_SHA: ${{ steps.latest.outputs.latest_sha }}
|
||||
LATEST_TAG: ${{ steps.latest.outputs.latest_tag }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
# Use the user-provided SHA directly rather than re-resolving the branch
|
||||
# tip — the resolve step already proved the branch tip equals SOURCE_COMMIT,
|
||||
# and pinning to the SHA here makes the rest of the job TOCTOU-safe against
|
||||
# someone pushing to the branch mid-run.
|
||||
source_sha="${SOURCE_COMMIT}"
|
||||
|
||||
# Walking first-parent from the source tip must reach LATEST_TAG_SHA.
|
||||
# We capture rev-list into a variable and grep against a here-string
|
||||
# rather than piping `rev-list | grep -q`: under `set -o pipefail`,
|
||||
# `grep -q` would exit on first match and SIGPIPE the still-streaming
|
||||
# `rev-list`, propagating exit 141 as a spurious "not found".
|
||||
first_parent_chain="$(git rev-list --first-parent "${source_sha}")"
|
||||
if ! grep -Fxq "${LATEST_TAG_SHA}" <<< "${first_parent_chain}"; then
|
||||
echo "::error::Source branch '${SOURCE_BRANCH}' is not cut from '${LATEST_TAG}'."
|
||||
echo "::error::Its first-parent history does not include ${LATEST_TAG_SHA}."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Additionally, every commit added on top of the tag (the set we are
|
||||
# about to publish) must itself be a descendant of the tag along
|
||||
# first-parent — i.e. no sibling commits from master sneak in via a
|
||||
# non-first-parent path. Enforce by requiring that the symmetric
|
||||
# difference is empty in one direction: commits in source that are
|
||||
# NOT first-parent-reachable from source starting at the tag.
|
||||
# We do this by intersecting:
|
||||
# A = commits reachable from source but not from tag (full DAG)
|
||||
# B = commits on the first-parent chain from source down to tag
|
||||
# and requiring A == B.
|
||||
all_added="$(git rev-list "${LATEST_TAG_SHA}..${source_sha}" | sort)"
|
||||
first_parent_added="$(
|
||||
git rev-list --first-parent "${LATEST_TAG_SHA}..${source_sha}" | sort
|
||||
)"
|
||||
|
||||
if [[ "${all_added}" != "${first_parent_added}" ]]; then
|
||||
echo "::error::Source branch '${SOURCE_BRANCH}' contains commits not on its first-parent chain from '${LATEST_TAG}'."
|
||||
echo "::error::This usually means the branch was cut from master (not from the tag) or contains a merge from master."
|
||||
echo "Commits reachable but not on first-parent chain:"
|
||||
comm -23 <(printf '%s\n' "${all_added}") <(printf '%s\n' "${first_parent_added}") \
|
||||
| while read -r sha; do
|
||||
echo " $(git log -1 --format='%h %s' "${sha}")"
|
||||
done
|
||||
exit 1
|
||||
fi
|
||||
|
||||
added_count="$(printf '%s\n' "${all_added}" | grep -c . || true)"
|
||||
echo "Source branch is cut directly from ${LATEST_TAG} with ${added_count} commit(s) on top."
|
||||
|
||||
- name: Validate PR exists, is open, named correctly, has latest commit, and checks pass
|
||||
env:
|
||||
GH_TOKEN: ${{ steps.app-token.outputs.token }}
|
||||
SOURCE_BRANCH: ${{ steps.resolve.outputs.source_branch }}
|
||||
SOURCE_COMMIT: ${{ inputs.commit }}
|
||||
NEW_VERSION: ${{ steps.latest.outputs.new_version }}
|
||||
REPO: ${{ github.repository }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
expected_title="ComfyUI backport release ${NEW_VERSION}"
|
||||
|
||||
# Find open PRs from this branch into master. The --state open filter
|
||||
# is load-bearing: a closed/merged PR with passing checks must not be
|
||||
# accepted as authorization for a new release.
|
||||
pr_json="$(
|
||||
gh pr list \
|
||||
--repo "${REPO}" \
|
||||
--state open \
|
||||
--head "${SOURCE_BRANCH}" \
|
||||
--base master \
|
||||
--json number,title,headRefOid,state \
|
||||
--limit 10
|
||||
)"
|
||||
|
||||
pr_count="$(echo "${pr_json}" | jq 'length')"
|
||||
if [[ "${pr_count}" -eq 0 ]]; then
|
||||
echo "::error::No open PR found from '${SOURCE_BRANCH}' into 'master'. The PR must exist and be open."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Pick the PR matching the expected title
|
||||
pr_number="$(echo "${pr_json}" | jq -r --arg t "${expected_title}" '
|
||||
map(select(.title == $t)) | .[0].number // empty
|
||||
')"
|
||||
pr_head_sha="$(echo "${pr_json}" | jq -r --arg t "${expected_title}" '
|
||||
map(select(.title == $t)) | .[0].headRefOid // empty
|
||||
')"
|
||||
|
||||
if [[ -z "${pr_number}" ]]; then
|
||||
echo "::error::No open PR from '${SOURCE_BRANCH}' into 'master' is titled '${expected_title}'."
|
||||
echo "Found PRs:"
|
||||
echo "${pr_json}" | jq -r '.[] | " #\(.number): \(.title)"'
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# The PR's current head commit must equal the SHA the operator gave us.
|
||||
# This is what closes the door on releasing stale code: if anyone has
|
||||
# pushed to the branch since the operator validated tests passed, the
|
||||
# PR head will have advanced past SOURCE_COMMIT and we abort. (The
|
||||
# resolve step already proved the branch tip == SOURCE_COMMIT; this
|
||||
# ties that same SHA to the PR that authorizes the release.)
|
||||
if [[ "${pr_head_sha}" != "${SOURCE_COMMIT}" ]]; then
|
||||
echo "::error::PR #${pr_number} head commit is ${pr_head_sha}, but the operator-provided commit is ${SOURCE_COMMIT}."
|
||||
echo "::error::The PR has new commits since this release was authorized. Re-run with the new head SHA after verifying its checks."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Found open PR #${pr_number} titled '${expected_title}' at head ${pr_head_sha} (matches operator-provided commit)."
|
||||
|
||||
# Verify all check runs on the head commit have completed successfully.
|
||||
# A check is considered passing if conclusion is success, neutral, or skipped.
|
||||
checks_json="$(
|
||||
gh api \
|
||||
--paginate \
|
||||
"repos/${REPO}/commits/${pr_head_sha}/check-runs" \
|
||||
--jq '.check_runs[] | {name: .name, status: .status, conclusion: .conclusion}'
|
||||
)"
|
||||
|
||||
if [[ -z "${checks_json}" ]]; then
|
||||
echo "::error::No check runs found on PR head commit ${pr_head_sha}."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Check runs on ${pr_head_sha}:"
|
||||
echo "${checks_json}" | jq -s '.'
|
||||
|
||||
failing="$(echo "${checks_json}" | jq -s '
|
||||
map(select(
|
||||
.status != "completed"
|
||||
or (.conclusion as $c
|
||||
| ["success","neutral","skipped"]
|
||||
| index($c) | not)
|
||||
))
|
||||
')"
|
||||
|
||||
failing_count="$(echo "${failing}" | jq 'length')"
|
||||
if [[ "${failing_count}" -gt 0 ]]; then
|
||||
echo "::error::One or more checks have not passed on PR head commit ${pr_head_sha}:"
|
||||
echo "${failing}" | jq -r '.[] | " - \(.name): status=\(.status) conclusion=\(.conclusion)"'
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "All checks have passed on ${pr_head_sha}."
|
||||
|
||||
- name: Prepare release branch
|
||||
id: prepare
|
||||
env:
|
||||
GH_TOKEN: ${{ steps.app-token.outputs.token }}
|
||||
REPO: ${{ github.repository }}
|
||||
RELEASE_BRANCH: ${{ steps.latest.outputs.release_branch }}
|
||||
LATEST_TAG: ${{ steps.latest.outputs.latest_tag }}
|
||||
LATEST_TAG_SHA: ${{ steps.latest.outputs.latest_sha }}
|
||||
PATCH: ${{ steps.latest.outputs.patch }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
# Try to fetch the release branch. If patch == 0, it shouldn't exist yet
|
||||
# and we'll create it from the latest stable tag. If patch > 0, it must
|
||||
# already exist and its tip must equal the latest stable tag commit (i.e.
|
||||
# the previous patch release).
|
||||
if git ls-remote --exit-code --heads origin "${RELEASE_BRANCH}" >/dev/null 2>&1; then
|
||||
echo "Release branch '${RELEASE_BRANCH}' already exists on origin."
|
||||
git fetch origin "refs/heads/${RELEASE_BRANCH}:refs/remotes/origin/${RELEASE_BRANCH}"
|
||||
git checkout -B "${RELEASE_BRANCH}" "refs/remotes/origin/${RELEASE_BRANCH}"
|
||||
|
||||
current_tip="$(git rev-parse HEAD)"
|
||||
if [[ "${current_tip}" != "${LATEST_TAG_SHA}" ]]; then
|
||||
echo "::error::Release branch '${RELEASE_BRANCH}' tip (${current_tip}) is not at the latest stable release '${LATEST_TAG}' (${LATEST_TAG_SHA})."
|
||||
echo "::error::Refusing to release on top of a divergent branch."
|
||||
exit 1
|
||||
fi
|
||||
echo "branch_existed=true" >> "$GITHUB_OUTPUT"
|
||||
else
|
||||
if [[ "${PATCH}" != "0" ]]; then
|
||||
echo "::error::Release branch '${RELEASE_BRANCH}' does not exist on origin, but the latest stable release '${LATEST_TAG}' has patch=${PATCH} (>0). This is inconsistent."
|
||||
exit 1
|
||||
fi
|
||||
echo "Release branch '${RELEASE_BRANCH}' does not exist. Creating from ${LATEST_TAG}."
|
||||
git checkout -B "${RELEASE_BRANCH}" "refs/tags/${LATEST_TAG}"
|
||||
echo "branch_existed=false" >> "$GITHUB_OUTPUT"
|
||||
fi
|
||||
|
||||
- name: Fast-forward merge source branch into release branch
|
||||
env:
|
||||
SOURCE_BRANCH: ${{ steps.resolve.outputs.source_branch }}
|
||||
SOURCE_COMMIT: ${{ inputs.commit }}
|
||||
RELEASE_BRANCH: ${{ steps.latest.outputs.release_branch }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
# --ff-only guarantees no merge commit is created. If a fast-forward is
|
||||
# not possible (i.e. the release branch has commits the source branch
|
||||
# doesn't), the merge will fail and we abort. Because we already validated
|
||||
# that the source branch is rooted on the latest stable tag, and the
|
||||
# release branch tip equals that same tag, this fast-forward should
|
||||
# always succeed for a well-formed backport branch.
|
||||
#
|
||||
# We merge the operator-provided SHA, not the branch ref, so a push to
|
||||
# the branch in the window between resolve and now cannot smuggle new
|
||||
# commits into the release.
|
||||
if ! git merge --ff-only "${SOURCE_COMMIT}"; then
|
||||
echo "::error::Cannot fast-forward '${RELEASE_BRANCH}' to ${SOURCE_COMMIT} (tip of '${SOURCE_BRANCH}'). A merge commit would be required. Aborting."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Fast-forwarded '${RELEASE_BRANCH}' to ${SOURCE_COMMIT} (tip of '${SOURCE_BRANCH}')."
|
||||
|
||||
- name: Bump version files
|
||||
env:
|
||||
NEW_VERSION_NO_V: ${{ steps.latest.outputs.new_version_no_v }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
if [[ ! -f comfyui_version.py ]]; then
|
||||
echo "::error::comfyui_version.py not found in repo root."
|
||||
exit 1
|
||||
fi
|
||||
if [[ ! -f pyproject.toml ]]; then
|
||||
echo "::error::pyproject.toml not found in repo root."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Replace the version string in comfyui_version.py.
|
||||
# Expected format: __version__ = "X.Y.Z"
|
||||
python3 - "$NEW_VERSION_NO_V" <<'PY'
|
||||
import re, sys, pathlib
|
||||
new = sys.argv[1]
|
||||
|
||||
p = pathlib.Path("comfyui_version.py")
|
||||
src = p.read_text()
|
||||
new_src, n = re.subn(
|
||||
r'(__version__\s*=\s*[\'"])[^\'"]+([\'"])',
|
||||
lambda m: f'{m.group(1)}{new}{m.group(2)}',
|
||||
src,
|
||||
count=1,
|
||||
)
|
||||
if n != 1:
|
||||
sys.exit("Could not find __version__ assignment in comfyui_version.py")
|
||||
p.write_text(new_src)
|
||||
|
||||
p = pathlib.Path("pyproject.toml")
|
||||
src = p.read_text()
|
||||
# Replace the first `version = "..."` inside [project] or [tool.poetry].
|
||||
new_src, n = re.subn(
|
||||
r'(?m)^(version\s*=\s*")[^"]+(")',
|
||||
lambda m: f'{m.group(1)}{new}{m.group(2)}',
|
||||
src,
|
||||
count=1,
|
||||
)
|
||||
if n != 1:
|
||||
sys.exit("Could not find version assignment in pyproject.toml")
|
||||
p.write_text(new_src)
|
||||
PY
|
||||
|
||||
echo "Updated version to ${NEW_VERSION_NO_V} in comfyui_version.py and pyproject.toml."
|
||||
git --no-pager diff -- comfyui_version.py pyproject.toml
|
||||
|
||||
- name: Commit version bump and tag release
|
||||
env:
|
||||
NEW_VERSION: ${{ steps.latest.outputs.new_version }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
git add comfyui_version.py pyproject.toml
|
||||
git commit -m "ComfyUI ${NEW_VERSION}"
|
||||
|
||||
if git rev-parse -q --verify "refs/tags/${NEW_VERSION}" >/dev/null; then
|
||||
echo "::error::Tag ${NEW_VERSION} already exists locally."
|
||||
exit 1
|
||||
fi
|
||||
git tag "${NEW_VERSION}"
|
||||
|
||||
- name: Verify tag does not already exist on origin
|
||||
env:
|
||||
NEW_VERSION: ${{ steps.latest.outputs.new_version }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
if git ls-remote --exit-code --tags origin "refs/tags/${NEW_VERSION}" >/dev/null 2>&1; then
|
||||
echo "::error::Tag ${NEW_VERSION} already exists on origin. Aborting."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Push release branch and tag
|
||||
env:
|
||||
RELEASE_BRANCH: ${{ steps.latest.outputs.release_branch }}
|
||||
NEW_VERSION: ${{ steps.latest.outputs.new_version }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
# Push the branch first, then the tag. Atomic-ish: if the branch push
|
||||
# fails we never publish the tag.
|
||||
git push origin "refs/heads/${RELEASE_BRANCH}:refs/heads/${RELEASE_BRANCH}"
|
||||
git push origin "refs/tags/${NEW_VERSION}"
|
||||
|
||||
echo "Released ${NEW_VERSION} on ${RELEASE_BRANCH}."
|
||||
|
||||
- name: Delete remote source branch
|
||||
env:
|
||||
GH_TOKEN: ${{ steps.app-token.outputs.token }}
|
||||
REPO: ${{ github.repository }}
|
||||
SOURCE_BRANCH: ${{ steps.resolve.outputs.source_branch }}
|
||||
SOURCE_COMMIT: ${{ inputs.commit }}
|
||||
RELEASE_BRANCH: ${{ steps.latest.outputs.release_branch }}
|
||||
DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
# Belt-and-braces: the resolve step already refuses the default branch,
|
||||
# but never delete the default or the release branch under any
|
||||
# circumstances.
|
||||
if [[ "${SOURCE_BRANCH}" == "${DEFAULT_BRANCH}" || "${SOURCE_BRANCH}" == "${RELEASE_BRANCH}" ]]; then
|
||||
echo "::error::Refusing to delete '${SOURCE_BRANCH}' (matches default or release branch)."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Delete the source branch on origin, but only if its tip is still the
|
||||
# SHA we released from. If someone pushed new commits to it after we
|
||||
# resolved it, leave it alone — those commits would be silently lost.
|
||||
current_tip="$(git ls-remote origin "refs/heads/${SOURCE_BRANCH}" | awk '{print $1}')"
|
||||
if [[ -z "${current_tip}" ]]; then
|
||||
echo "Source branch '${SOURCE_BRANCH}' no longer exists on origin; nothing to delete."
|
||||
exit 0
|
||||
fi
|
||||
if [[ "${current_tip}" != "${SOURCE_COMMIT}" ]]; then
|
||||
echo "::warning::Source branch '${SOURCE_BRANCH}' tip (${current_tip}) no longer matches released commit (${SOURCE_COMMIT}). Leaving it in place."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
git push origin --delete "refs/heads/${SOURCE_BRANCH}"
|
||||
echo "Deleted remote branch '${SOURCE_BRANCH}'."
|
||||
|
||||
- name: Summary
|
||||
if: always()
|
||||
env:
|
||||
NEW_VERSION: ${{ steps.latest.outputs.new_version }}
|
||||
RELEASE_BRANCH: ${{ steps.latest.outputs.release_branch }}
|
||||
LATEST_TAG: ${{ steps.latest.outputs.latest_tag }}
|
||||
SOURCE_BRANCH: ${{ steps.resolve.outputs.source_branch }}
|
||||
SOURCE_COMMIT: ${{ inputs.commit }}
|
||||
run: |
|
||||
# SOURCE_BRANCH is empty if the resolve step never produced an output
|
||||
# (e.g. the workflow failed in or before that step). Show a placeholder
|
||||
# in that case so the summary table still renders cleanly.
|
||||
source_branch_display="${SOURCE_BRANCH:-(unresolved)}"
|
||||
{
|
||||
echo "## Backport release"
|
||||
echo ""
|
||||
echo "| Field | Value |"
|
||||
echo "|---|---|"
|
||||
echo "| Source commit | \`${SOURCE_COMMIT}\` |"
|
||||
echo "| Source branch | \`${source_branch_display}\` |"
|
||||
echo "| Previous stable | \`${LATEST_TAG}\` |"
|
||||
echo "| New version | \`${NEW_VERSION}\` |"
|
||||
echo "| Release branch | \`${RELEASE_BRANCH}\` |"
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
2
.github/workflows/check-line-endings.yml
vendored
2
.github/workflows/check-line-endings.yml
vendored
@ -17,7 +17,7 @@ jobs:
|
||||
- name: Check for Windows line endings (CRLF)
|
||||
run: |
|
||||
# Get the list of changed files in the PR
|
||||
CHANGED_FILES=$(git diff --name-only ${{ github.event.pull_request.base.sha }}..${{ github.event.pull_request.head.sha }})
|
||||
CHANGED_FILES=$(git diff --name-only ${{ github.event.pull_request.base.sha }}..${{ github.event.pull_request.head.sha }} -- ':!.ci')
|
||||
|
||||
# Flag to track if CRLF is found
|
||||
CRLF_FOUND=false
|
||||
|
||||
24
.github/workflows/detect-unreviewed-merge.yml
vendored
Normal file
24
.github/workflows/detect-unreviewed-merge.yml
vendored
Normal file
@ -0,0 +1,24 @@
|
||||
name: Detect Unreviewed Merge
|
||||
|
||||
# SOC 2 compliance — reusable workflow lives in Comfy-Org/github-workflows,
|
||||
# tracking issues are filed in Comfy-Org/unreviewed-merges.
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [master]
|
||||
|
||||
concurrency:
|
||||
group: detect-unreviewed-merge-${{ github.sha }}
|
||||
cancel-in-progress: false
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: read
|
||||
|
||||
jobs:
|
||||
detect:
|
||||
uses: Comfy-Org/github-workflows/.github/workflows/detect-unreviewed-merge.yml@4d9cb6b87f953bb7cd69954280e1465fb9bd2040 # v1
|
||||
with:
|
||||
approval-mode: latest-per-reviewer
|
||||
secrets:
|
||||
UNREVIEWED_MERGES_TOKEN: ${{ secrets.UNREVIEWED_MERGES_TOKEN }}
|
||||
@ -20,7 +20,7 @@
|
||||
[website-url]: https://www.comfy.org/
|
||||
<!-- Workaround to display total user from https://github.com/badges/shields/issues/4500#issuecomment-2060079995 -->
|
||||
[discord-shield]: https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fdiscord.com%2Fapi%2Finvites%2Fcomfyorg%3Fwith_counts%3Dtrue&query=%24.approximate_member_count&logo=discord&logoColor=white&label=Discord&color=green&suffix=%20total
|
||||
[discord-url]: https://www.comfy.org/discord
|
||||
[discord-url]: https://discord.com/invite/comfyorg
|
||||
[twitter-shield]: https://img.shields.io/twitter/follow/ComfyUI
|
||||
[twitter-url]: https://x.com/ComfyUI
|
||||
|
||||
@ -433,7 +433,7 @@ See also: [https://www.comfy.org/](https://www.comfy.org/)
|
||||
|
||||
## Frontend Development
|
||||
|
||||
As of August 15, 2024, we have transitioned to a new frontend, which is now hosted in a separate repository: [ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend). This repository now hosts the compiled JS (from TS/Vue) under the `web/` directory.
|
||||
As of August 15, 2024, we have transitioned to a new frontend, which is now hosted in a separate repository: [ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend). The compiled JS files (from TS/Vue) are published to [pypi](https://pypi.org/project/comfyui-frontend-package) and installed as a dependency in ComfyUI.
|
||||
|
||||
### Reporting Issues and Requesting Features
|
||||
|
||||
|
||||
39
alembic_db/versions/0004_drop_tag_type.py
Normal file
39
alembic_db/versions/0004_drop_tag_type.py
Normal file
@ -0,0 +1,39 @@
|
||||
"""
|
||||
Drop the vestigial tags.tag_type column.
|
||||
|
||||
tag_type was always "user" in practice — no code path ever set it to anything
|
||||
else (no system/seeded classification was ever wired up) and nothing queried it.
|
||||
The column, its index (ix_tags_tag_type), and the corresponding API field were
|
||||
dead weight, so they are removed.
|
||||
|
||||
Revision ID: 0004_drop_tag_type
|
||||
Revises: 0003_add_metadata_job_id
|
||||
Create Date: 2026-06-03
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
revision = "0004_drop_tag_type"
|
||||
down_revision = "0003_add_metadata_job_id"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
with op.batch_alter_table("tags") as batch_op:
|
||||
batch_op.drop_index("ix_tags_tag_type")
|
||||
batch_op.drop_column("tag_type")
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
with op.batch_alter_table("tags") as batch_op:
|
||||
batch_op.add_column(
|
||||
sa.Column(
|
||||
"tag_type",
|
||||
sa.String(length=32),
|
||||
nullable=False,
|
||||
server_default="user",
|
||||
)
|
||||
)
|
||||
batch_op.create_index("ix_tags_tag_type", ["tag_type"])
|
||||
@ -39,6 +39,7 @@ from app.assets.services import (
|
||||
update_asset_metadata,
|
||||
upload_from_temp_path,
|
||||
)
|
||||
from app.assets.services.cursor import InvalidCursorError
|
||||
from app.assets.services.tagging import list_tag_histogram
|
||||
|
||||
ROUTES = web.RouteTableDef()
|
||||
@ -160,10 +161,12 @@ def _build_asset_response(result: schemas.AssetDetailResult | schemas.UploadResu
|
||||
preview_url = None
|
||||
else:
|
||||
preview_url = _build_preview_url_from_view(result.tags, result.ref.user_metadata)
|
||||
asset_content_hash = result.asset.hash if result.asset else None
|
||||
return schemas_out.Asset(
|
||||
id=result.ref.id,
|
||||
name=result.ref.name,
|
||||
asset_hash=result.asset.hash if result.asset else None,
|
||||
hash=asset_content_hash,
|
||||
asset_hash=asset_content_hash,
|
||||
size=int(result.asset.size_bytes) if result.asset else None,
|
||||
mime_type=result.asset.mime_type if result.asset else None,
|
||||
tags=result.tags,
|
||||
@ -172,7 +175,7 @@ def _build_asset_response(result: schemas.AssetDetailResult | schemas.UploadResu
|
||||
user_metadata=result.ref.user_metadata or {},
|
||||
metadata=result.ref.system_metadata,
|
||||
job_id=result.ref.job_id,
|
||||
prompt_id=result.ref.job_id, # deprecated: mirrors job_id for cloud compat
|
||||
prompt_id=result.ref.job_id, # deprecated alias of job_id, kept for compatibility
|
||||
created_at=result.ref.created_at,
|
||||
updated_at=result.ref.updated_at,
|
||||
last_access_time=result.ref.last_access_time,
|
||||
@ -209,24 +212,37 @@ async def list_assets_route(request: web.Request) -> web.Response:
|
||||
order_candidate = (q.order or "desc").lower()
|
||||
order = order_candidate if order_candidate in {"asc", "desc"} else "desc"
|
||||
|
||||
result = list_assets_page(
|
||||
owner_id=USER_MANAGER.get_request_user_id(request),
|
||||
include_tags=q.include_tags,
|
||||
exclude_tags=q.exclude_tags,
|
||||
name_contains=q.name_contains,
|
||||
metadata_filter=q.metadata_filter,
|
||||
limit=q.limit,
|
||||
offset=q.offset,
|
||||
sort=sort,
|
||||
order=order,
|
||||
)
|
||||
try:
|
||||
result = list_assets_page(
|
||||
owner_id=USER_MANAGER.get_request_user_id(request),
|
||||
include_tags=q.include_tags,
|
||||
exclude_tags=q.exclude_tags,
|
||||
name_contains=q.name_contains,
|
||||
metadata_filter=q.metadata_filter,
|
||||
limit=q.limit,
|
||||
offset=q.offset,
|
||||
sort=sort,
|
||||
order=order,
|
||||
after=q.after,
|
||||
)
|
||||
except InvalidCursorError as e:
|
||||
return _build_error_response(400, "INVALID_CURSOR", str(e))
|
||||
|
||||
summaries = [_build_asset_response(item) for item in result.items]
|
||||
|
||||
# has_more semantics differ by mode:
|
||||
# - cursor mode: a non-empty next_cursor means there are more results.
|
||||
# - offset mode: derived from total - (offset + page size).
|
||||
if q.after is not None:
|
||||
has_more = result.next_cursor is not None
|
||||
else:
|
||||
has_more = (q.offset + len(summaries)) < result.total
|
||||
|
||||
payload = schemas_out.AssetsList(
|
||||
assets=summaries,
|
||||
total=result.total,
|
||||
has_more=(q.offset + len(summaries)) < result.total,
|
||||
has_more=has_more,
|
||||
next_cursor=result.next_cursor,
|
||||
)
|
||||
return web.json_response(payload.model_dump(mode="json", exclude_none=True))
|
||||
|
||||
@ -517,18 +533,14 @@ async def update_asset_route(request: web.Request) -> web.Response:
|
||||
@_require_assets_feature_enabled
|
||||
async def delete_asset_route(request: web.Request) -> web.Response:
|
||||
reference_id = str(uuid.UUID(request.match_info["id"]))
|
||||
delete_content_param = request.query.get("delete_content")
|
||||
delete_content = (
|
||||
False
|
||||
if delete_content_param is None
|
||||
else delete_content_param.lower() not in {"0", "false", "no"}
|
||||
)
|
||||
|
||||
try:
|
||||
# Deleting an asset is a soft delete of the reference; the underlying
|
||||
# content is preserved (it may be shared with other references).
|
||||
deleted = delete_asset_reference(
|
||||
reference_id=reference_id,
|
||||
owner_id=USER_MANAGER.get_request_user_id(request),
|
||||
delete_content_if_orphan=delete_content,
|
||||
delete_content_if_orphan=False,
|
||||
)
|
||||
except Exception:
|
||||
logging.exception(
|
||||
@ -573,8 +585,8 @@ async def get_tags(request: web.Request) -> web.Response:
|
||||
)
|
||||
|
||||
tags = [
|
||||
schemas_out.TagUsage(name=name, count=count, type=tag_type)
|
||||
for (name, tag_type, count) in rows
|
||||
schemas_out.TagUsage(name=name, count=count)
|
||||
for (name, count) in rows
|
||||
]
|
||||
payload = schemas_out.TagsList(
|
||||
tags=tags, total=total, has_more=(query.offset + len(tags)) < total
|
||||
|
||||
@ -59,6 +59,11 @@ class ListAssetsQuery(BaseModel):
|
||||
|
||||
limit: conint(ge=1, le=500) = 20
|
||||
offset: conint(ge=0) = 0
|
||||
# Opaque keyset cursor. When supplied, `offset` is ignored. Cursor pagination
|
||||
# is supported for sort values `created_at`, `updated_at`, `name`, `size`.
|
||||
# Supplying `after` together with `sort=last_access_time` returns
|
||||
# 400 INVALID_CURSOR; that sort only supports offset/limit.
|
||||
after: str | None = None
|
||||
|
||||
sort: Literal["name", "created_at", "updated_at", "size", "last_access_time"] = (
|
||||
"created_at"
|
||||
|
||||
@ -10,6 +10,7 @@ class Asset(BaseModel):
|
||||
|
||||
id: str
|
||||
name: str
|
||||
hash: str | None = None
|
||||
asset_hash: str | None = None
|
||||
size: int | None = None
|
||||
mime_type: str | None = None
|
||||
@ -40,12 +41,13 @@ class AssetsList(BaseModel):
|
||||
assets: list[Asset]
|
||||
total: int
|
||||
has_more: bool
|
||||
# Opaque cursor for the next page. Omitted when there are no more results.
|
||||
next_cursor: str | None = None
|
||||
|
||||
|
||||
class TagUsage(BaseModel):
|
||||
name: str
|
||||
count: int
|
||||
type: str
|
||||
|
||||
|
||||
class TagsList(BaseModel):
|
||||
|
||||
@ -227,7 +227,6 @@ class Tag(Base):
|
||||
__tablename__ = "tags"
|
||||
|
||||
name: Mapped[str] = mapped_column(String(512), primary_key=True)
|
||||
tag_type: Mapped[str] = mapped_column(String(32), nullable=False, default="user")
|
||||
|
||||
asset_reference_links: Mapped[list[AssetReferenceTag]] = relationship(
|
||||
back_populates="tag",
|
||||
@ -240,7 +239,5 @@ class Tag(Base):
|
||||
overlaps="asset_reference_links,tag_links,tags,asset_reference",
|
||||
)
|
||||
|
||||
__table_args__ = (Index("ix_tags_tag_type", "tag_type"),)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"<Tag {self.name}>"
|
||||
|
||||
@ -266,9 +266,18 @@ def list_references_page(
|
||||
metadata_filter: dict | None = None,
|
||||
sort: str | None = None,
|
||||
order: str | None = None,
|
||||
after_cursor_value: object | None = None,
|
||||
after_cursor_id: str | None = None,
|
||||
) -> tuple[list[AssetReference], dict[str, list[str]], int]:
|
||||
"""List references with pagination, filtering, and sorting.
|
||||
|
||||
When ``after_cursor_value``/``after_cursor_id`` are supplied the query uses
|
||||
keyset pagination — ``offset`` is ignored and a WHERE clause selects rows
|
||||
strictly after the given ``(sort_col, id)`` position in the active sort
|
||||
direction. The cursor value must already be typed for the column
|
||||
(datetime for time sorts, int for size, str for name); the caller decodes
|
||||
the opaque cursor string and resolves to the typed value.
|
||||
|
||||
Returns (references, tag_map, total_count).
|
||||
"""
|
||||
base = (
|
||||
@ -297,9 +306,31 @@ def list_references_page(
|
||||
"size": Asset.size_bytes,
|
||||
}
|
||||
sort_col = sort_map.get(sort, AssetReference.created_at)
|
||||
sort_exp = sort_col.desc() if order == "desc" else sort_col.asc()
|
||||
descending = order == "desc"
|
||||
|
||||
base = base.order_by(sort_exp).limit(limit).offset(offset)
|
||||
# Keyset WHERE: (sort_col, id) strictly less-than / greater-than the cursor.
|
||||
# Equivalent to: sort_col <op> v OR (sort_col = v AND id <op> cursor_id).
|
||||
if after_cursor_value is not None and after_cursor_id is not None:
|
||||
if descending:
|
||||
keyset = sa.or_(
|
||||
sort_col < after_cursor_value,
|
||||
sa.and_(sort_col == after_cursor_value, AssetReference.id < after_cursor_id),
|
||||
)
|
||||
else:
|
||||
keyset = sa.or_(
|
||||
sort_col > after_cursor_value,
|
||||
sa.and_(sort_col == after_cursor_value, AssetReference.id > after_cursor_id),
|
||||
)
|
||||
base = base.where(keyset)
|
||||
|
||||
# Secondary ORDER BY id (matching the primary direction) gives the keyset
|
||||
# comparison a deterministic tiebreaker on duplicate sort_col values.
|
||||
id_exp = AssetReference.id.desc() if descending else AssetReference.id.asc()
|
||||
sort_exp = sort_col.desc() if descending else sort_col.asc()
|
||||
|
||||
base = base.order_by(sort_exp, id_exp).limit(limit)
|
||||
if after_cursor_id is None:
|
||||
base = base.offset(offset)
|
||||
|
||||
count_stmt = (
|
||||
select(sa.func.count())
|
||||
|
||||
@ -55,13 +55,11 @@ def validate_tags_exist(session: Session, tags: list[str]) -> None:
|
||||
raise ValueError(f"Unknown tags: {missing}")
|
||||
|
||||
|
||||
def ensure_tags_exist(
|
||||
session: Session, names: Iterable[str], tag_type: str = "user"
|
||||
) -> None:
|
||||
def ensure_tags_exist(session: Session, names: Iterable[str]) -> None:
|
||||
wanted = normalize_tags(list(names))
|
||||
if not wanted:
|
||||
return
|
||||
rows = [{"name": n, "tag_type": tag_type} for n in list(dict.fromkeys(wanted))]
|
||||
rows = [{"name": n} for n in list(dict.fromkeys(wanted))]
|
||||
ins = (
|
||||
sqlite.insert(Tag)
|
||||
.values(rows)
|
||||
@ -97,7 +95,7 @@ def set_reference_tags(
|
||||
to_remove = [t for t in current if t not in desired]
|
||||
|
||||
if to_add:
|
||||
ensure_tags_exist(session, to_add, tag_type="user")
|
||||
ensure_tags_exist(session, to_add)
|
||||
session.add_all(
|
||||
[
|
||||
AssetReferenceTag(
|
||||
@ -142,7 +140,7 @@ def add_tags_to_reference(
|
||||
return AddTagsResult(added=[], already_present=[], total_tags=total)
|
||||
|
||||
if create_if_missing:
|
||||
ensure_tags_exist(session, norm, tag_type="user")
|
||||
ensure_tags_exist(session, norm)
|
||||
|
||||
current = set(get_reference_tags(session, reference_id))
|
||||
|
||||
@ -289,7 +287,6 @@ def list_tags_with_usage(
|
||||
q = (
|
||||
select(
|
||||
Tag.name,
|
||||
Tag.tag_type,
|
||||
func.coalesce(counts_sq.c.cnt, 0).label("count"),
|
||||
)
|
||||
.select_from(Tag)
|
||||
@ -331,7 +328,7 @@ def list_tags_with_usage(
|
||||
rows = (session.execute(q.limit(limit).offset(offset))).all()
|
||||
total = (session.execute(total_q)).scalar_one()
|
||||
|
||||
rows_norm = [(name, ttype, int(count or 0)) for (name, ttype, count) in rows]
|
||||
rows_norm = [(name, int(count or 0)) for (name, count) in rows]
|
||||
return rows_norm, int(total or 0)
|
||||
|
||||
|
||||
|
||||
@ -33,6 +33,7 @@ from app.assets.services.file_utils import (
|
||||
verify_file_unchanged,
|
||||
)
|
||||
from app.assets.services.hashing import HashCheckpoint, compute_blake3_hash
|
||||
from app.assets.services.image_dimensions import extract_image_dimensions
|
||||
from app.assets.services.metadata_extract import extract_file_metadata
|
||||
from app.assets.services.path_utils import (
|
||||
compute_relative_filename,
|
||||
@ -354,7 +355,7 @@ def insert_asset_specs(specs: list[SeedAssetSpec], tag_pool: set[str]) -> int:
|
||||
return 0
|
||||
with create_session() as sess:
|
||||
if tag_pool:
|
||||
ensure_tags_exist(sess, tag_pool, tag_type="user")
|
||||
ensure_tags_exist(sess, tag_pool)
|
||||
result = batch_insert_seed_assets(sess, specs=specs, owner_id="")
|
||||
sess.commit()
|
||||
return result.inserted_refs
|
||||
@ -506,6 +507,10 @@ def enrich_asset(
|
||||
|
||||
if extract_metadata and metadata:
|
||||
system_metadata = metadata.to_user_metadata()
|
||||
if mime_type and mime_type.startswith("image/"):
|
||||
dims = extract_image_dimensions(file_path, mime_type=mime_type)
|
||||
if dims:
|
||||
system_metadata.update(dims)
|
||||
set_reference_system_metadata(session, reference_id, system_metadata)
|
||||
|
||||
if full_hash:
|
||||
|
||||
@ -1,8 +1,19 @@
|
||||
import contextlib
|
||||
import mimetypes
|
||||
import os
|
||||
from datetime import timezone
|
||||
from typing import Sequence
|
||||
|
||||
from app.assets.services.cursor import (
|
||||
CursorPayload,
|
||||
InvalidCursorError,
|
||||
decode_cursor,
|
||||
decode_cursor_int,
|
||||
decode_cursor_time,
|
||||
encode_cursor,
|
||||
encode_cursor_from_time,
|
||||
)
|
||||
|
||||
|
||||
from app.assets.database.models import Asset
|
||||
from app.assets.database.queries import (
|
||||
@ -149,6 +160,16 @@ def delete_asset_reference(
|
||||
owner_id: str,
|
||||
delete_content_if_orphan: bool = True,
|
||||
) -> bool:
|
||||
"""Delete an asset reference.
|
||||
|
||||
With ``delete_content_if_orphan=False`` (a soft delete), the reference is
|
||||
hidden and the underlying content is preserved. With ``True``, the content
|
||||
is also removed once it becomes orphaned.
|
||||
|
||||
Note: the public DELETE /api/assets/{id} endpoint always soft-deletes
|
||||
(passes ``False``); the orphan-reclamation path is intentionally
|
||||
internal-only, retained for a future GC/admin caller.
|
||||
"""
|
||||
with create_session() as session:
|
||||
if not delete_content_if_orphan:
|
||||
# Soft delete: mark the reference as deleted but keep everything
|
||||
@ -242,6 +263,11 @@ def get_asset_by_hash(asset_hash: str) -> AssetData | None:
|
||||
return extract_asset_data(asset)
|
||||
|
||||
|
||||
# Sort fields that support cursor pagination. `last_access_time` is not
|
||||
# in this list — it falls back to offset/limit.
|
||||
_CURSOR_SORT_FIELDS = ("created_at", "updated_at", "name", "size")
|
||||
|
||||
|
||||
def list_assets_page(
|
||||
owner_id: str = "",
|
||||
include_tags: Sequence[str] | None = None,
|
||||
@ -252,7 +278,39 @@ def list_assets_page(
|
||||
offset: int = 0,
|
||||
sort: str = "created_at",
|
||||
order: str = "desc",
|
||||
after: str | None = None,
|
||||
) -> ListAssetsResult:
|
||||
"""List assets with optional cursor pagination.
|
||||
|
||||
When ``after`` is supplied it overrides ``offset``. The cursor's sort field
|
||||
must match ``sort`` and be in the cursor-supported allowlist; mismatches
|
||||
raise InvalidCursorError so the handler can map to 400 INVALID_CURSOR.
|
||||
"""
|
||||
cursor_value: object | None = None
|
||||
cursor_id: str | None = None
|
||||
# Mint next_cursor on every page where the sort is cursor-supported, not
|
||||
# only when the request itself arrived with a cursor. Otherwise a first
|
||||
# request (no `after`) returns next_cursor=None and the client can never
|
||||
# enter cursor mode.
|
||||
mint_cursor = sort in _CURSOR_SORT_FIELDS
|
||||
|
||||
if after is not None:
|
||||
if sort not in _CURSOR_SORT_FIELDS:
|
||||
raise InvalidCursorError(
|
||||
f"cursor pagination is not supported for sort={sort!r}"
|
||||
)
|
||||
payload = decode_cursor(after, _CURSOR_SORT_FIELDS, expected_order=order)
|
||||
if payload.sort_field != sort:
|
||||
raise InvalidCursorError(
|
||||
f"cursor sort field {payload.sort_field!r} does not match request sort {sort!r}"
|
||||
)
|
||||
cursor_value, cursor_id = _resolve_cursor_value(payload), payload.id
|
||||
|
||||
# Over-fetch by one row so we can distinguish "exactly `limit` rows total
|
||||
# remaining" from "more rows past this page" without a second query. Drop
|
||||
# the sentinel before returning.
|
||||
fetch_limit = limit + 1 if mint_cursor else limit
|
||||
|
||||
with create_session() as session:
|
||||
refs, tag_map, total = list_references_page(
|
||||
session,
|
||||
@ -261,12 +319,22 @@ def list_assets_page(
|
||||
exclude_tags=exclude_tags,
|
||||
name_contains=name_contains,
|
||||
metadata_filter=metadata_filter,
|
||||
limit=limit,
|
||||
limit=fetch_limit,
|
||||
offset=offset,
|
||||
sort=sort,
|
||||
order=order,
|
||||
after_cursor_value=cursor_value,
|
||||
after_cursor_id=cursor_id,
|
||||
)
|
||||
|
||||
next_cursor: str | None = None
|
||||
if mint_cursor and len(refs) > limit:
|
||||
# There's at least one more row past this page — mint a cursor from
|
||||
# the last row of the page (i.e. index `limit - 1`, since we
|
||||
# over-fetched), and drop the sentinel.
|
||||
next_cursor = _encode_next_cursor(refs[limit - 1], sort, order)
|
||||
refs = refs[:limit]
|
||||
|
||||
items: list[AssetSummaryData] = []
|
||||
for ref in refs:
|
||||
items.append(
|
||||
@ -277,7 +345,39 @@ def list_assets_page(
|
||||
)
|
||||
)
|
||||
|
||||
return ListAssetsResult(items=items, total=total)
|
||||
return ListAssetsResult(items=items, total=total, next_cursor=next_cursor)
|
||||
|
||||
|
||||
def _resolve_cursor_value(payload: CursorPayload) -> object:
|
||||
"""Map a decoded cursor payload to a column-typed Python value."""
|
||||
if payload.sort_field in ("created_at", "updated_at"):
|
||||
# DB stores naive UTC; strip tzinfo so the comparison binds against a
|
||||
# `TIMESTAMP WITHOUT TIME ZONE` column without an offset shift.
|
||||
return decode_cursor_time(payload).replace(tzinfo=None)
|
||||
if payload.sort_field == "size":
|
||||
return decode_cursor_int(payload)
|
||||
return payload.value # name, str-typed
|
||||
|
||||
|
||||
def _encode_next_cursor(ref, sort: str, order: str) -> str | None:
|
||||
"""Mint a cursor pointing at *ref* for the given sort dimension.
|
||||
|
||||
Returns None when the boundary row carries a NULL sort value (e.g. an asset
|
||||
record whose size_bytes hasn't been backfilled). Continuing pagination
|
||||
across a NULL boundary is undefined under keyset ordering — better to
|
||||
truncate cleanly here than to mint a cursor that mis-positions.
|
||||
"""
|
||||
if sort == "name":
|
||||
return encode_cursor("name", ref.name, ref.id, order=order)
|
||||
if sort == "size":
|
||||
if ref.asset is None or ref.asset.size_bytes is None:
|
||||
return None
|
||||
return encode_cursor("size", str(ref.asset.size_bytes), ref.id, order=order)
|
||||
# created_at / updated_at — DB datetimes are naive UTC; attach tz before encoding.
|
||||
value = ref.created_at if sort == "created_at" else ref.updated_at
|
||||
if value is None:
|
||||
return None
|
||||
return encode_cursor_from_time(sort, value.replace(tzinfo=timezone.utc), ref.id, order=order)
|
||||
|
||||
|
||||
def resolve_hash_to_path(
|
||||
|
||||
213
app/assets/services/cursor.py
Normal file
213
app/assets/services/cursor.py
Normal file
@ -0,0 +1,213 @@
|
||||
"""Opaque keyset-pagination cursor for /api/assets.
|
||||
|
||||
Payload JSON uses short keys to keep the encoded length small:
|
||||
|
||||
{"s": <sort_field>, "v": <value>, "id": <id>, "o": <order>}
|
||||
|
||||
The `o` key binds the cursor to the sort direction it was minted under,
|
||||
so replaying a `desc` cursor against an `asc` request fails with
|
||||
``INVALID_CURSOR`` rather than silently walking the wrong direction.
|
||||
`o` is mandatory on every payload — a cursor without it is rejected as
|
||||
malformed.
|
||||
|
||||
Encoding is base64url with no padding. Cursors are opaque tokens: the
|
||||
payload format is internal to this server, and clients must treat a
|
||||
cursor as a black box handed back via `next_cursor`. No byte-level
|
||||
compatibility with any other implementation is required.
|
||||
|
||||
Time values are serialized as Unix microseconds (UTC) — microsecond
|
||||
precision is sufficient to round-trip the timestamps stored by the
|
||||
database without rounding rows in the same millisecond bucket.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from typing import Iterable, Optional
|
||||
|
||||
|
||||
class InvalidCursorError(ValueError):
|
||||
"""Raised on a malformed, oversized, or unsupported-sort-field cursor.
|
||||
|
||||
Map to a 400 response with code ``INVALID_CURSOR`` at the handler.
|
||||
"""
|
||||
|
||||
|
||||
# Wire-format length caps. Cursors are user-controlled, so caps protect the
|
||||
# decode path from oversized allocations and downstream SQL predicates from
|
||||
# unbounded strings.
|
||||
#
|
||||
# MAX_CURSOR_VALUE_LENGTH is 512 to fit the `AssetReference.name` column max
|
||||
# (`String(512)`) — otherwise a long-named asset would mint a cursor the same
|
||||
# server then refuses on the next request.
|
||||
#
|
||||
# MAX_ENCODED_CURSOR_LENGTH is the decode-path guard, sized comfortably above
|
||||
# the largest cursor the per-field caps can produce. Worst case is value + id
|
||||
# at their caps with every character JSON-escaping to the six-byte `\uXXXX`
|
||||
# form (control characters), which is ~5.2 KB once base64url-encoded. At 8192
|
||||
# the encoder can never mint a cursor that exceeds it, so a freshly minted
|
||||
# cursor always decodes on the next request and there is no user-visible
|
||||
# "cursor too long" failure.
|
||||
MAX_ENCODED_CURSOR_LENGTH = 8192
|
||||
MAX_CURSOR_VALUE_LENGTH = 512
|
||||
MAX_CURSOR_ID_LENGTH = 128
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class CursorPayload:
|
||||
sort_field: str
|
||||
value: str
|
||||
id: str
|
||||
order: str
|
||||
|
||||
|
||||
_VALID_ORDERS = ("asc", "desc")
|
||||
|
||||
|
||||
def encode_cursor(sort_field: str, value: str, id: str, order: str = "desc") -> str:
|
||||
"""Encode a cursor payload as a base64url (no-padding) string.
|
||||
|
||||
`order` binds the cursor to the sort direction it was minted under so a
|
||||
later request with a flipped `order` query parameter is rejected with
|
||||
``INVALID_CURSOR`` rather than silently walking the wrong direction.
|
||||
"""
|
||||
if order not in _VALID_ORDERS:
|
||||
raise InvalidCursorError(f"order must be one of {_VALID_ORDERS}, got {order!r}")
|
||||
# Symmetric input validation: the encoder must reject anything the
|
||||
# decoder rejects, or the same server will mint cursors it then 400s on
|
||||
# the next request.
|
||||
if not id:
|
||||
raise InvalidCursorError("id must be non-empty")
|
||||
if len(id) > MAX_CURSOR_ID_LENGTH:
|
||||
raise InvalidCursorError("id exceeds maximum length")
|
||||
if len(value) > MAX_CURSOR_VALUE_LENGTH:
|
||||
raise InvalidCursorError("value exceeds maximum length")
|
||||
payload = {"s": sort_field, "v": value, "id": id, "o": order}
|
||||
raw = json.dumps(payload, separators=(",", ":"), ensure_ascii=False)
|
||||
# No mint-time length guard is needed: the per-field caps above bound the
|
||||
# encoded length well below MAX_ENCODED_CURSOR_LENGTH (see its definition),
|
||||
# so the encoder can never produce a cursor the decode path would reject.
|
||||
return base64.urlsafe_b64encode(raw.encode("utf-8")).rstrip(b"=").decode("ascii")
|
||||
|
||||
|
||||
def encode_cursor_from_time(sort_field: str, t: datetime, id: str, order: str = "desc") -> str:
|
||||
"""Encode a time-typed cursor at Unix microsecond precision.
|
||||
|
||||
Accepts an aware datetime (any timezone) and normalizes to UTC. Naive
|
||||
datetimes are rejected so callers can't accidentally encode the local
|
||||
wall-clock value of a UTC-stored timestamp.
|
||||
"""
|
||||
if t.tzinfo is None:
|
||||
raise ValueError("encode_cursor_from_time requires an aware datetime")
|
||||
micros = _datetime_to_unix_micros(t.astimezone(timezone.utc))
|
||||
return encode_cursor(sort_field, str(micros), id, order=order)
|
||||
|
||||
|
||||
def decode_cursor(
|
||||
cursor: str,
|
||||
allowed_sort_fields: Iterable[str],
|
||||
expected_order: str | None = None,
|
||||
) -> CursorPayload:
|
||||
"""Parse an opaque cursor.
|
||||
|
||||
``allowed_sort_fields`` is the endpoint's accepted sort-field list — a
|
||||
cursor carrying a field outside this set is rejected so a cursor minted
|
||||
for one column can't be replayed against another (e.g. a ``created_at``
|
||||
timestamp string compared against a ``name`` column).
|
||||
|
||||
``expected_order`` (``"asc"``/``"desc"``), when supplied, must match the
|
||||
payload's ``o`` field. ``o`` is required on every payload; a cursor
|
||||
missing it is rejected as malformed.
|
||||
|
||||
Passing no allowed fields rejects every cursor.
|
||||
"""
|
||||
if len(cursor) > MAX_ENCODED_CURSOR_LENGTH:
|
||||
raise InvalidCursorError("cursor exceeds maximum length")
|
||||
|
||||
try:
|
||||
# urlsafe_b64decode requires correct padding; we strip on encode, so
|
||||
# restore the trailing '=' pad here.
|
||||
padding = "=" * (-len(cursor) % 4)
|
||||
raw = base64.urlsafe_b64decode(cursor + padding)
|
||||
except (ValueError, base64.binascii.Error) as e:
|
||||
raise InvalidCursorError(f"encoding: {e}") from e
|
||||
|
||||
try:
|
||||
decoded = json.loads(raw)
|
||||
except (json.JSONDecodeError, UnicodeDecodeError) as e:
|
||||
raise InvalidCursorError(f"payload: {e}") from e
|
||||
|
||||
if not isinstance(decoded, dict):
|
||||
raise InvalidCursorError("payload: expected object")
|
||||
|
||||
sort_field = decoded.get("s")
|
||||
value = decoded.get("v")
|
||||
id = decoded.get("id")
|
||||
order = decoded.get("o")
|
||||
|
||||
if not isinstance(sort_field, str) or not isinstance(value, str) or not isinstance(id, str):
|
||||
raise InvalidCursorError("payload: missing or non-string s/v/id")
|
||||
|
||||
if id == "":
|
||||
raise InvalidCursorError("missing id")
|
||||
if len(id) > MAX_CURSOR_ID_LENGTH:
|
||||
raise InvalidCursorError("id exceeds maximum length")
|
||||
if len(value) > MAX_CURSOR_VALUE_LENGTH:
|
||||
raise InvalidCursorError("value exceeds maximum length")
|
||||
|
||||
if sort_field not in allowed_sort_fields:
|
||||
raise InvalidCursorError(f"unsupported sort field {sort_field!r}")
|
||||
|
||||
if not isinstance(order, str):
|
||||
raise InvalidCursorError("missing or non-string o")
|
||||
if order not in _VALID_ORDERS:
|
||||
raise InvalidCursorError(f"unsupported order {order!r}")
|
||||
if expected_order is not None and order != expected_order:
|
||||
raise InvalidCursorError(
|
||||
f"cursor order {order!r} does not match request order {expected_order!r}"
|
||||
)
|
||||
|
||||
return CursorPayload(sort_field=sort_field, value=value, id=id, order=order)
|
||||
|
||||
|
||||
def decode_cursor_time(payload: Optional[CursorPayload]) -> datetime:
|
||||
"""Parse a time-typed cursor value as Unix microseconds, returning UTC."""
|
||||
if payload is None:
|
||||
raise InvalidCursorError("nil cursor payload")
|
||||
try:
|
||||
micros = int(payload.value)
|
||||
except ValueError as e:
|
||||
raise InvalidCursorError(f"value is not a valid timestamp: {e}") from e
|
||||
try:
|
||||
return _unix_micros_to_datetime(micros)
|
||||
except (OverflowError, OSError, ValueError) as e:
|
||||
# Crafted out-of-range microseconds (e.g. > datetime.MAX_YEAR) blow up
|
||||
# in fromtimestamp / datetime construction. Map to 400, not 500.
|
||||
raise InvalidCursorError(f"value is out of representable range: {e}") from e
|
||||
|
||||
|
||||
def decode_cursor_int(payload: Optional[CursorPayload]) -> int:
|
||||
"""Parse a cursor value as a base-10 integer."""
|
||||
if payload is None:
|
||||
raise InvalidCursorError("nil cursor payload")
|
||||
try:
|
||||
return int(payload.value)
|
||||
except ValueError as e:
|
||||
raise InvalidCursorError(f"value is not a valid integer: {e}") from e
|
||||
|
||||
|
||||
_EPOCH = datetime(1970, 1, 1, tzinfo=timezone.utc)
|
||||
|
||||
|
||||
def _datetime_to_unix_micros(t: datetime) -> int:
|
||||
"""Convert an aware UTC datetime to Unix microseconds (integer math)."""
|
||||
delta = t - _EPOCH
|
||||
return (delta.days * 86_400 + delta.seconds) * 1_000_000 + delta.microseconds
|
||||
|
||||
|
||||
def _unix_micros_to_datetime(micros: int) -> datetime:
|
||||
"""Convert Unix microseconds to a UTC datetime, preserving precision."""
|
||||
seconds, micro_remainder = divmod(micros, 1_000_000)
|
||||
return datetime.fromtimestamp(seconds, tz=timezone.utc).replace(microsecond=micro_remainder)
|
||||
63
app/assets/services/image_dimensions.py
Normal file
63
app/assets/services/image_dimensions.py
Normal file
@ -0,0 +1,63 @@
|
||||
"""Image dimension extraction for asset ingest.
|
||||
|
||||
Reads only the image header via Pillow to capture width/height cheaply,
|
||||
without a full pixel decode. Returns a metadata dict suitable for merging
|
||||
into ``AssetReference.system_metadata``.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def extract_image_dimensions(
|
||||
file_path: str, mime_type: str | None = None
|
||||
) -> dict[str, Any] | None:
|
||||
"""Extract image dimensions for the file at ``file_path``.
|
||||
|
||||
Args:
|
||||
file_path: Absolute path to a file on disk.
|
||||
mime_type: Optional MIME type hint. When provided and not prefixed
|
||||
with ``image/``, extraction is skipped without touching the file.
|
||||
|
||||
Returns:
|
||||
``{"kind": "image", "width": W, "height": H}`` when the file is a
|
||||
recognizable image with positive dimensions, otherwise ``None``.
|
||||
|
||||
The dict shape is intended to be merged into ``system_metadata`` so the
|
||||
asset response surfaces ``metadata.kind`` plus dimension fields for image
|
||||
assets. Forward-compatible: future media kinds (e.g. ``"video"`` with
|
||||
duration/fps) can extend this shape without schema changes.
|
||||
"""
|
||||
if mime_type is not None and not mime_type.startswith("image/"):
|
||||
return None
|
||||
|
||||
try:
|
||||
from PIL import Image, UnidentifiedImageError
|
||||
except ImportError:
|
||||
logger.debug(
|
||||
"Pillow not available; skipping image dimension extraction for %s",
|
||||
file_path,
|
||||
)
|
||||
return None
|
||||
|
||||
try:
|
||||
with Image.open(file_path) as img:
|
||||
width, height = img.size
|
||||
except (OSError, UnidentifiedImageError, ValueError) as exc:
|
||||
logger.debug(
|
||||
"Failed to read image dimensions from %s: %s", file_path, exc
|
||||
)
|
||||
return None
|
||||
|
||||
if (
|
||||
not isinstance(width, int)
|
||||
or not isinstance(height, int)
|
||||
or width <= 0
|
||||
or height <= 0
|
||||
):
|
||||
return None
|
||||
|
||||
return {"kind": "image", "width": width, "height": height}
|
||||
@ -17,9 +17,11 @@ from app.assets.database.queries import (
|
||||
get_reference_by_file_path,
|
||||
get_reference_tags,
|
||||
get_or_create_reference,
|
||||
list_references_by_asset_id,
|
||||
reference_exists,
|
||||
remove_missing_tag_for_asset_id,
|
||||
set_reference_metadata,
|
||||
set_reference_system_metadata,
|
||||
set_reference_tags,
|
||||
update_asset_hash_and_mime,
|
||||
upsert_asset,
|
||||
@ -29,6 +31,7 @@ from app.assets.database.queries import (
|
||||
from app.assets.helpers import get_utc_now, normalize_tags
|
||||
from app.assets.services.bulk_ingest import batch_insert_seed_assets
|
||||
from app.assets.services.file_utils import get_size_and_mtime_ns
|
||||
from app.assets.services.image_dimensions import extract_image_dimensions
|
||||
from app.assets.services.path_utils import (
|
||||
compute_relative_filename,
|
||||
get_name_and_tags_from_asset_path,
|
||||
@ -118,6 +121,14 @@ def _ingest_file_from_path(
|
||||
user_metadata=user_metadata,
|
||||
)
|
||||
|
||||
_maybe_store_image_dimensions(
|
||||
session,
|
||||
reference_id=reference_id,
|
||||
file_path=locator,
|
||||
mime_type=mime_type,
|
||||
current_system_metadata=ref.system_metadata,
|
||||
)
|
||||
|
||||
try:
|
||||
remove_missing_tag_for_asset_id(session, asset_id=asset.id)
|
||||
except Exception:
|
||||
@ -288,6 +299,13 @@ def _register_existing_asset(
|
||||
user_metadata=new_meta,
|
||||
)
|
||||
|
||||
_backfill_image_dimensions_from_siblings(
|
||||
session,
|
||||
asset_id=asset.id,
|
||||
new_reference_id=ref.id,
|
||||
current_system_metadata=ref.system_metadata,
|
||||
)
|
||||
|
||||
if tags is not None:
|
||||
set_reference_tags(
|
||||
session,
|
||||
@ -334,6 +352,87 @@ def _update_metadata_with_filename(
|
||||
)
|
||||
|
||||
|
||||
_IMAGE_DIMENSION_KEYS = ("kind", "width", "height")
|
||||
|
||||
|
||||
def _maybe_store_image_dimensions(
|
||||
session: Session,
|
||||
reference_id: str,
|
||||
file_path: str,
|
||||
mime_type: str | None,
|
||||
current_system_metadata: dict | None,
|
||||
) -> None:
|
||||
"""Populate ``kind``/``width``/``height`` on system_metadata for image refs.
|
||||
|
||||
Non-image MIME types are a no-op. Pre-existing keys (e.g. enricher-written
|
||||
safetensors metadata, download provenance) are preserved by merge.
|
||||
"""
|
||||
if not mime_type or not mime_type.startswith("image/"):
|
||||
return
|
||||
|
||||
dims = extract_image_dimensions(file_path, mime_type=mime_type)
|
||||
if not dims:
|
||||
return
|
||||
|
||||
current = current_system_metadata or {}
|
||||
merged = dict(current)
|
||||
merged.update(dims)
|
||||
if merged != current:
|
||||
set_reference_system_metadata(
|
||||
session,
|
||||
reference_id=reference_id,
|
||||
system_metadata=merged,
|
||||
)
|
||||
|
||||
|
||||
def _backfill_image_dimensions_from_siblings(
|
||||
session: Session,
|
||||
asset_id: str,
|
||||
new_reference_id: str,
|
||||
current_system_metadata: dict | None,
|
||||
) -> None:
|
||||
"""Copy image dimension keys from any sibling reference of the same asset.
|
||||
|
||||
The from-hash path doesn't read the file bytes, so dimensions can't be
|
||||
extracted there directly. When another reference of the same asset already
|
||||
carries image dimensions, copy them onto the new reference so consumers
|
||||
see consistent metadata regardless of how the asset was registered.
|
||||
|
||||
Best-effort: missing siblings, non-image siblings, or absent dimension
|
||||
keys leave the target reference unchanged.
|
||||
"""
|
||||
current = current_system_metadata or {}
|
||||
if current.get("kind") == "image" and "width" in current and "height" in current:
|
||||
return
|
||||
|
||||
for sibling in list_references_by_asset_id(session, asset_id):
|
||||
if sibling.id == new_reference_id:
|
||||
continue
|
||||
meta = sibling.system_metadata or {}
|
||||
if meta.get("kind") != "image":
|
||||
continue
|
||||
width = meta.get("width")
|
||||
height = meta.get("height")
|
||||
if (
|
||||
type(width) is not int
|
||||
or type(height) is not int
|
||||
or width <= 0
|
||||
or height <= 0
|
||||
):
|
||||
continue
|
||||
merged = dict(current)
|
||||
merged["kind"] = "image"
|
||||
merged["width"] = width
|
||||
merged["height"] = height
|
||||
if merged != current:
|
||||
set_reference_system_metadata(
|
||||
session,
|
||||
reference_id=new_reference_id,
|
||||
system_metadata=merged,
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
def _sanitize_filename(name: str | None, fallback: str) -> str:
|
||||
n = os.path.basename((name or "").strip() or fallback)
|
||||
return n if n else fallback
|
||||
|
||||
@ -4,7 +4,6 @@ Tier 1: Filesystem metadata (zero parsing)
|
||||
Tier 2: Safetensors header metadata (fast JSON read only)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
|
||||
@ -56,7 +56,6 @@ class IngestResult:
|
||||
|
||||
class TagUsage(NamedTuple):
|
||||
name: str
|
||||
tag_type: str
|
||||
count: int
|
||||
|
||||
|
||||
@ -71,6 +70,7 @@ class AssetSummaryData:
|
||||
class ListAssetsResult:
|
||||
items: list[AssetSummaryData]
|
||||
total: int
|
||||
next_cursor: str | None = None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
|
||||
@ -75,7 +75,7 @@ def list_tags(
|
||||
owner_id=owner_id,
|
||||
)
|
||||
|
||||
return [TagUsage(name, tag_type, count) for name, tag_type, count in rows], total
|
||||
return [TagUsage(name, count) for name, count in rows], total
|
||||
|
||||
|
||||
def list_tag_histogram(
|
||||
|
||||
@ -1,5 +1,3 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import folder_paths
|
||||
import glob
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
@ -62,6 +61,8 @@ def get_comfy_package_versions():
|
||||
def check_comfy_packages_versions():
|
||||
"""Warn for every comfy* package whose installed version is below requirements.txt."""
|
||||
from packaging.version import InvalidVersion, parse as parse_pep440
|
||||
outdated_packages = []
|
||||
|
||||
for pkg in get_comfy_package_versions():
|
||||
installed_str = pkg["installed"]
|
||||
required_str = pkg["required"]
|
||||
@ -73,19 +74,26 @@ def check_comfy_packages_versions():
|
||||
logging.error(f"Failed to check {pkg['name']} version: {e}")
|
||||
continue
|
||||
if outdated:
|
||||
app.logger.log_startup_warning(
|
||||
f"""
|
||||
outdated_packages.append((pkg["name"], installed_str, required_str))
|
||||
else:
|
||||
logging.info("{} version: {}".format(pkg["name"], installed_str))
|
||||
|
||||
if outdated_packages:
|
||||
package_warnings = "\n".join(
|
||||
f"Installed {name} version {installed} is lower than the recommended version {required}."
|
||||
for name, installed, required in outdated_packages
|
||||
)
|
||||
app.logger.log_startup_warning(
|
||||
f"""
|
||||
________________________________________________________________________
|
||||
WARNING WARNING WARNING WARNING WARNING
|
||||
|
||||
Installed {pkg["name"]} version {installed_str} is lower than the recommended version {required_str}.
|
||||
{package_warnings}
|
||||
|
||||
{get_missing_requirements_message()}
|
||||
________________________________________________________________________
|
||||
""".strip()
|
||||
)
|
||||
else:
|
||||
logging.info("{} version: {}".format(pkg["name"], installed_str))
|
||||
)
|
||||
|
||||
|
||||
REQUEST_TIMEOUT = 10 # seconds
|
||||
|
||||
@ -5,6 +5,40 @@ import logging
|
||||
import sys
|
||||
import threading
|
||||
|
||||
ANSI_NAMED_COLORS = {
|
||||
'black': '\033[30m',
|
||||
'red': '\033[31m',
|
||||
'green': '\033[32m',
|
||||
'yellow': '\033[33m',
|
||||
'blue': '\033[34m',
|
||||
'magenta': '\033[35m',
|
||||
'cyan': '\033[36m',
|
||||
'white': '\033[37m',
|
||||
}
|
||||
|
||||
ANSI_LEVEL_COLORS = {
|
||||
'DEBUG': ANSI_NAMED_COLORS['cyan'],
|
||||
'INFO': ANSI_NAMED_COLORS['green'],
|
||||
'WARNING': ANSI_NAMED_COLORS['yellow'],
|
||||
'ERROR': ANSI_NAMED_COLORS['red'],
|
||||
'CRITICAL': ANSI_NAMED_COLORS['magenta'],
|
||||
}
|
||||
|
||||
ANSI_RESET = '\033[0m'
|
||||
ANSI_BOLD = '\033[1m'
|
||||
|
||||
|
||||
class ColoredFormatter(logging.Formatter):
|
||||
def format(self, record):
|
||||
color = ANSI_LEVEL_COLORS.get(record.levelname, '')
|
||||
bold = ANSI_BOLD if record.levelno >= logging.WARNING else ''
|
||||
level_tag = f"{bold}{color}[{record.levelname}]{ANSI_RESET} "
|
||||
message = super().format(record)
|
||||
line_color = ANSI_NAMED_COLORS.get(getattr(record, 'color', ''), '')
|
||||
if line_color:
|
||||
return f"{level_tag}{line_color}{message}{ANSI_RESET}"
|
||||
return level_tag + message
|
||||
|
||||
logs = None
|
||||
stdout_interceptor = None
|
||||
stderr_interceptor = None
|
||||
@ -68,8 +102,10 @@ def setup_logger(log_level: str = 'INFO', capacity: int = 300, use_stdout: bool
|
||||
logger = logging.getLogger()
|
||||
logger.setLevel(log_level)
|
||||
|
||||
formatter = ColoredFormatter("%(message)s")
|
||||
|
||||
stream_handler = logging.StreamHandler()
|
||||
stream_handler.setFormatter(logging.Formatter("%(message)s"))
|
||||
stream_handler.setFormatter(formatter)
|
||||
|
||||
if use_stdout:
|
||||
# Only errors and critical to stderr
|
||||
@ -77,7 +113,7 @@ def setup_logger(log_level: str = 'INFO', capacity: int = 300, use_stdout: bool
|
||||
|
||||
# Lesser to stdout
|
||||
stdout_handler = logging.StreamHandler(sys.stdout)
|
||||
stdout_handler.setFormatter(logging.Formatter("%(message)s"))
|
||||
stdout_handler.setFormatter(formatter)
|
||||
stdout_handler.addFilter(lambda record: record.levelno < logging.ERROR)
|
||||
logger.addHandler(stdout_handler)
|
||||
|
||||
|
||||
@ -1,5 +1,3 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import base64
|
||||
import json
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
from __future__ import annotations
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
||||
2091
blueprints/Audio Generation (Stable Audio 3 Medium Base).json
Normal file
2091
blueprints/Audio Generation (Stable Audio 3 Medium Base).json
Normal file
File diff suppressed because one or more lines are too long
2091
blueprints/Audio Generation (Stable Audio 3 Medium).json
Normal file
2091
blueprints/Audio Generation (Stable Audio 3 Medium).json
Normal file
File diff suppressed because one or more lines are too long
@ -1553,7 +1553,7 @@
|
||||
"VHS_MetadataImage": true,
|
||||
"VHS_KeepIntermediate": true
|
||||
},
|
||||
"category": "Image generation and editing/Canny to image",
|
||||
"category": "Image generation and editing/Conditioned",
|
||||
"description": "Generates an image from a Canny edge map using Z-Image-Turbo, with text conditioning."
|
||||
}
|
||||
]
|
||||
|
||||
@ -3600,7 +3600,7 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Video generation and editing/Canny to video",
|
||||
"category": "Video generation and editing/Conditioned",
|
||||
"description": "Generates video from Canny edge maps using LTX-2, with optional synchronized audio."
|
||||
}
|
||||
]
|
||||
|
||||
@ -1401,7 +1401,7 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image generation and editing/ControlNet",
|
||||
"category": "Image generation and editing/Conditioned",
|
||||
"description": "Generates images from a text prompt and ControlNet conditioning (e.g. depth, canny) using Z-Image-Turbo."
|
||||
}
|
||||
]
|
||||
|
||||
@ -1579,7 +1579,7 @@
|
||||
"VHS_MetadataImage": true,
|
||||
"VHS_KeepIntermediate": true
|
||||
},
|
||||
"category": "Image generation and editing/Depth to image",
|
||||
"category": "Image generation and editing/Conditioned",
|
||||
"description": "Generates an image from a depth map using Z-Image-Turbo with text conditioning."
|
||||
},
|
||||
{
|
||||
|
||||
@ -4233,7 +4233,7 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Video generation and editing/Depth to video",
|
||||
"category": "Video generation and editing/Conditioned",
|
||||
"description": "Generates depth-controlled video with LTX-2: motion and structure follow a depth-reference video alongside text prompting, optional first-frame image conditioning, with optional synchronized audio."
|
||||
},
|
||||
{
|
||||
|
||||
@ -3350,7 +3350,7 @@
|
||||
}
|
||||
],
|
||||
"extra": {},
|
||||
"category": "Video generation and editing/First-Last-Frame to Video",
|
||||
"category": "Video generation and editing/Conditioned",
|
||||
"description": "Generates a video interpolating between first and last keyframes using LTX-2.3."
|
||||
}
|
||||
]
|
||||
|
||||
@ -3350,7 +3350,7 @@
|
||||
}
|
||||
],
|
||||
"extra": {},
|
||||
"category": "Video generation and editing/First-Last-Frame to Video",
|
||||
"category": "Video generation and editing/FLF2V",
|
||||
"description": "Generates a video that interpolates between the first and last keyframes using LTX-2.3, including optional audio."
|
||||
}
|
||||
]
|
||||
|
||||
1266
blueprints/Geometry Estimation (MoGe).json
Normal file
1266
blueprints/Geometry Estimation (MoGe).json
Normal file
File diff suppressed because it is too large
Load Diff
@ -310,9 +310,9 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Text generation/Image Captioning",
|
||||
"category": "Image Tools",
|
||||
"description": "Generates descriptive captions for images using Google's Gemini multimodal LLM."
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1,19 +1,18 @@
|
||||
{
|
||||
"id": "6af0a6c1-0161-4528-8685-65776e838d44",
|
||||
"revision": 0,
|
||||
"last_node_id": 75,
|
||||
"last_link_id": 245,
|
||||
"last_node_id": 76,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 75,
|
||||
"type": "488652fd-6edf-4d06-8f9f-4d84d3a34eaf",
|
||||
"id": 76,
|
||||
"type": "96338968-1242-4f02-b6a1-d496af4bcffe",
|
||||
"pos": [
|
||||
600,
|
||||
830
|
||||
670,
|
||||
1280
|
||||
],
|
||||
"size": [
|
||||
400,
|
||||
110
|
||||
201.3125
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
@ -59,47 +58,44 @@
|
||||
"links": []
|
||||
}
|
||||
],
|
||||
"title": "Image Depth Estimation (Lotus Depth)",
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"-1",
|
||||
"28",
|
||||
"sigma"
|
||||
],
|
||||
[
|
||||
"-1",
|
||||
"10",
|
||||
"unet_name"
|
||||
],
|
||||
[
|
||||
"-1",
|
||||
"14",
|
||||
"vae_name"
|
||||
]
|
||||
],
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.14.1"
|
||||
},
|
||||
"widgets_values": [
|
||||
999.0000000000002,
|
||||
"lotus-depth-d-v1-1.safetensors",
|
||||
"vae-ft-mse-840000-ema-pruned.safetensors"
|
||||
]
|
||||
"widgets_values": []
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"groups": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "488652fd-6edf-4d06-8f9f-4d84d3a34eaf",
|
||||
"id": "96338968-1242-4f02-b6a1-d496af4bcffe",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 1,
|
||||
"lastNodeId": 75,
|
||||
"lastNodeId": 76,
|
||||
"lastLinkId": 245,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "Image to Depth Map (Lotus)",
|
||||
"name": "Image Depth Estimation (Lotus Depth)",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
@ -191,12 +187,12 @@
|
||||
"id": 10,
|
||||
"type": "UNETLoader",
|
||||
"pos": [
|
||||
108.05555555555557,
|
||||
-253.05555555555557
|
||||
110,
|
||||
-250
|
||||
],
|
||||
"size": [
|
||||
254.93706597222226,
|
||||
82
|
||||
260,
|
||||
90
|
||||
],
|
||||
"flags": {},
|
||||
"order": 4,
|
||||
@ -234,9 +230,9 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "UNETLoader",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.34",
|
||||
"Node name for S&R": "UNETLoader",
|
||||
"models": [
|
||||
{
|
||||
"name": "lotus-depth-d-v1-1.safetensors",
|
||||
@ -255,12 +251,12 @@
|
||||
"id": 18,
|
||||
"type": "DisableNoise",
|
||||
"pos": [
|
||||
607.0641494069639,
|
||||
-268.33337840371513
|
||||
610,
|
||||
-270
|
||||
],
|
||||
"size": [
|
||||
175,
|
||||
33.333333333333336
|
||||
180,
|
||||
40
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
@ -278,26 +274,25 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "DisableNoise",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.34",
|
||||
"Node name for S&R": "DisableNoise",
|
||||
"widget_ue_connectable": {}
|
||||
},
|
||||
"widgets_values": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 23,
|
||||
"id": 74,
|
||||
"type": "VAEEncode",
|
||||
"pos": [
|
||||
620,
|
||||
160
|
||||
],
|
||||
"size": [
|
||||
175,
|
||||
180,
|
||||
50
|
||||
],
|
||||
"flags": {},
|
||||
"order": 10,
|
||||
"order": 11,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -325,12 +320,11 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "VAEEncode",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.34",
|
||||
"Node name for S&R": "VAEEncode",
|
||||
"widget_ue_connectable": {}
|
||||
},
|
||||
"widgets_values": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 21,
|
||||
@ -341,7 +335,7 @@
|
||||
],
|
||||
"size": [
|
||||
210,
|
||||
58
|
||||
60
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
@ -369,9 +363,9 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "KSamplerSelect",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.34",
|
||||
"Node name for S&R": "KSamplerSelect",
|
||||
"widget_ue_connectable": {}
|
||||
},
|
||||
"widgets_values": [
|
||||
@ -386,7 +380,7 @@
|
||||
-170
|
||||
],
|
||||
"size": [
|
||||
175,
|
||||
180,
|
||||
50
|
||||
],
|
||||
"flags": {},
|
||||
@ -418,12 +412,11 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "BasicGuider",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.34",
|
||||
"Node name for S&R": "BasicGuider",
|
||||
"widget_ue_connectable": {}
|
||||
},
|
||||
"widgets_values": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 16,
|
||||
@ -433,8 +426,8 @@
|
||||
-130
|
||||
],
|
||||
"size": [
|
||||
295.99609375,
|
||||
271.65798611111114
|
||||
300,
|
||||
280
|
||||
],
|
||||
"flags": {},
|
||||
"order": 6,
|
||||
@ -490,12 +483,11 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "SamplerCustomAdvanced",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.34",
|
||||
"Node name for S&R": "SamplerCustomAdvanced",
|
||||
"widget_ue_connectable": {}
|
||||
},
|
||||
"widgets_values": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 28,
|
||||
@ -506,10 +498,10 @@
|
||||
],
|
||||
"size": [
|
||||
210,
|
||||
58
|
||||
60
|
||||
],
|
||||
"flags": {},
|
||||
"order": 11,
|
||||
"order": 10,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -540,9 +532,9 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "SetFirstSigma",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.34",
|
||||
"Node name for S&R": "SetFirstSigma",
|
||||
"widget_ue_connectable": {}
|
||||
},
|
||||
"widgets_values": [
|
||||
@ -557,7 +549,7 @@
|
||||
-120
|
||||
],
|
||||
"size": [
|
||||
175,
|
||||
180,
|
||||
50
|
||||
],
|
||||
"flags": {},
|
||||
@ -589,12 +581,11 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "VAEDecode",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.34",
|
||||
"Node name for S&R": "VAEDecode",
|
||||
"widget_ue_connectable": {}
|
||||
},
|
||||
"widgets_values": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 22,
|
||||
@ -604,8 +595,8 @@
|
||||
-220
|
||||
],
|
||||
"size": [
|
||||
175,
|
||||
33.333333333333336
|
||||
180,
|
||||
40
|
||||
],
|
||||
"flags": {},
|
||||
"order": 9,
|
||||
@ -630,12 +621,11 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "ImageInvert",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.34",
|
||||
"Node name for S&R": "ImageInvert",
|
||||
"widget_ue_connectable": {}
|
||||
},
|
||||
"widgets_values": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
@ -645,8 +635,8 @@
|
||||
-90
|
||||
],
|
||||
"size": [
|
||||
254.93706597222226,
|
||||
58
|
||||
260,
|
||||
60
|
||||
],
|
||||
"flags": {},
|
||||
"order": 5,
|
||||
@ -675,9 +665,9 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "VAELoader",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.34",
|
||||
"Node name for S&R": "VAELoader",
|
||||
"models": [
|
||||
{
|
||||
"name": "vae-ft-mse-840000-ema-pruned.safetensors",
|
||||
@ -692,15 +682,15 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 68,
|
||||
"id": 75,
|
||||
"type": "LotusConditioning",
|
||||
"pos": [
|
||||
400,
|
||||
-150
|
||||
],
|
||||
"size": [
|
||||
175,
|
||||
33.333333333333336
|
||||
180,
|
||||
40
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
@ -718,12 +708,11 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "LotusConditioning",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.34",
|
||||
"Node name for S&R": "LotusConditioning",
|
||||
"widget_ue_connectable": {}
|
||||
},
|
||||
"widgets_values": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 20,
|
||||
@ -734,7 +723,7 @@
|
||||
],
|
||||
"size": [
|
||||
210,
|
||||
106
|
||||
110
|
||||
],
|
||||
"flags": {},
|
||||
"order": 8,
|
||||
@ -786,9 +775,9 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "BasicScheduler",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.34",
|
||||
"Node name for S&R": "BasicScheduler",
|
||||
"widget_ue_connectable": {}
|
||||
},
|
||||
"widgets_values": [
|
||||
@ -850,7 +839,7 @@
|
||||
},
|
||||
{
|
||||
"id": 201,
|
||||
"origin_id": 23,
|
||||
"origin_id": 74,
|
||||
"origin_slot": 0,
|
||||
"target_id": 16,
|
||||
"target_slot": 4,
|
||||
@ -866,7 +855,7 @@
|
||||
},
|
||||
{
|
||||
"id": 238,
|
||||
"origin_id": 68,
|
||||
"origin_id": 75,
|
||||
"origin_slot": 0,
|
||||
"target_id": 19,
|
||||
"target_slot": 1,
|
||||
@ -892,7 +881,7 @@
|
||||
"id": 38,
|
||||
"origin_id": 14,
|
||||
"origin_slot": 0,
|
||||
"target_id": 23,
|
||||
"target_id": 74,
|
||||
"target_slot": 1,
|
||||
"type": "VAE"
|
||||
},
|
||||
@ -908,7 +897,7 @@
|
||||
"id": 37,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 23,
|
||||
"target_id": 74,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
@ -948,12 +937,11 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image generation and editing/Depth to image",
|
||||
"category": "Conditioning & Preprocessors/Depth",
|
||||
"description": "Estimates a monocular depth map from an input image using the Lotus depth estimation model."
|
||||
}
|
||||
]
|
||||
},
|
||||
"config": {},
|
||||
"extra": {
|
||||
"ds": {
|
||||
"scale": 1.3589709866044692,
|
||||
@ -961,8 +949,6 @@
|
||||
-138.53613935617864,
|
||||
-786.0629126022195
|
||||
]
|
||||
},
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"version": 0.4
|
||||
}
|
||||
}
|
||||
}
|
||||
1154
blueprints/Image Depth Estimation (MoGe).json
Normal file
1154
blueprints/Image Depth Estimation (MoGe).json
Normal file
File diff suppressed because it is too large
Load Diff
779
blueprints/Image Face Detection (Mediapipe).json
Normal file
779
blueprints/Image Face Detection (Mediapipe).json
Normal file
@ -0,0 +1,779 @@
|
||||
{
|
||||
"revision": 0,
|
||||
"last_node_id": 33,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 33,
|
||||
"type": "6062babb-b649-4a71-be9e-20ebce567744",
|
||||
"pos": [
|
||||
-450,
|
||||
4240
|
||||
],
|
||||
"size": [
|
||||
420,
|
||||
400
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "face_landmarker",
|
||||
"type": "FACE_LANDMARKER",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "detector_variant",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "detector_variant"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "num_faces",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "num_faces"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "custom_face_oval",
|
||||
"name": "regions.face_oval",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "regions.face_oval"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "custom_lips",
|
||||
"name": "regions.lips",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "regions.lips"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "custom_left_eye",
|
||||
"name": "regions.left_eye",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "regions.left_eye"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "custom_right_eye",
|
||||
"name": "regions.right_eye",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "regions.right_eye"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "custom_irises",
|
||||
"name": "regions.irises",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "regions.irises"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "model_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "model_name"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "face_landmarks",
|
||||
"name": "face_landmarks",
|
||||
"type": "FACE_LANDMARKS",
|
||||
"links": []
|
||||
},
|
||||
{
|
||||
"localized_name": "bboxes",
|
||||
"name": "bboxes",
|
||||
"type": "BOUNDING_BOX",
|
||||
"links": []
|
||||
},
|
||||
{
|
||||
"label": "mask",
|
||||
"name": "MASK_1",
|
||||
"type": "MASK",
|
||||
"links": []
|
||||
}
|
||||
],
|
||||
"title": "Image Face Detection (Mediapipe)",
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"11",
|
||||
"detector_variant"
|
||||
],
|
||||
[
|
||||
"11",
|
||||
"num_faces"
|
||||
],
|
||||
[
|
||||
"20",
|
||||
"regions.face_oval"
|
||||
],
|
||||
[
|
||||
"20",
|
||||
"regions.lips"
|
||||
],
|
||||
[
|
||||
"20",
|
||||
"regions.left_eye"
|
||||
],
|
||||
[
|
||||
"20",
|
||||
"regions.right_eye"
|
||||
],
|
||||
[
|
||||
"20",
|
||||
"regions.irises"
|
||||
],
|
||||
[
|
||||
"2",
|
||||
"model_name"
|
||||
]
|
||||
],
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.22.0",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": []
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "6062babb-b649-4a71-be9e-20ebce567744",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 2,
|
||||
"lastNodeId": 158,
|
||||
"lastLinkId": 140,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "Image Face Detection (Mediapipe)",
|
||||
"description": "Detects facial landmarks from an image using MediaPipe, outputting landmark data, face bounding boxes, and an optional face-region mask.",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
-710,
|
||||
4300,
|
||||
148.880859375,
|
||||
248
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
140,
|
||||
4480,
|
||||
137.677734375,
|
||||
108
|
||||
]
|
||||
},
|
||||
"inputs": [
|
||||
{
|
||||
"id": "705dc1ae-6dc9-4155-92df-52f816ad451e",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
60
|
||||
],
|
||||
"localized_name": "image",
|
||||
"pos": [
|
||||
-585.119140625,
|
||||
4324
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "d6277190-732c-4604-b7cd-d3a9588bf761",
|
||||
"name": "face_landmarker",
|
||||
"type": "FACE_LANDMARKER",
|
||||
"linkIds": [
|
||||
74
|
||||
],
|
||||
"pos": [
|
||||
-585.119140625,
|
||||
4344
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "ac473a08-6a86-42a7-b460-e70c6c5e1e2b",
|
||||
"name": "detector_variant",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
75
|
||||
],
|
||||
"pos": [
|
||||
-585.119140625,
|
||||
4364
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "1bec2252-ca2d-496e-8a33-33a61d21f897",
|
||||
"name": "num_faces",
|
||||
"type": "INT",
|
||||
"linkIds": [
|
||||
76
|
||||
],
|
||||
"pos": [
|
||||
-585.119140625,
|
||||
4384
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "17994fa2-0ea0-4c9b-a70a-19789c459c80",
|
||||
"name": "regions.face_oval",
|
||||
"type": "BOOLEAN",
|
||||
"linkIds": [
|
||||
77
|
||||
],
|
||||
"label": "custom_face_oval",
|
||||
"pos": [
|
||||
-585.119140625,
|
||||
4404
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "1c6c5893-2aee-4c37-b702-15ef2e20d863",
|
||||
"name": "regions.lips",
|
||||
"type": "BOOLEAN",
|
||||
"linkIds": [
|
||||
78
|
||||
],
|
||||
"label": "custom_lips",
|
||||
"pos": [
|
||||
-585.119140625,
|
||||
4424
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "f353fcea-4b6f-42a1-8fdd-32b3aa1e1f09",
|
||||
"name": "regions.left_eye",
|
||||
"type": "BOOLEAN",
|
||||
"linkIds": [
|
||||
79
|
||||
],
|
||||
"label": "custom_left_eye",
|
||||
"pos": [
|
||||
-585.119140625,
|
||||
4444
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "1387e121-c1fb-4522-8f0d-43459e11dd86",
|
||||
"name": "regions.right_eye",
|
||||
"type": "BOOLEAN",
|
||||
"linkIds": [
|
||||
80
|
||||
],
|
||||
"label": "custom_right_eye",
|
||||
"pos": [
|
||||
-585.119140625,
|
||||
4464
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "14acb0a0-d1f4-48f3-ba31-811b26236ef9",
|
||||
"name": "regions.irises",
|
||||
"type": "BOOLEAN",
|
||||
"linkIds": [
|
||||
81
|
||||
],
|
||||
"label": "custom_irises",
|
||||
"pos": [
|
||||
-585.119140625,
|
||||
4484
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "25a82859-87de-42c8-8431-09948665546e",
|
||||
"name": "model_name",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
86
|
||||
],
|
||||
"pos": [
|
||||
-585.119140625,
|
||||
4504
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"id": "d2ba3f92-e8b1-49c3-9590-cfad56c54cf4",
|
||||
"name": "face_landmarks",
|
||||
"type": "FACE_LANDMARKS",
|
||||
"linkIds": [
|
||||
44
|
||||
],
|
||||
"localized_name": "face_landmarks",
|
||||
"pos": [
|
||||
164,
|
||||
4504
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "4f356bb0-d4c4-4f93-b4cf-0845a65c4e6d",
|
||||
"name": "bboxes",
|
||||
"type": "BOUNDING_BOX",
|
||||
"linkIds": [
|
||||
25
|
||||
],
|
||||
"localized_name": "bboxes",
|
||||
"pos": [
|
||||
164,
|
||||
4524
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "f6309e1d-6397-4363-b38f-778a122abc51",
|
||||
"name": "MASK_1",
|
||||
"type": "MASK",
|
||||
"linkIds": [
|
||||
83
|
||||
],
|
||||
"label": "mask",
|
||||
"pos": [
|
||||
164,
|
||||
4544
|
||||
]
|
||||
}
|
||||
],
|
||||
"widgets": [],
|
||||
"nodes": [
|
||||
{
|
||||
"id": 11,
|
||||
"type": "MediaPipeFaceLandmarker",
|
||||
"pos": [
|
||||
-280,
|
||||
4280
|
||||
],
|
||||
"size": [
|
||||
350,
|
||||
220
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "face_detection_model",
|
||||
"name": "face_detection_model",
|
||||
"type": "FACE_DETECTION_MODEL",
|
||||
"link": 66
|
||||
},
|
||||
{
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 60
|
||||
},
|
||||
{
|
||||
"localized_name": "detector_variant",
|
||||
"name": "detector_variant",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "detector_variant"
|
||||
},
|
||||
"link": 75
|
||||
},
|
||||
{
|
||||
"localized_name": "num_faces",
|
||||
"name": "num_faces",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "num_faces"
|
||||
},
|
||||
"link": 76
|
||||
},
|
||||
{
|
||||
"localized_name": "min_confidence",
|
||||
"name": "min_confidence",
|
||||
"type": "FLOAT",
|
||||
"widget": {
|
||||
"name": "min_confidence"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "missing_frame_fallback",
|
||||
"name": "missing_frame_fallback",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "missing_frame_fallback"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "face_landmarker",
|
||||
"type": "FACE_LANDMARKER",
|
||||
"link": 74
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "face_landmarks",
|
||||
"name": "face_landmarks",
|
||||
"type": "FACE_LANDMARKS",
|
||||
"links": [
|
||||
44,
|
||||
46
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "bboxes",
|
||||
"name": "bboxes",
|
||||
"type": "BOUNDING_BOX",
|
||||
"links": [
|
||||
25
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "MediaPipeFaceLandmarker",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.22.0",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": [
|
||||
"full",
|
||||
0,
|
||||
0.5,
|
||||
"empty"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "LoadMediaPipeFaceLandmarker",
|
||||
"pos": [
|
||||
-290,
|
||||
4060
|
||||
],
|
||||
"size": [
|
||||
350,
|
||||
140
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "model_name",
|
||||
"name": "model_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "model_name"
|
||||
},
|
||||
"link": 86
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "FACE_DETECTION_MODEL",
|
||||
"name": "FACE_DETECTION_MODEL",
|
||||
"type": "FACE_DETECTION_MODEL",
|
||||
"links": [
|
||||
66
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "LoadMediaPipeFaceLandmarker",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.22.0",
|
||||
"models": [
|
||||
{
|
||||
"name": "mediapipe_face_fp32.safetensors",
|
||||
"url": "https://huggingface.co/Comfy-Org/mediapipe/resolve/main/detection/mediapipe_face_fp32.safetensors",
|
||||
"directory": "detection"
|
||||
}
|
||||
],
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": [
|
||||
"mediapipe_face_fp32.safetensors"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 20,
|
||||
"type": "MediaPipeFaceMask",
|
||||
"pos": [
|
||||
-290,
|
||||
4560
|
||||
],
|
||||
"size": [
|
||||
360,
|
||||
180
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "face_landmarks",
|
||||
"name": "face_landmarks",
|
||||
"type": "FACE_LANDMARKS",
|
||||
"link": 46
|
||||
},
|
||||
{
|
||||
"localized_name": "regions",
|
||||
"name": "regions",
|
||||
"type": "COMFY_DYNAMICCOMBO_V3",
|
||||
"widget": {
|
||||
"name": "regions"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "regions.face_oval",
|
||||
"name": "regions.face_oval",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "regions.face_oval"
|
||||
},
|
||||
"link": 77
|
||||
},
|
||||
{
|
||||
"localized_name": "regions.lips",
|
||||
"name": "regions.lips",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "regions.lips"
|
||||
},
|
||||
"link": 78
|
||||
},
|
||||
{
|
||||
"localized_name": "regions.left_eye",
|
||||
"name": "regions.left_eye",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "regions.left_eye"
|
||||
},
|
||||
"link": 79
|
||||
},
|
||||
{
|
||||
"localized_name": "regions.right_eye",
|
||||
"name": "regions.right_eye",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "regions.right_eye"
|
||||
},
|
||||
"link": 80
|
||||
},
|
||||
{
|
||||
"localized_name": "regions.irises",
|
||||
"name": "regions.irises",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "regions.irises"
|
||||
},
|
||||
"link": 81
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "MASK",
|
||||
"name": "MASK",
|
||||
"type": "MASK",
|
||||
"links": [
|
||||
83
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "MediaPipeFaceMask",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.22.0",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": [
|
||||
"custom",
|
||||
true,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false
|
||||
]
|
||||
}
|
||||
],
|
||||
"groups": [],
|
||||
"links": [
|
||||
{
|
||||
"id": 66,
|
||||
"origin_id": 2,
|
||||
"origin_slot": 0,
|
||||
"target_id": 11,
|
||||
"target_slot": 0,
|
||||
"type": "FACE_DETECTION_MODEL"
|
||||
},
|
||||
{
|
||||
"id": 46,
|
||||
"origin_id": 11,
|
||||
"origin_slot": 0,
|
||||
"target_id": 20,
|
||||
"target_slot": 0,
|
||||
"type": "FACE_LANDMARKS"
|
||||
},
|
||||
{
|
||||
"id": 60,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 11,
|
||||
"target_slot": 1,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 44,
|
||||
"origin_id": 11,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 0,
|
||||
"type": "FACE_LANDMARKS"
|
||||
},
|
||||
{
|
||||
"id": 25,
|
||||
"origin_id": 11,
|
||||
"origin_slot": 1,
|
||||
"target_id": -20,
|
||||
"target_slot": 1,
|
||||
"type": "BOUNDING_BOX"
|
||||
},
|
||||
{
|
||||
"id": 74,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 1,
|
||||
"target_id": 11,
|
||||
"target_slot": 6,
|
||||
"type": "FACE_LANDMARKER"
|
||||
},
|
||||
{
|
||||
"id": 75,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 2,
|
||||
"target_id": 11,
|
||||
"target_slot": 2,
|
||||
"type": "COMBO"
|
||||
},
|
||||
{
|
||||
"id": 76,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 3,
|
||||
"target_id": 11,
|
||||
"target_slot": 3,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 77,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 4,
|
||||
"target_id": 20,
|
||||
"target_slot": 2,
|
||||
"type": "BOOLEAN"
|
||||
},
|
||||
{
|
||||
"id": 78,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 5,
|
||||
"target_id": 20,
|
||||
"target_slot": 3,
|
||||
"type": "BOOLEAN"
|
||||
},
|
||||
{
|
||||
"id": 79,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 6,
|
||||
"target_id": 20,
|
||||
"target_slot": 4,
|
||||
"type": "BOOLEAN"
|
||||
},
|
||||
{
|
||||
"id": 80,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 7,
|
||||
"target_id": 20,
|
||||
"target_slot": 5,
|
||||
"type": "BOOLEAN"
|
||||
},
|
||||
{
|
||||
"id": 81,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 8,
|
||||
"target_id": 20,
|
||||
"target_slot": 6,
|
||||
"type": "BOOLEAN"
|
||||
},
|
||||
{
|
||||
"id": 83,
|
||||
"origin_id": 20,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 2,
|
||||
"type": "MASK"
|
||||
},
|
||||
{
|
||||
"id": 86,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 9,
|
||||
"target_id": 2,
|
||||
"target_slot": 0,
|
||||
"type": "COMBO"
|
||||
}
|
||||
],
|
||||
"extra": {},
|
||||
"category": "Conditioning & Preprocessors/Face Detection"
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {}
|
||||
}
|
||||
@ -703,7 +703,7 @@
|
||||
}
|
||||
],
|
||||
"extra": {},
|
||||
"category": "Image Tools/Image Segmentation",
|
||||
"category": "Conditioning & Preprocessors/Segmentation & Mask",
|
||||
"description": "Segments images into masks using Meta SAM3 from text prompts, points, or boxes."
|
||||
}
|
||||
]
|
||||
|
||||
@ -1302,7 +1302,7 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image generation and editing/Enhance",
|
||||
"category": "Image generation and editing/Upscale",
|
||||
"description": "Upscales images to higher resolution using Z-Image-Turbo."
|
||||
}
|
||||
]
|
||||
@ -1312,4 +1312,4 @@
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"version": 0.4
|
||||
}
|
||||
}
|
||||
1206
blueprints/Image to Pose Map (SDPose Multi-Person).json
Normal file
1206
blueprints/Image to Pose Map (SDPose Multi-Person).json
Normal file
File diff suppressed because it is too large
Load Diff
888
blueprints/Image to Pose Map (SDPose-OOD).json
Normal file
888
blueprints/Image to Pose Map (SDPose-OOD).json
Normal file
@ -0,0 +1,888 @@
|
||||
{
|
||||
"revision": 0,
|
||||
"last_node_id": 675,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 675,
|
||||
"type": "01b6a731-fb78-4070-9a38-c87146da9604",
|
||||
"pos": [
|
||||
-2480,
|
||||
3400
|
||||
],
|
||||
"size": [
|
||||
360,
|
||||
433.3125
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "input",
|
||||
"name": "input",
|
||||
"type": "IMAGE,MASK",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "resize_target_longer_size",
|
||||
"name": "resize_type.longer_size",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "resize_type.longer_size"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "scale_method",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "scale_method"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "draw_body",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "draw_body"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "draw_hands",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "draw_hands"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "draw_face",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "draw_face"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "draw_feet",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "draw_feet"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "stick_width",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "stick_width"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "face_point_size",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "face_point_size"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "score_threshold",
|
||||
"type": "FLOAT",
|
||||
"widget": {
|
||||
"name": "score_threshold"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "ckpt_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "ckpt_name"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "bboxes",
|
||||
"shape": 7,
|
||||
"type": "BOUNDING_BOX",
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "IMAGE",
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"links": []
|
||||
},
|
||||
{
|
||||
"name": "keypoints",
|
||||
"type": "POSE_KEYPOINT",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"674",
|
||||
"resize_type.longer_size"
|
||||
],
|
||||
[
|
||||
"674",
|
||||
"scale_method"
|
||||
],
|
||||
[
|
||||
"672",
|
||||
"draw_body"
|
||||
],
|
||||
[
|
||||
"672",
|
||||
"draw_hands"
|
||||
],
|
||||
[
|
||||
"672",
|
||||
"draw_face"
|
||||
],
|
||||
[
|
||||
"672",
|
||||
"draw_feet"
|
||||
],
|
||||
[
|
||||
"672",
|
||||
"stick_width"
|
||||
],
|
||||
[
|
||||
"672",
|
||||
"face_point_size"
|
||||
],
|
||||
[
|
||||
"672",
|
||||
"score_threshold"
|
||||
],
|
||||
[
|
||||
"673",
|
||||
"ckpt_name"
|
||||
]
|
||||
],
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.15.1",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
}
|
||||
},
|
||||
"widgets_values": [],
|
||||
"title": "Image to Pose Map (SDPose-OOD)"
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "01b6a731-fb78-4070-9a38-c87146da9604",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 0,
|
||||
"lastNodeId": 676,
|
||||
"lastLinkId": 1715,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "Image to Pose Map (SDPose-OOD)",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
-3290,
|
||||
3590,
|
||||
190.8984375,
|
||||
288
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
-1756.2451602089645,
|
||||
3366,
|
||||
128,
|
||||
88
|
||||
]
|
||||
},
|
||||
"inputs": [
|
||||
{
|
||||
"id": "e24699c3-1356-4634-9eb4-19bb58e5c0b0",
|
||||
"name": "input",
|
||||
"type": "IMAGE,MASK",
|
||||
"linkIds": [
|
||||
1700
|
||||
],
|
||||
"localized_name": "input",
|
||||
"pos": [
|
||||
-3123.1015625,
|
||||
3614
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "088eefc1-cd8a-4573-993f-9e4da008a12d",
|
||||
"name": "resize_type.longer_size",
|
||||
"type": "INT",
|
||||
"linkIds": [
|
||||
1704
|
||||
],
|
||||
"label": "resize_target_longer_size",
|
||||
"pos": [
|
||||
-3123.1015625,
|
||||
3634
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "b6449bd3-73d4-41c8-b81f-cf8d33f76a2e",
|
||||
"name": "scale_method",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
1705
|
||||
],
|
||||
"pos": [
|
||||
-3123.1015625,
|
||||
3654
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "4cff52ad-ed07-4c97-8803-fcbd89554fd0",
|
||||
"name": "draw_body",
|
||||
"type": "BOOLEAN",
|
||||
"linkIds": [
|
||||
1706
|
||||
],
|
||||
"pos": [
|
||||
-3123.1015625,
|
||||
3674
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "7af63dce-f7df-4d7e-8215-d7c7f60bf81c",
|
||||
"name": "draw_hands",
|
||||
"type": "BOOLEAN",
|
||||
"linkIds": [
|
||||
1707
|
||||
],
|
||||
"pos": [
|
||||
-3123.1015625,
|
||||
3694
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "af3a9bce-61f9-4aca-b530-9f65e028b35e",
|
||||
"name": "draw_face",
|
||||
"type": "BOOLEAN",
|
||||
"linkIds": [
|
||||
1708
|
||||
],
|
||||
"pos": [
|
||||
-3123.1015625,
|
||||
3714
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "4620f6a3-2c85-4b79-ad8f-35d0326b568f",
|
||||
"name": "draw_feet",
|
||||
"type": "BOOLEAN",
|
||||
"linkIds": [
|
||||
1709
|
||||
],
|
||||
"pos": [
|
||||
-3123.1015625,
|
||||
3734
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "fee5d0c9-8d4b-4934-81d8-ba2206dc56cb",
|
||||
"name": "stick_width",
|
||||
"type": "INT",
|
||||
"linkIds": [
|
||||
1710
|
||||
],
|
||||
"pos": [
|
||||
-3123.1015625,
|
||||
3754
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "aafdd060-ba81-4324-a9cc-b656e1ebc133",
|
||||
"name": "face_point_size",
|
||||
"type": "INT",
|
||||
"linkIds": [
|
||||
1711
|
||||
],
|
||||
"pos": [
|
||||
-3123.1015625,
|
||||
3774
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "514c5503-f9e6-4d23-b1ae-1d3291acb2a3",
|
||||
"name": "score_threshold",
|
||||
"type": "FLOAT",
|
||||
"linkIds": [
|
||||
1712
|
||||
],
|
||||
"pos": [
|
||||
-3123.1015625,
|
||||
3794
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "ae46de61-2cc6-483e-8ee9-87e4144a2ffa",
|
||||
"name": "ckpt_name",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
1713
|
||||
],
|
||||
"pos": [
|
||||
-3123.1015625,
|
||||
3814
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "41bec0c6-dffa-4c78-9289-ee678715ae54",
|
||||
"name": "bboxes",
|
||||
"type": "BOUNDING_BOX",
|
||||
"linkIds": [
|
||||
1714
|
||||
],
|
||||
"pos": [
|
||||
-3123.1015625,
|
||||
3834
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"id": "f05ed8cc-9403-4f14-8085-4364b06f8a48",
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
1701
|
||||
],
|
||||
"localized_name": "IMAGE",
|
||||
"pos": [
|
||||
-1732.2451602089645,
|
||||
3390
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "29a6584e-4685-4986-8ffd-e6d8539953fd",
|
||||
"name": "keypoints",
|
||||
"type": "POSE_KEYPOINT",
|
||||
"linkIds": [
|
||||
1715
|
||||
],
|
||||
"pos": [
|
||||
-1732.2451602089645,
|
||||
3410
|
||||
]
|
||||
}
|
||||
],
|
||||
"widgets": [],
|
||||
"nodes": [
|
||||
{
|
||||
"id": 671,
|
||||
"type": "SDPoseKeypointExtractor",
|
||||
"pos": [
|
||||
-2470,
|
||||
3250
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
180
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "model",
|
||||
"name": "model",
|
||||
"type": "MODEL",
|
||||
"link": 1696
|
||||
},
|
||||
{
|
||||
"localized_name": "vae",
|
||||
"name": "vae",
|
||||
"type": "VAE",
|
||||
"link": 1697
|
||||
},
|
||||
{
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 1698
|
||||
},
|
||||
{
|
||||
"localized_name": "bboxes",
|
||||
"name": "bboxes",
|
||||
"shape": 7,
|
||||
"type": "BOUNDING_BOX",
|
||||
"link": 1714
|
||||
},
|
||||
{
|
||||
"localized_name": "batch_size",
|
||||
"name": "batch_size",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "batch_size"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "keypoints",
|
||||
"name": "keypoints",
|
||||
"type": "POSE_KEYPOINT",
|
||||
"links": [
|
||||
1699,
|
||||
1715
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "SDPoseKeypointExtractor",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.15.0",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
}
|
||||
},
|
||||
"widgets_values": [
|
||||
16
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 674,
|
||||
"type": "ResizeImageMaskNode",
|
||||
"pos": [
|
||||
-2960,
|
||||
3490
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
110
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "input",
|
||||
"name": "input",
|
||||
"type": "IMAGE,MASK",
|
||||
"link": 1700
|
||||
},
|
||||
{
|
||||
"localized_name": "resize_type",
|
||||
"name": "resize_type",
|
||||
"type": "COMFY_DYNAMICCOMBO_V3",
|
||||
"widget": {
|
||||
"name": "resize_type"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "resize_type.longer_size",
|
||||
"name": "resize_type.longer_size",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "resize_type.longer_size"
|
||||
},
|
||||
"link": 1704
|
||||
},
|
||||
{
|
||||
"localized_name": "scale_method",
|
||||
"name": "scale_method",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "scale_method"
|
||||
},
|
||||
"link": 1705
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "resized",
|
||||
"name": "resized",
|
||||
"type": "*",
|
||||
"links": [
|
||||
1698
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "ResizeImageMaskNode",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.15.0",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
}
|
||||
},
|
||||
"widgets_values": [
|
||||
"scale longer dimension",
|
||||
1024,
|
||||
"area"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 672,
|
||||
"type": "SDPoseDrawKeypoints",
|
||||
"pos": [
|
||||
-2120,
|
||||
3260
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
280
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "keypoints",
|
||||
"name": "keypoints",
|
||||
"type": "POSE_KEYPOINT",
|
||||
"link": 1699
|
||||
},
|
||||
{
|
||||
"localized_name": "draw_body",
|
||||
"name": "draw_body",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "draw_body"
|
||||
},
|
||||
"link": 1706
|
||||
},
|
||||
{
|
||||
"localized_name": "draw_hands",
|
||||
"name": "draw_hands",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "draw_hands"
|
||||
},
|
||||
"link": 1707
|
||||
},
|
||||
{
|
||||
"localized_name": "draw_face",
|
||||
"name": "draw_face",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "draw_face"
|
||||
},
|
||||
"link": 1708
|
||||
},
|
||||
{
|
||||
"localized_name": "draw_feet",
|
||||
"name": "draw_feet",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "draw_feet"
|
||||
},
|
||||
"link": 1709
|
||||
},
|
||||
{
|
||||
"localized_name": "stick_width",
|
||||
"name": "stick_width",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "stick_width"
|
||||
},
|
||||
"link": 1710
|
||||
},
|
||||
{
|
||||
"localized_name": "face_point_size",
|
||||
"name": "face_point_size",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "face_point_size"
|
||||
},
|
||||
"link": 1711
|
||||
},
|
||||
{
|
||||
"localized_name": "score_threshold",
|
||||
"name": "score_threshold",
|
||||
"type": "FLOAT",
|
||||
"widget": {
|
||||
"name": "score_threshold"
|
||||
},
|
||||
"link": 1712
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "IMAGE",
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
1701
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "SDPoseDrawKeypoints",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.15.0",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
}
|
||||
},
|
||||
"widgets_values": [
|
||||
true,
|
||||
true,
|
||||
true,
|
||||
true,
|
||||
4,
|
||||
2,
|
||||
0.5
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 673,
|
||||
"type": "CheckpointLoaderSimple",
|
||||
"pos": [
|
||||
-2960,
|
||||
3250
|
||||
],
|
||||
"size": [
|
||||
390,
|
||||
190
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "ckpt_name",
|
||||
"name": "ckpt_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "ckpt_name"
|
||||
},
|
||||
"link": 1713
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "MODEL",
|
||||
"name": "MODEL",
|
||||
"type": "MODEL",
|
||||
"links": [
|
||||
1696
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "CLIP",
|
||||
"name": "CLIP",
|
||||
"type": "CLIP",
|
||||
"links": []
|
||||
},
|
||||
{
|
||||
"localized_name": "VAE",
|
||||
"name": "VAE",
|
||||
"type": "VAE",
|
||||
"links": [
|
||||
1697
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CheckpointLoaderSimple",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.15.0",
|
||||
"models": [
|
||||
{
|
||||
"name": "sdpose_wholebody_fp16.safetensors",
|
||||
"url": "https://huggingface.co/Comfy-Org/SDPose/resolve/main/checkpoints/sdpose_wholebody_fp16.safetensors",
|
||||
"directory": "checkpoints"
|
||||
}
|
||||
],
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
}
|
||||
},
|
||||
"widgets_values": [
|
||||
"sdpose_wholebody_fp16.safetensors"
|
||||
]
|
||||
}
|
||||
],
|
||||
"groups": [],
|
||||
"links": [
|
||||
{
|
||||
"id": 1696,
|
||||
"origin_id": 673,
|
||||
"origin_slot": 0,
|
||||
"target_id": 671,
|
||||
"target_slot": 0,
|
||||
"type": "MODEL"
|
||||
},
|
||||
{
|
||||
"id": 1697,
|
||||
"origin_id": 673,
|
||||
"origin_slot": 2,
|
||||
"target_id": 671,
|
||||
"target_slot": 1,
|
||||
"type": "VAE"
|
||||
},
|
||||
{
|
||||
"id": 1698,
|
||||
"origin_id": 674,
|
||||
"origin_slot": 0,
|
||||
"target_id": 671,
|
||||
"target_slot": 2,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 1699,
|
||||
"origin_id": 671,
|
||||
"origin_slot": 0,
|
||||
"target_id": 672,
|
||||
"target_slot": 0,
|
||||
"type": "POSE_KEYPOINT"
|
||||
},
|
||||
{
|
||||
"id": 1700,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 674,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE,MASK"
|
||||
},
|
||||
{
|
||||
"id": 1701,
|
||||
"origin_id": 672,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 1704,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 1,
|
||||
"target_id": 674,
|
||||
"target_slot": 2,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 1705,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 2,
|
||||
"target_id": 674,
|
||||
"target_slot": 3,
|
||||
"type": "COMBO"
|
||||
},
|
||||
{
|
||||
"id": 1706,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 3,
|
||||
"target_id": 672,
|
||||
"target_slot": 1,
|
||||
"type": "BOOLEAN"
|
||||
},
|
||||
{
|
||||
"id": 1707,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 4,
|
||||
"target_id": 672,
|
||||
"target_slot": 2,
|
||||
"type": "BOOLEAN"
|
||||
},
|
||||
{
|
||||
"id": 1708,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 5,
|
||||
"target_id": 672,
|
||||
"target_slot": 3,
|
||||
"type": "BOOLEAN"
|
||||
},
|
||||
{
|
||||
"id": 1709,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 6,
|
||||
"target_id": 672,
|
||||
"target_slot": 4,
|
||||
"type": "BOOLEAN"
|
||||
},
|
||||
{
|
||||
"id": 1710,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 7,
|
||||
"target_id": 672,
|
||||
"target_slot": 5,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 1711,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 8,
|
||||
"target_id": 672,
|
||||
"target_slot": 6,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 1712,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 9,
|
||||
"target_id": 672,
|
||||
"target_slot": 7,
|
||||
"type": "FLOAT"
|
||||
},
|
||||
{
|
||||
"id": 1713,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 10,
|
||||
"target_id": 673,
|
||||
"target_slot": 0,
|
||||
"type": "COMBO"
|
||||
},
|
||||
{
|
||||
"id": 1714,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 11,
|
||||
"target_id": 671,
|
||||
"target_slot": 3,
|
||||
"type": "BOUNDING_BOX"
|
||||
},
|
||||
{
|
||||
"id": 1715,
|
||||
"origin_id": 671,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 1,
|
||||
"type": "POSE_KEYPOINT"
|
||||
}
|
||||
],
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Conditioning & Preprocessors/Pose",
|
||||
"description": "Extracts human pose keypoints and stick-figure visuals from an image using SDPose-OOD, with optional bounding-box input per subject."
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {
|
||||
"ue_links": []
|
||||
}
|
||||
}
|
||||
1219
blueprints/Merge Videos.json
Normal file
1219
blueprints/Merge Videos.json
Normal file
File diff suppressed because it is too large
Load Diff
@ -1298,7 +1298,7 @@
|
||||
"VHS_MetadataImage": true,
|
||||
"VHS_KeepIntermediate": true
|
||||
},
|
||||
"category": "Image generation and editing/Pose to image",
|
||||
"category": "Image generation and editing/Conditioned",
|
||||
"description": "Generates an image from pose keypoints using Z-Image-Turbo with text conditioning."
|
||||
}
|
||||
]
|
||||
|
||||
@ -3870,7 +3870,7 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Video generation and editing/Pose to video",
|
||||
"category": "Video generation and editing/Conditioned",
|
||||
"description": "Generates video from pose reference frames using LTX-2, with optional synchronized audio."
|
||||
}
|
||||
]
|
||||
|
||||
@ -270,7 +270,7 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Text generation/Prompt enhance",
|
||||
"category": "Text Tools",
|
||||
"description": "Expands short text prompts into detailed descriptions using a text generation model for better generation quality."
|
||||
}
|
||||
]
|
||||
|
||||
@ -389,7 +389,7 @@
|
||||
}
|
||||
],
|
||||
"extra": {},
|
||||
"category": "Image generation and editing/Background Removal"
|
||||
"category": "Image Tools/Background Removal"
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
485
blueprints/Select Per-Line Text by Index.json
Normal file
485
blueprints/Select Per-Line Text by Index.json
Normal file
@ -0,0 +1,485 @@
|
||||
{
|
||||
"revision": 0,
|
||||
"last_node_id": 10,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 10,
|
||||
"type": "3fb7557a-470d-4983-9d8c-6d5caa9788f0",
|
||||
"pos": [
|
||||
-250,
|
||||
8590
|
||||
],
|
||||
"size": [
|
||||
280,
|
||||
360
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "text_per_line",
|
||||
"name": "text_per_line",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "text_per_line"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "index",
|
||||
"name": "index",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "index"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "selected_line",
|
||||
"name": "selected_line",
|
||||
"type": "STRING",
|
||||
"links": []
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"2",
|
||||
"string"
|
||||
],
|
||||
[
|
||||
"3",
|
||||
"value"
|
||||
]
|
||||
],
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.0",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"input_ue_unconnectable": {}
|
||||
}
|
||||
},
|
||||
"widgets_values": [],
|
||||
"title": "Select Per-Line Text by Index"
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "3fb7557a-470d-4983-9d8c-6d5caa9788f0",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 0,
|
||||
"lastNodeId": 10,
|
||||
"lastLinkId": 14,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "Select Per-Line Text by Index",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
-990,
|
||||
8595,
|
||||
128,
|
||||
88
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
710,
|
||||
8585,
|
||||
128,
|
||||
68
|
||||
]
|
||||
},
|
||||
"inputs": [
|
||||
{
|
||||
"id": "75417d82-a934-4ac9-b667-d8dcd5a3bfb3",
|
||||
"name": "text_per_line",
|
||||
"type": "STRING",
|
||||
"linkIds": [
|
||||
13
|
||||
],
|
||||
"localized_name": "text_per_line",
|
||||
"pos": [
|
||||
-886,
|
||||
8619
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "46e69a73-1804-4ca6-9175-31445bf0be96",
|
||||
"name": "index",
|
||||
"type": "INT",
|
||||
"linkIds": [
|
||||
14
|
||||
],
|
||||
"localized_name": "index",
|
||||
"pos": [
|
||||
-886,
|
||||
8639
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"id": "e34e8ad1-84d2-4bd2-a460-eb7de6067c10",
|
||||
"name": "selected_line",
|
||||
"type": "STRING",
|
||||
"linkIds": [
|
||||
10
|
||||
],
|
||||
"localized_name": "selected_line",
|
||||
"pos": [
|
||||
734,
|
||||
8609
|
||||
]
|
||||
}
|
||||
],
|
||||
"widgets": [],
|
||||
"nodes": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "PreviewAny",
|
||||
"pos": [
|
||||
-500,
|
||||
8400
|
||||
],
|
||||
"size": [
|
||||
230,
|
||||
180
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "source",
|
||||
"name": "source",
|
||||
"type": "*",
|
||||
"link": 1
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "STRING",
|
||||
"name": "STRING",
|
||||
"type": "STRING",
|
||||
"links": [
|
||||
6
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "PreviewAny",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.0",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"input_ue_unconnectable": {}
|
||||
}
|
||||
},
|
||||
"widgets_values": [
|
||||
null,
|
||||
null,
|
||||
null
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "RegexExtract",
|
||||
"pos": [
|
||||
-240,
|
||||
8740
|
||||
],
|
||||
"size": [
|
||||
470,
|
||||
460
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"showAdvanced": false,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "string",
|
||||
"name": "string",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "string"
|
||||
},
|
||||
"link": 13
|
||||
},
|
||||
{
|
||||
"localized_name": "regex_pattern",
|
||||
"name": "regex_pattern",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "regex_pattern"
|
||||
},
|
||||
"link": 9
|
||||
},
|
||||
{
|
||||
"localized_name": "mode",
|
||||
"name": "mode",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "mode"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "case_insensitive",
|
||||
"name": "case_insensitive",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "case_insensitive"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "multiline",
|
||||
"name": "multiline",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "multiline"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "dotall",
|
||||
"name": "dotall",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "dotall"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "group_index",
|
||||
"name": "group_index",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "group_index"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "STRING",
|
||||
"name": "STRING",
|
||||
"type": "STRING",
|
||||
"links": [
|
||||
10
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "RegexExtract",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.0",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"input_ue_unconnectable": {}
|
||||
}
|
||||
},
|
||||
"widgets_values": [
|
||||
"",
|
||||
"",
|
||||
"First Group",
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
1
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "PrimitiveInt",
|
||||
"pos": [
|
||||
-810,
|
||||
8400
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
110
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "value",
|
||||
"name": "value",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "value"
|
||||
},
|
||||
"link": 14
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "INT",
|
||||
"name": "INT",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
1
|
||||
]
|
||||
}
|
||||
],
|
||||
"title": "Int (line index)",
|
||||
"properties": {
|
||||
"Node name for S&R": "Int (line index)",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.0",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"input_ue_unconnectable": {}
|
||||
}
|
||||
},
|
||||
"widgets_values": [
|
||||
0,
|
||||
"fixed"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"type": "StringReplace",
|
||||
"pos": [
|
||||
-240,
|
||||
8400
|
||||
],
|
||||
"size": [
|
||||
400,
|
||||
280
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "string",
|
||||
"name": "string",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "string"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "find",
|
||||
"name": "find",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "find"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "replace",
|
||||
"name": "replace",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "replace"
|
||||
},
|
||||
"link": 6
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "STRING",
|
||||
"name": "STRING",
|
||||
"type": "STRING",
|
||||
"links": [
|
||||
9
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "StringReplace",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.0",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"input_ue_unconnectable": {}
|
||||
}
|
||||
},
|
||||
"widgets_values": [
|
||||
"^(?:[^\\n]*\\n){index}([^\\n]*)(?:\\n|$)",
|
||||
"index",
|
||||
""
|
||||
]
|
||||
}
|
||||
],
|
||||
"groups": [],
|
||||
"links": [
|
||||
{
|
||||
"id": 1,
|
||||
"origin_id": 3,
|
||||
"origin_slot": 0,
|
||||
"target_id": 1,
|
||||
"target_slot": 0,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"origin_id": 8,
|
||||
"origin_slot": 0,
|
||||
"target_id": 2,
|
||||
"target_slot": 1,
|
||||
"type": "STRING"
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"origin_id": 1,
|
||||
"origin_slot": 0,
|
||||
"target_id": 8,
|
||||
"target_slot": 2,
|
||||
"type": "STRING"
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"origin_id": 2,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 0,
|
||||
"type": "STRING"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 2,
|
||||
"target_slot": 0,
|
||||
"type": "STRING"
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 1,
|
||||
"target_id": 3,
|
||||
"target_slot": 0,
|
||||
"type": "INT"
|
||||
}
|
||||
],
|
||||
"extra": {},
|
||||
"category": "Text Tools",
|
||||
"description": "Selects one line from multiline text by zero-based index for batch or list-driven prompt workflows."
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {
|
||||
"ue_links": [],
|
||||
"links_added_by_ue": []
|
||||
}
|
||||
}
|
||||
714
blueprints/Split Image Grid to Tiles.json
Normal file
714
blueprints/Split Image Grid to Tiles.json
Normal file
@ -0,0 +1,714 @@
|
||||
{
|
||||
"revision": 0,
|
||||
"last_node_id": 251,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 251,
|
||||
"type": "609e1fd1-b731-4b78-89ac-d19b1156b025",
|
||||
"pos": [
|
||||
-1490,
|
||||
130
|
||||
],
|
||||
"size": [
|
||||
230,
|
||||
164
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "source_image",
|
||||
"name": "source_image",
|
||||
"type": "IMAGE",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "columns",
|
||||
"name": "columns",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "columns"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "rows",
|
||||
"name": "rows",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "rows"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "tiles",
|
||||
"name": "tiles",
|
||||
"type": "IMAGE",
|
||||
"links": []
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"228",
|
||||
"value"
|
||||
],
|
||||
[
|
||||
"252",
|
||||
"value"
|
||||
]
|
||||
],
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.20.1",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": [],
|
||||
"title": "Split Image Grid to Tiles"
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "609e1fd1-b731-4b78-89ac-d19b1156b025",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 9,
|
||||
"lastNodeId": 252,
|
||||
"lastLinkId": 429,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "Split Image Grid to Tiles",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
-1690,
|
||||
260,
|
||||
128,
|
||||
108
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
-510,
|
||||
590,
|
||||
128,
|
||||
68
|
||||
]
|
||||
},
|
||||
"inputs": [
|
||||
{
|
||||
"id": "866ac798-cfbc-450a-b755-e704f86404d9",
|
||||
"name": "source_image",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
386,
|
||||
389
|
||||
],
|
||||
"localized_name": "source_image",
|
||||
"pos": [
|
||||
-1586,
|
||||
284
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "bc37b1f8-8ab2-4f19-bd00-75d4fbc4feb3",
|
||||
"name": "columns",
|
||||
"type": "INT",
|
||||
"linkIds": [
|
||||
427
|
||||
],
|
||||
"localized_name": "columns",
|
||||
"pos": [
|
||||
-1586,
|
||||
304
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "d45915da-e848-43dd-9ccc-e3161e9c99d9",
|
||||
"name": "rows",
|
||||
"type": "INT",
|
||||
"linkIds": [
|
||||
428
|
||||
],
|
||||
"localized_name": "rows",
|
||||
"pos": [
|
||||
-1586,
|
||||
324
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"id": "18bc780f-064b-4038-87c6-67dba71deb08",
|
||||
"name": "tiles",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
394
|
||||
],
|
||||
"localized_name": "tiles",
|
||||
"shape": 6,
|
||||
"pos": [
|
||||
-486,
|
||||
614
|
||||
]
|
||||
}
|
||||
],
|
||||
"widgets": [],
|
||||
"nodes": [
|
||||
{
|
||||
"id": 225,
|
||||
"type": "SplitImageToTileList",
|
||||
"pos": [
|
||||
-1010,
|
||||
620
|
||||
],
|
||||
"size": [
|
||||
290,
|
||||
170
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 386
|
||||
},
|
||||
{
|
||||
"localized_name": "tile_width",
|
||||
"name": "tile_width",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "tile_width"
|
||||
},
|
||||
"link": 403
|
||||
},
|
||||
{
|
||||
"localized_name": "tile_height",
|
||||
"name": "tile_height",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "tile_height"
|
||||
},
|
||||
"link": 404
|
||||
},
|
||||
{
|
||||
"localized_name": "overlap",
|
||||
"name": "overlap",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "overlap"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "IMAGE",
|
||||
"name": "IMAGE",
|
||||
"shape": 6,
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
394
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "SplitImageToTileList",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.20.1",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": [
|
||||
1024,
|
||||
1024,
|
||||
0
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 231,
|
||||
"type": "ComfyMathExpression",
|
||||
"pos": [
|
||||
-1080,
|
||||
330
|
||||
],
|
||||
"size": [
|
||||
370,
|
||||
190
|
||||
],
|
||||
"flags": {},
|
||||
"order": 4,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "a",
|
||||
"localized_name": "values.a",
|
||||
"name": "values.a",
|
||||
"type": "FLOAT,INT,BOOLEAN",
|
||||
"link": 390
|
||||
},
|
||||
{
|
||||
"label": "b",
|
||||
"localized_name": "values.b",
|
||||
"name": "values.b",
|
||||
"shape": 7,
|
||||
"type": "FLOAT,INT,BOOLEAN",
|
||||
"link": 429
|
||||
},
|
||||
{
|
||||
"label": "c",
|
||||
"localized_name": "values.c",
|
||||
"name": "values.c",
|
||||
"shape": 7,
|
||||
"type": "FLOAT,INT,BOOLEAN",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "expression",
|
||||
"name": "expression",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "expression"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "FLOAT",
|
||||
"name": "FLOAT",
|
||||
"type": "FLOAT",
|
||||
"links": null
|
||||
},
|
||||
{
|
||||
"localized_name": "INT",
|
||||
"name": "INT",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
404
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "BOOL",
|
||||
"name": "BOOL",
|
||||
"type": "BOOLEAN",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"title": "Math Expression (Height)",
|
||||
"properties": {
|
||||
"Node name for S&R": "ComfyMathExpression",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.18.1",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
}
|
||||
},
|
||||
"widgets_values": [
|
||||
"max(1, (int(a) + int(b) - 1) // int(b))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 229,
|
||||
"type": "ComfyMathExpression",
|
||||
"pos": [
|
||||
-1090,
|
||||
-30
|
||||
],
|
||||
"size": [
|
||||
370,
|
||||
190
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "a",
|
||||
"localized_name": "values.a",
|
||||
"name": "values.a",
|
||||
"type": "FLOAT,INT,BOOLEAN",
|
||||
"link": 387
|
||||
},
|
||||
{
|
||||
"label": "b",
|
||||
"localized_name": "values.b",
|
||||
"name": "values.b",
|
||||
"shape": 7,
|
||||
"type": "FLOAT,INT,BOOLEAN",
|
||||
"link": 388
|
||||
},
|
||||
{
|
||||
"label": "c",
|
||||
"localized_name": "values.c",
|
||||
"name": "values.c",
|
||||
"shape": 7,
|
||||
"type": "FLOAT,INT,BOOLEAN",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "expression",
|
||||
"name": "expression",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "expression"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "FLOAT",
|
||||
"name": "FLOAT",
|
||||
"type": "FLOAT",
|
||||
"links": null
|
||||
},
|
||||
{
|
||||
"localized_name": "INT",
|
||||
"name": "INT",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
403
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "BOOL",
|
||||
"name": "BOOL",
|
||||
"type": "BOOLEAN",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"title": "Math Expression (Width)",
|
||||
"properties": {
|
||||
"Node name for S&R": "ComfyMathExpression",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.18.1",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
}
|
||||
},
|
||||
"widgets_values": [
|
||||
"max(1, (int(a) + int(b) - 1) // int(b))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 228,
|
||||
"type": "PrimitiveInt",
|
||||
"pos": [
|
||||
-1380,
|
||||
90
|
||||
],
|
||||
"size": [
|
||||
230,
|
||||
110
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "value",
|
||||
"name": "value",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "value"
|
||||
},
|
||||
"link": 427
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "INT",
|
||||
"name": "INT",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
388
|
||||
]
|
||||
}
|
||||
],
|
||||
"title": "Int (grid columns)",
|
||||
"properties": {
|
||||
"Node name for S&R": "Int (grid columns)",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.18.1",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
}
|
||||
},
|
||||
"widgets_values": [
|
||||
2,
|
||||
"fixed"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 230,
|
||||
"type": "GetImageSize",
|
||||
"pos": [
|
||||
-1380,
|
||||
290
|
||||
],
|
||||
"size": [
|
||||
230,
|
||||
100
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 389
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "width",
|
||||
"name": "width",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
387
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "height",
|
||||
"name": "height",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
390
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "batch_size",
|
||||
"name": "batch_size",
|
||||
"type": "INT",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "GetImageSize",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.18.1",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 252,
|
||||
"type": "PrimitiveInt",
|
||||
"pos": [
|
||||
-1380,
|
||||
470
|
||||
],
|
||||
"size": [
|
||||
230,
|
||||
110
|
||||
],
|
||||
"flags": {},
|
||||
"order": 5,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "value",
|
||||
"name": "value",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "value"
|
||||
},
|
||||
"link": 428
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "INT",
|
||||
"name": "INT",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
429
|
||||
]
|
||||
}
|
||||
],
|
||||
"title": "Int (grid rows)",
|
||||
"properties": {
|
||||
"Node name for S&R": "Int (grid rows)",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.18.1",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
}
|
||||
},
|
||||
"widgets_values": [
|
||||
3,
|
||||
"fixed"
|
||||
]
|
||||
}
|
||||
],
|
||||
"groups": [],
|
||||
"links": [
|
||||
{
|
||||
"id": 403,
|
||||
"origin_id": 229,
|
||||
"origin_slot": 1,
|
||||
"target_id": 225,
|
||||
"target_slot": 1,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 404,
|
||||
"origin_id": 231,
|
||||
"origin_slot": 1,
|
||||
"target_id": 225,
|
||||
"target_slot": 2,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 390,
|
||||
"origin_id": 230,
|
||||
"origin_slot": 1,
|
||||
"target_id": 231,
|
||||
"target_slot": 0,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 387,
|
||||
"origin_id": 230,
|
||||
"origin_slot": 0,
|
||||
"target_id": 229,
|
||||
"target_slot": 0,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 388,
|
||||
"origin_id": 228,
|
||||
"origin_slot": 0,
|
||||
"target_id": 229,
|
||||
"target_slot": 1,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 386,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 225,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 389,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 230,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 394,
|
||||
"origin_id": 225,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 427,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 1,
|
||||
"target_id": 228,
|
||||
"target_slot": 0,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 428,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 2,
|
||||
"target_id": 252,
|
||||
"target_slot": 0,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 429,
|
||||
"origin_id": 252,
|
||||
"origin_slot": 0,
|
||||
"target_id": 231,
|
||||
"target_slot": 1,
|
||||
"type": "INT"
|
||||
}
|
||||
],
|
||||
"extra": {},
|
||||
"category": "Image Tools/Crop",
|
||||
"description": "Splits an image into a configurable columns×rows grid of equal tiles for tiled generation or processing."
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {}
|
||||
}
|
||||
1085
blueprints/Text to Image (Anima).json
Normal file
1085
blueprints/Text to Image (Anima).json
Normal file
File diff suppressed because it is too large
Load Diff
@ -307,9 +307,9 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Text generation/Video Captioning",
|
||||
"category": "Video Tools",
|
||||
"description": "Generates descriptive captions for video input using Google's Gemini multimodal LLM."
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
1226
blueprints/Video Depth Estimation (MoGe).json
Normal file
1226
blueprints/Video Depth Estimation (MoGe).json
Normal file
File diff suppressed because it is too large
Load Diff
1109
blueprints/Video Face Detection (Mediapipe).json
Normal file
1109
blueprints/Video Face Detection (Mediapipe).json
Normal file
File diff suppressed because it is too large
Load Diff
4340
blueprints/Video Inpaint (VOID).json
Normal file
4340
blueprints/Video Inpaint (VOID).json
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
4196
blueprints/Video Inpainting (Wan2.1 VACE).json
Normal file
4196
blueprints/Video Inpainting (Wan2.1 VACE).json
Normal file
File diff suppressed because it is too large
Load Diff
@ -818,7 +818,7 @@
|
||||
}
|
||||
],
|
||||
"extra": {},
|
||||
"category": "Video Tools",
|
||||
"category": "Conditioning & Preprocessors/Segmentation & Mask",
|
||||
"description": "Segments video into temporally consistent masks using Meta SAM3 from text or interactive prompts."
|
||||
}
|
||||
]
|
||||
|
||||
@ -412,7 +412,7 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Video generation and editing/Enhance video",
|
||||
"category": "Video generation and editing/Upscale",
|
||||
"description": "Upscales video to 4× resolution using a GAN-based upscaling model."
|
||||
}
|
||||
]
|
||||
|
||||
1323
blueprints/Video to Pose Map (SDPose Multi-Person).json
Normal file
1323
blueprints/Video to Pose Map (SDPose Multi-Person).json
Normal file
File diff suppressed because it is too large
Load Diff
@ -105,7 +105,7 @@ class WindowAttention(nn.Module):
|
||||
|
||||
relative_position_bias = self.relative_position_bias_table[self.relative_position_index.long().view(-1)].view(
|
||||
self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH
|
||||
relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
|
||||
relative_position_bias = comfy.ops.cast_to_input(relative_position_bias.permute(2, 0, 1).contiguous(), attn) # nH, Wh*Ww, Wh*Ww
|
||||
attn = attn + relative_position_bias.unsqueeze(0)
|
||||
|
||||
if mask is not None:
|
||||
|
||||
@ -55,12 +55,7 @@ class BackgroundRemovalModel():
|
||||
out = torch.nn.functional.interpolate(out, size=(H, W), mode="bicubic", antialias=False)
|
||||
|
||||
mask = out.sigmoid().to(device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype())
|
||||
if mask.ndim == 3:
|
||||
mask = mask.unsqueeze(0)
|
||||
if mask.shape[1] != 1:
|
||||
mask = mask.movedim(-1, 1)
|
||||
|
||||
return mask
|
||||
return mask.squeeze(1) # (B, 1, H, W) -> (B, H, W)
|
||||
|
||||
|
||||
def load_background_removal_model(sd):
|
||||
|
||||
@ -49,7 +49,7 @@ parser.add_argument("--temp-directory", type=str, default=None, help="Set the Co
|
||||
parser.add_argument("--input-directory", type=str, default=None, help="Set the ComfyUI input directory. Overrides --base-directory.")
|
||||
parser.add_argument("--auto-launch", action="store_true", help="Automatically launch ComfyUI in the default browser.")
|
||||
parser.add_argument("--disable-auto-launch", action="store_true", help="Disable auto launching the browser.")
|
||||
parser.add_argument("--cuda-device", type=int, default=None, metavar="DEVICE_ID", help="Set the id of the cuda device this instance will use. All other devices will not be visible.")
|
||||
parser.add_argument("--cuda-device", type=str, default=None, metavar="DEVICE_ID", help="Set the ids of cuda devices this instance will use, as a comma-separated list (e.g. '0' or '0,1'). All other devices will not be visible.")
|
||||
parser.add_argument("--default-device", type=int, default=None, metavar="DEFAULT_DEVICE_ID", help="Set the id of the default device, all other devices will stay visible.")
|
||||
cm_group = parser.add_mutually_exclusive_group()
|
||||
cm_group.add_argument("--cuda-malloc", action="store_true", help="Enable cudaMallocAsync (enabled by default for torch 2.0 and up).")
|
||||
@ -111,7 +111,7 @@ parser.add_argument("--preview-method", type=LatentPreviewMethod, default=Latent
|
||||
parser.add_argument("--preview-size", type=int, default=512, help="Sets the maximum preview size for sampler nodes.")
|
||||
|
||||
cache_group = parser.add_mutually_exclusive_group()
|
||||
cache_group.add_argument("--cache-ram", nargs='*', type=float, default=[], metavar="GB", help="Use RAM pressure caching with the specified headroom thresholds. This is the default caching mode. The first value sets the active-cache threshold; the optional second value sets the inactive-cache/pin threshold. Defaults when no values are provided: active 25%% of system RAM (min 4GB, max 32GB), inactive 75%% of system RAM (min 12GB, max 96GB).")
|
||||
cache_group.add_argument("--cache-ram", nargs='*', type=float, default=[], metavar="GB", help="Use RAM pressure caching with the specified headroom thresholds. This is the default caching mode. The first value sets the active-cache threshold; the optional second value sets the inactive-cache/pin threshold. Defaults when no values are provided: active 10%% of system RAM (min 2GB, max 10GB), inactive 100%% of system RAM (max 96GB).")
|
||||
cache_group.add_argument("--cache-classic", action="store_true", help="Use the old style (aggressive) caching.")
|
||||
cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.")
|
||||
cache_group.add_argument("--cache-none", action="store_true", help="Reduced RAM/VRAM usage at the expense of executing every node for each run.")
|
||||
@ -149,6 +149,7 @@ parser.add_argument("--async-offload", nargs='?', const=2, type=int, default=Non
|
||||
parser.add_argument("--disable-async-offload", action="store_true", help="Disable async weight offloading.")
|
||||
parser.add_argument("--disable-dynamic-vram", action="store_true", help="Disable dynamic VRAM and use estimate based model loading.")
|
||||
parser.add_argument("--enable-dynamic-vram", action="store_true", help="Enable dynamic VRAM on systems where it's not enabled by default.")
|
||||
parser.add_argument("--fast-disk", action="store_true", help="Prefer disk-backed dynamic loading and offload over unpinned RAM. Can be faster for users with fast NVME disks.")
|
||||
|
||||
parser.add_argument("--force-non-blocking", action="store_true", help="Force ComfyUI to use non-blocking operations for all applicable tensors. This may improve performance on some non-Nvidia systems but can cause issues with some workflows.")
|
||||
|
||||
@ -165,6 +166,8 @@ class PerformanceFeature(enum.Enum):
|
||||
|
||||
parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. This is used to test new features so using it might crash your comfyui. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: {}".format(" ".join(map(lambda c: c.value, PerformanceFeature))))
|
||||
|
||||
parser.add_argument("--debug-hang", action="store_true", help="Enable stack trace dumps on Ctrl-C for debugging hangs.")
|
||||
|
||||
parser.add_argument("--disable-pinned-memory", action="store_true", help="Disable pinned memory use.")
|
||||
|
||||
parser.add_argument("--mmap-torch-files", action="store_true", help="Use mmap when loading ckpt/pt files.")
|
||||
|
||||
@ -24,13 +24,16 @@ IMAGE_ENCODERS = {
|
||||
"siglip_vision_model": comfy.clip_model.CLIPVisionModelProjection,
|
||||
"siglip2_vision_model": comfy.clip_model.CLIPVisionModelProjection,
|
||||
"dinov2": comfy.image_encoders.dino2.Dinov2Model,
|
||||
"dinov3": comfy.image_encoders.dino3.DINOv3ViTModel
|
||||
"dinov3": comfy.image_encoders.dino3.DINOv3ViTModel,
|
||||
}
|
||||
|
||||
class ClipVisionModel():
|
||||
def __init__(self, json_config):
|
||||
with open(json_config) as f:
|
||||
config = json.load(f)
|
||||
if isinstance(json_config, dict):
|
||||
config = json_config
|
||||
else:
|
||||
with open(json_config) as f:
|
||||
config = json.load(f)
|
||||
|
||||
self.image_size = config.get("image_size", 224)
|
||||
self.image_mean = config.get("image_mean", [0.48145466, 0.4578275, 0.40821073])
|
||||
@ -136,8 +139,10 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
|
||||
json_config = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "image_encoders"), "dino2_giant.json")
|
||||
elif 'encoder.layer.23.layer_scale2.lambda1' in sd:
|
||||
json_config = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "image_encoders"), "dino2_large.json")
|
||||
elif 'layer.9.attention.o_proj.bias' in sd: # dinov3
|
||||
elif 'layer.9.attention.o_proj.bias' in sd: # dinov3 large (24 layers)
|
||||
json_config = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "image_encoders"), "dino3_large.json")
|
||||
elif 'layer.0.mlp.gate_proj.weight' in sd and 'layer.31.norm1.weight' in sd: # Dinov3 ViT-H/16+ (SwiGLU gated MLP, 32 layers)
|
||||
json_config = comfy.image_encoders.dino3.DINOV3_VITH_CONFIG
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
"""Comfy-specific type hinting"""
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Literal, TypedDict, Optional
|
||||
from typing_extensions import NotRequired
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
@ -15,13 +15,14 @@
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import torch
|
||||
from enum import Enum
|
||||
import math
|
||||
import os
|
||||
import logging
|
||||
import copy
|
||||
import comfy.utils
|
||||
import comfy.model_management
|
||||
import comfy.model_detection
|
||||
@ -38,7 +39,7 @@ import comfy.ldm.hydit.controlnet
|
||||
import comfy.ldm.flux.controlnet
|
||||
import comfy.ldm.qwen_image.controlnet
|
||||
import comfy.cldm.dit_embedder
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import TYPE_CHECKING, Union
|
||||
if TYPE_CHECKING:
|
||||
from comfy.hooks import HookGroup
|
||||
|
||||
@ -64,6 +65,18 @@ class StrengthType(Enum):
|
||||
CONSTANT = 1
|
||||
LINEAR_UP = 2
|
||||
|
||||
class ControlIsolation:
|
||||
'''Temporarily set a ControlBase object's previous_controlnet to None to prevent cascading calls.'''
|
||||
def __init__(self, control: ControlBase):
|
||||
self.control = control
|
||||
self.orig_previous_controlnet = control.previous_controlnet
|
||||
|
||||
def __enter__(self):
|
||||
self.control.previous_controlnet = None
|
||||
|
||||
def __exit__(self, *args):
|
||||
self.control.previous_controlnet = self.orig_previous_controlnet
|
||||
|
||||
class ControlBase:
|
||||
def __init__(self):
|
||||
self.cond_hint_original = None
|
||||
@ -77,7 +90,7 @@ class ControlBase:
|
||||
self.compression_ratio = 8
|
||||
self.upscale_algorithm = 'nearest-exact'
|
||||
self.extra_args = {}
|
||||
self.previous_controlnet = None
|
||||
self.previous_controlnet: Union[ControlBase, None] = None
|
||||
self.extra_conds = []
|
||||
self.strength_type = StrengthType.CONSTANT
|
||||
self.concat_mask = False
|
||||
@ -85,6 +98,7 @@ class ControlBase:
|
||||
self.extra_concat = None
|
||||
self.extra_hooks: HookGroup = None
|
||||
self.preprocess_image = lambda a: a
|
||||
self.multigpu_clones: dict[torch.device, ControlBase] = {}
|
||||
|
||||
def set_cond_hint(self, cond_hint, strength=1.0, timestep_percent_range=(0.0, 1.0), vae=None, extra_concat=[]):
|
||||
self.cond_hint_original = cond_hint
|
||||
@ -111,17 +125,38 @@ class ControlBase:
|
||||
def cleanup(self):
|
||||
if self.previous_controlnet is not None:
|
||||
self.previous_controlnet.cleanup()
|
||||
|
||||
for device_cnet in self.multigpu_clones.values():
|
||||
with ControlIsolation(device_cnet):
|
||||
device_cnet.cleanup()
|
||||
self.cond_hint = None
|
||||
self.extra_concat = None
|
||||
self.timestep_range = None
|
||||
|
||||
def get_models(self):
|
||||
out = []
|
||||
for device_cnet in self.multigpu_clones.values():
|
||||
out += device_cnet.get_models_only_self()
|
||||
if self.previous_controlnet is not None:
|
||||
out += self.previous_controlnet.get_models()
|
||||
return out
|
||||
|
||||
def get_models_only_self(self):
|
||||
'Calls get_models, but temporarily sets previous_controlnet to None.'
|
||||
with ControlIsolation(self):
|
||||
return self.get_models()
|
||||
|
||||
def get_instance_for_device(self, device):
|
||||
'Returns instance of this Control object intended for selected device.'
|
||||
return self.multigpu_clones.get(device, self)
|
||||
|
||||
def deepclone_multigpu(self, load_device, autoregister=False):
|
||||
'''
|
||||
Create deep clone of Control object where model(s) is set to other devices.
|
||||
|
||||
When autoregister is set to True, the deep clone is also added to multigpu_clones dict.
|
||||
'''
|
||||
raise NotImplementedError("Classes inheriting from ControlBase should define their own deepclone_multigpu funtion.")
|
||||
|
||||
def get_extra_hooks(self):
|
||||
out = []
|
||||
if self.extra_hooks is not None:
|
||||
@ -130,7 +165,7 @@ class ControlBase:
|
||||
out += self.previous_controlnet.get_extra_hooks()
|
||||
return out
|
||||
|
||||
def copy_to(self, c):
|
||||
def copy_to(self, c: ControlBase):
|
||||
c.cond_hint_original = self.cond_hint_original
|
||||
c.strength = self.strength
|
||||
c.timestep_percent_range = self.timestep_percent_range
|
||||
@ -284,6 +319,14 @@ class ControlNet(ControlBase):
|
||||
self.copy_to(c)
|
||||
return c
|
||||
|
||||
def deepclone_multigpu(self, load_device, autoregister=False):
|
||||
c = self.copy()
|
||||
c.control_model = copy.deepcopy(c.control_model)
|
||||
c.control_model_wrapped = comfy.model_patcher.ModelPatcher(c.control_model, load_device=load_device, offload_device=comfy.model_management.unet_offload_device())
|
||||
if autoregister:
|
||||
self.multigpu_clones[load_device] = c
|
||||
return c
|
||||
|
||||
def get_models(self):
|
||||
out = super().get_models()
|
||||
out.append(self.control_model_wrapped)
|
||||
@ -314,6 +357,10 @@ class QwenFunControlNet(ControlNet):
|
||||
super().pre_run(model, percent_to_timestep_function)
|
||||
self.set_extra_arg("base_model", model.diffusion_model)
|
||||
|
||||
def cleanup(self):
|
||||
self.extra_args.pop("base_model", None)
|
||||
super().cleanup()
|
||||
|
||||
def copy(self):
|
||||
c = QwenFunControlNet(None, global_average_pooling=self.global_average_pooling, load_device=self.load_device, manual_cast_dtype=self.manual_cast_dtype)
|
||||
c.control_model = self.control_model
|
||||
@ -906,6 +953,14 @@ class T2IAdapter(ControlBase):
|
||||
self.copy_to(c)
|
||||
return c
|
||||
|
||||
def deepclone_multigpu(self, load_device, autoregister=False):
|
||||
c = self.copy()
|
||||
c.t2i_model = copy.deepcopy(c.t2i_model)
|
||||
c.device = load_device
|
||||
if autoregister:
|
||||
self.multigpu_clones[load_device] = c
|
||||
return c
|
||||
|
||||
def load_t2i_adapter(t2i_data, model_options={}): #TODO: model_options
|
||||
compression_ratio = 8
|
||||
upscale_algorithm = 'nearest-exact'
|
||||
|
||||
@ -1,5 +1,20 @@
|
||||
import logging
|
||||
|
||||
import torch
|
||||
|
||||
_CK_STOCHASTIC_ROUNDING_AVAILABLE = False
|
||||
try:
|
||||
import comfy_kitchen as ck
|
||||
_ck_stochastic_rounding_fp8 = ck.stochastic_rounding_fp8
|
||||
_CK_STOCHASTIC_ROUNDING_AVAILABLE = True
|
||||
except (AttributeError, ImportError):
|
||||
logging.warning("comfy_kitchen does not support stochastic FP8 rounding, please update comfy_kitchen.")
|
||||
|
||||
if not _CK_STOCHASTIC_ROUNDING_AVAILABLE:
|
||||
def _ck_stochastic_rounding_fp8(value, rng, dtype):
|
||||
raise NotImplementedError("comfy_kitchen does not support stochastic FP8 rounding")
|
||||
|
||||
|
||||
def calc_mantissa(abs_x, exponent, normal_mask, MANTISSA_BITS, EXPONENT_BIAS, generator=None):
|
||||
mantissa_scaled = torch.where(
|
||||
normal_mask,
|
||||
@ -57,6 +72,10 @@ def stochastic_rounding(value, dtype, seed=0):
|
||||
if dtype == torch.float8_e4m3fn or dtype == torch.float8_e5m2:
|
||||
generator = torch.Generator(device=value.device)
|
||||
generator.manual_seed(seed)
|
||||
if _CK_STOCHASTIC_ROUNDING_AVAILABLE:
|
||||
rng = torch.randint(0, 256, value.size(), dtype=torch.uint8, layout=value.layout, device=value.device, generator=generator)
|
||||
return _ck_stochastic_rounding_fp8(value, rng, dtype)
|
||||
|
||||
output = torch.empty_like(value, dtype=dtype)
|
||||
num_slices = max(1, (value.numel() / (4096 * 4096)))
|
||||
slice_size = max(1, round(value.shape[0] / num_slices))
|
||||
|
||||
@ -1,7 +1,13 @@
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from comfy.text_encoders.bert import BertAttention
|
||||
import comfy.model_management
|
||||
from comfy.ldm.modules.attention import optimized_attention_for_device
|
||||
from comfy.ldm.depth_anything_3.reference_view_selector import (
|
||||
select_reference_view, reorder_by_reference, restore_original_order,
|
||||
THRESH_FOR_REF_SELECTION,
|
||||
)
|
||||
|
||||
|
||||
class Dino2AttentionOutput(torch.nn.Module):
|
||||
@ -14,13 +20,41 @@ class Dino2AttentionOutput(torch.nn.Module):
|
||||
|
||||
|
||||
class Dino2AttentionBlock(torch.nn.Module):
|
||||
def __init__(self, embed_dim, heads, layer_norm_eps, dtype, device, operations):
|
||||
def __init__(self, embed_dim, heads, layer_norm_eps, dtype, device, operations,
|
||||
qk_norm=False):
|
||||
super().__init__()
|
||||
self.heads = heads
|
||||
self.head_dim = embed_dim // heads
|
||||
self.attention = BertAttention(embed_dim, heads, dtype, device, operations)
|
||||
self.output = Dino2AttentionOutput(embed_dim, embed_dim, layer_norm_eps, dtype, device, operations)
|
||||
if qk_norm:
|
||||
self.q_norm = operations.LayerNorm(self.head_dim, dtype=dtype, device=device)
|
||||
self.k_norm = operations.LayerNorm(self.head_dim, dtype=dtype, device=device)
|
||||
else:
|
||||
self.q_norm = None
|
||||
self.k_norm = None
|
||||
|
||||
def forward(self, x, mask, optimized_attention):
|
||||
return self.output(self.attention(x, mask, optimized_attention))
|
||||
def forward(self, x, mask, optimized_attention, pos=None, rope=None):
|
||||
# Fast path used by the existing CLIP-vision DINOv2 (no DA3 extensions).
|
||||
if self.q_norm is None and rope is None:
|
||||
return self.output(self.attention(x, mask, optimized_attention))
|
||||
|
||||
# DA3 path: do QKV manually so we can apply per-head QK-norm and 2D RoPE.
|
||||
attn = self.attention
|
||||
B, N, C = x.shape
|
||||
h = self.heads
|
||||
d = self.head_dim
|
||||
q = attn.query(x).view(B, N, h, d).transpose(1, 2)
|
||||
k = attn.key(x).view(B, N, h, d).transpose(1, 2)
|
||||
v = attn.value(x).view(B, N, h, d).transpose(1, 2)
|
||||
if self.q_norm is not None:
|
||||
q = self.q_norm(q)
|
||||
k = self.k_norm(k)
|
||||
if rope is not None and pos is not None:
|
||||
q = rope(q, pos)
|
||||
k = rope(k, pos)
|
||||
out = optimized_attention(q, k, v, h, mask=mask, skip_reshape=True)
|
||||
return self.output(out)
|
||||
|
||||
|
||||
class LayerScale(torch.nn.Module):
|
||||
@ -64,9 +98,11 @@ class SwiGLUFFN(torch.nn.Module):
|
||||
|
||||
|
||||
class Dino2Block(torch.nn.Module):
|
||||
def __init__(self, dim, num_heads, layer_norm_eps, dtype, device, operations, use_swiglu_ffn):
|
||||
def __init__(self, dim, num_heads, layer_norm_eps, dtype, device, operations, use_swiglu_ffn,
|
||||
qk_norm=False):
|
||||
super().__init__()
|
||||
self.attention = Dino2AttentionBlock(dim, num_heads, layer_norm_eps, dtype, device, operations)
|
||||
self.attention = Dino2AttentionBlock(dim, num_heads, layer_norm_eps, dtype, device, operations,
|
||||
qk_norm=qk_norm)
|
||||
self.layer_scale1 = LayerScale(dim, dtype, device, operations)
|
||||
self.layer_scale2 = LayerScale(dim, dtype, device, operations)
|
||||
if use_swiglu_ffn:
|
||||
@ -76,19 +112,90 @@ class Dino2Block(torch.nn.Module):
|
||||
self.norm1 = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)
|
||||
self.norm2 = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)
|
||||
|
||||
def forward(self, x, optimized_attention):
|
||||
x = x + self.layer_scale1(self.attention(self.norm1(x), None, optimized_attention))
|
||||
def forward(self, x, optimized_attention, pos=None, rope=None, attn_mask=None):
|
||||
x = x + self.layer_scale1(self.attention(self.norm1(x), attn_mask, optimized_attention,
|
||||
pos=pos, rope=rope))
|
||||
x = x + self.layer_scale2(self.mlp(self.norm2(x)))
|
||||
return x
|
||||
|
||||
|
||||
class Dino2Encoder(torch.nn.Module):
|
||||
def __init__(self, dim, num_heads, layer_norm_eps, num_layers, dtype, device, operations, use_swiglu_ffn):
|
||||
# -----------------------------------------------------------------------------
|
||||
# 2D Rotary position embedding (DA3 extension)
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class _PositionGetter:
|
||||
"""Cache (h, w) -> flat (y, x) position grid used to feed ``rope``."""
|
||||
|
||||
def __init__(self):
|
||||
self._cache: dict = {}
|
||||
|
||||
def __call__(self, batch_size: int, height: int, width: int, device) -> torch.Tensor:
|
||||
key = (height, width, device)
|
||||
if key not in self._cache:
|
||||
y = torch.arange(height, device=device)
|
||||
x = torch.arange(width, device=device)
|
||||
self._cache[key] = torch.cartesian_prod(y, x)
|
||||
cached = self._cache[key]
|
||||
return cached.view(1, height * width, 2).expand(batch_size, -1, -1).clone()
|
||||
|
||||
|
||||
class RotaryPositionEmbedding2D(torch.nn.Module):
|
||||
"""2D RoPE used by DA3-Small/Base. No learnable parameters."""
|
||||
|
||||
def __init__(self, frequency: float = 100.0):
|
||||
super().__init__()
|
||||
self.layer = torch.nn.ModuleList([Dino2Block(dim, num_heads, layer_norm_eps, dtype, device, operations, use_swiglu_ffn = use_swiglu_ffn)
|
||||
for _ in range(num_layers)])
|
||||
self.base_frequency = frequency
|
||||
self._freq_cache: dict = {}
|
||||
|
||||
def _components(self, dim: int, seq_len: int, device, dtype):
|
||||
key = (dim, seq_len, device, dtype)
|
||||
if key not in self._freq_cache:
|
||||
exp = torch.arange(0, dim, 2, device=device).float() / dim
|
||||
inv_freq = 1.0 / (self.base_frequency ** exp)
|
||||
pos = torch.arange(seq_len, device=device, dtype=inv_freq.dtype)
|
||||
ang = torch.einsum("i,j->ij", pos, inv_freq)
|
||||
ang = ang.to(dtype)
|
||||
ang = torch.cat((ang, ang), dim=-1)
|
||||
self._freq_cache[key] = (ang.cos().to(dtype), ang.sin().to(dtype))
|
||||
return self._freq_cache[key]
|
||||
|
||||
@staticmethod
|
||||
def _rotate(x: torch.Tensor) -> torch.Tensor:
|
||||
d = x.shape[-1]
|
||||
x1, x2 = x[..., : d // 2], x[..., d // 2:]
|
||||
return torch.cat((-x2, x1), dim=-1)
|
||||
|
||||
def _apply_1d(self, tokens, positions, cos_c, sin_c):
|
||||
cos = F.embedding(positions, cos_c)[:, None, :, :]
|
||||
sin = F.embedding(positions, sin_c)[:, None, :, :]
|
||||
return (tokens * cos) + (self._rotate(tokens) * sin)
|
||||
|
||||
def forward(self, tokens: torch.Tensor, positions: torch.Tensor) -> torch.Tensor:
|
||||
feature_dim = tokens.size(-1) // 2
|
||||
max_pos = int(positions.max()) + 1
|
||||
cos_c, sin_c = self._components(feature_dim, max_pos, tokens.device, tokens.dtype)
|
||||
v, h = tokens.chunk(2, dim=-1)
|
||||
v = self._apply_1d(v, positions[..., 0], cos_c, sin_c)
|
||||
h = self._apply_1d(h, positions[..., 1], cos_c, sin_c)
|
||||
return torch.cat((v, h), dim=-1)
|
||||
|
||||
|
||||
class Dino2Encoder(torch.nn.Module):
|
||||
def __init__(self, dim, num_heads, layer_norm_eps, num_layers, dtype, device, operations, use_swiglu_ffn,
|
||||
qknorm_start: int = -1):
|
||||
super().__init__()
|
||||
self.layer = torch.nn.ModuleList([
|
||||
Dino2Block(
|
||||
dim, num_heads, layer_norm_eps, dtype, device, operations,
|
||||
use_swiglu_ffn=use_swiglu_ffn,
|
||||
qk_norm=(qknorm_start != -1 and i >= qknorm_start),
|
||||
)
|
||||
for i in range(num_layers)
|
||||
])
|
||||
|
||||
def forward(self, x, intermediate_output=None):
|
||||
# Backward-compat path used by ``ClipVisionModel`` (no DA3 extensions).
|
||||
optimized_attention = optimized_attention_for_device(x.device, False, small_input=True)
|
||||
|
||||
if intermediate_output is not None:
|
||||
@ -122,16 +229,27 @@ class Dino2PatchEmbeddings(torch.nn.Module):
|
||||
|
||||
|
||||
class Dino2Embeddings(torch.nn.Module):
|
||||
def __init__(self, dim, dtype, device, operations):
|
||||
def __init__(self, dim, dtype, device, operations,
|
||||
patch_size: int = 14, image_size: int = 518,
|
||||
use_mask_token: bool = True,
|
||||
num_camera_tokens: int = 0):
|
||||
super().__init__()
|
||||
patch_size = 14
|
||||
image_size = 518
|
||||
self.patch_size = patch_size
|
||||
self.image_size = image_size
|
||||
|
||||
self.patch_embeddings = Dino2PatchEmbeddings(dim, patch_size=patch_size, image_size=image_size, dtype=dtype, device=device, operations=operations)
|
||||
self.position_embeddings = torch.nn.Parameter(torch.empty(1, (image_size // patch_size) ** 2 + 1, dim, dtype=dtype, device=device))
|
||||
self.cls_token = torch.nn.Parameter(torch.empty(1, 1, dim, dtype=dtype, device=device)) # mask_token is a pre-training param, kept only so strict loading accepts the key.
|
||||
self.mask_token = torch.nn.Parameter(torch.empty(1, dim, dtype=dtype, device=device))
|
||||
if use_mask_token:
|
||||
self.mask_token = torch.nn.Parameter(torch.empty(1, dim, dtype=dtype, device=device))
|
||||
else:
|
||||
self.mask_token = None
|
||||
if num_camera_tokens > 0:
|
||||
# DA3 stores (ref_token, src_token) pairs that get injected at the
|
||||
# alt-attn boundary; see ``Dinov2Model._inject_camera_token``.
|
||||
self.camera_token = torch.nn.Parameter(torch.empty(1, num_camera_tokens, dim, dtype=dtype, device=device))
|
||||
else:
|
||||
self.camera_token = None
|
||||
|
||||
def interpolate_pos_encoding(self, x, h_pixels, w_pixels):
|
||||
pos_embed = comfy.model_management.cast_to_device(self.position_embeddings, x.device, torch.float32)
|
||||
@ -140,12 +258,22 @@ class Dino2Embeddings(torch.nn.Module):
|
||||
patch_pos = pos_embed[:, 1:]
|
||||
N = patch_pos.shape[1]
|
||||
M = int(N ** 0.5)
|
||||
assert N == M * M, f"DINOv2 position grid must be square, got N={N} patches (sqrt={M})"
|
||||
h0 = h_pixels // self.patch_size
|
||||
w0 = w_pixels // self.patch_size
|
||||
scale_factor = ((h0 + 0.1) / M, (w0 + 0.1) / M) # +0.1 matches upstream DINOv2's FP-rounding workaround so the interpolate output size lands on (h0, w0).
|
||||
# +0.1 matches upstream DINOv2's FP-rounding workaround so the interpolate output size lands on (h0, w0).
|
||||
# scale_factor is (height_scale, width_scale) -- height MUST come first;
|
||||
# swapping these only happens to work for square inputs and breaks
|
||||
# non-square paths like DA3-Small / DA3-Base multi-view.
|
||||
scale_factor = ((h0 + 0.1) / M, (w0 + 0.1) / M)
|
||||
|
||||
patch_pos = patch_pos.reshape(1, M, M, -1).permute(0, 3, 1, 2)
|
||||
patch_pos = torch.nn.functional.interpolate(patch_pos, scale_factor=scale_factor, mode="bicubic", antialias=False)
|
||||
assert (h0, w0) == patch_pos.shape[-2:], (
|
||||
f"Interpolated pos-embed grid {tuple(patch_pos.shape[-2:])} does not match "
|
||||
f"target patch grid ({h0}, {w0}) for input {h_pixels}x{w_pixels} (patch_size={self.patch_size}); "
|
||||
f"check scale_factor axis order and +0.1 rounding workaround"
|
||||
)
|
||||
patch_pos = patch_pos.permute(0, 2, 3, 1).flatten(1, 2)
|
||||
return torch.cat((class_pos, patch_pos), dim=1).to(x.dtype)
|
||||
|
||||
@ -168,12 +296,51 @@ class Dinov2Model(torch.nn.Module):
|
||||
heads = config_dict["num_attention_heads"]
|
||||
layer_norm_eps = config_dict["layer_norm_eps"]
|
||||
use_swiglu_ffn = config_dict["use_swiglu_ffn"]
|
||||
patch_size = config_dict.get("patch_size", 14)
|
||||
image_size = config_dict.get("image_size", 518)
|
||||
use_mask_token = config_dict.get("use_mask_token", True)
|
||||
|
||||
self.embeddings = Dino2Embeddings(dim, dtype, device, operations)
|
||||
self.encoder = Dino2Encoder(dim, heads, layer_norm_eps, num_layers, dtype, device, operations, use_swiglu_ffn = use_swiglu_ffn)
|
||||
# DA3 extensions (all default to disabled).
|
||||
self.alt_start = config_dict.get("alt_start", -1)
|
||||
self.qknorm_start = config_dict.get("qknorm_start", -1)
|
||||
self.rope_start = config_dict.get("rope_start", -1)
|
||||
self.cat_token = config_dict.get("cat_token", False)
|
||||
rope_freq = config_dict.get("rope_freq", 100.0)
|
||||
|
||||
self.embed_dim = dim
|
||||
self.patch_size = patch_size
|
||||
self.num_register_tokens = 0
|
||||
self.patch_start_idx = 1
|
||||
|
||||
if self.rope_start != -1 and rope_freq > 0:
|
||||
self.rope = RotaryPositionEmbedding2D(frequency=rope_freq)
|
||||
self._position_getter = _PositionGetter()
|
||||
else:
|
||||
self.rope = None
|
||||
self._position_getter = None
|
||||
|
||||
# camera_token shape: (1, 2, dim) -> (ref_token, src_token).
|
||||
num_cam_tokens = 2 if self.alt_start != -1 else 0
|
||||
|
||||
self.embeddings = Dino2Embeddings(
|
||||
dim, dtype, device, operations,
|
||||
patch_size=patch_size, image_size=image_size,
|
||||
use_mask_token=use_mask_token, num_camera_tokens=num_cam_tokens,
|
||||
)
|
||||
self.encoder = Dino2Encoder(
|
||||
dim, heads, layer_norm_eps, num_layers, dtype, device, operations,
|
||||
use_swiglu_ffn=use_swiglu_ffn,
|
||||
qknorm_start=self.qknorm_start,
|
||||
)
|
||||
self.layernorm = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)
|
||||
|
||||
def forward(self, pixel_values, attention_mask=None, intermediate_output=None):
|
||||
if self.alt_start != -1:
|
||||
raise RuntimeError(
|
||||
"Dinov2Model.forward() is the backward-compatible CLIP-vision path and does not "
|
||||
"apply DA3 extensions (RoPE, alternating attention, camera-token injection). "
|
||||
"Use get_intermediate_layers_da3() for Depth Anything 3 models."
|
||||
)
|
||||
x = self.embeddings(pixel_values)
|
||||
x, i = self.encoder(x, intermediate_output=intermediate_output)
|
||||
x = self.layernorm(x)
|
||||
@ -181,6 +348,7 @@ class Dinov2Model(torch.nn.Module):
|
||||
return x, i, pooled_output, None
|
||||
|
||||
def get_intermediate_layers(self, pixel_values, indices, apply_norm=True):
|
||||
"""Single-view multi-layer feature extraction."""
|
||||
x = self.embeddings(pixel_values)
|
||||
optimized_attention = optimized_attention_for_device(x.device, False, small_input=True)
|
||||
n_layers = len(self.encoder.layer)
|
||||
@ -197,3 +365,132 @@ class Dinov2Model(torch.nn.Module):
|
||||
if i >= max_idx:
|
||||
break
|
||||
return [cache[i] for i in resolved]
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Depth Anything 3 forward
|
||||
# ------------------------------------------------------------------
|
||||
def _prepare_rope_positions(self, B, S, H, W, device):
|
||||
if self.rope is None:
|
||||
return None, None
|
||||
ph, pw = H // self.patch_size, W // self.patch_size
|
||||
pos = self._position_getter(B * S, ph, pw, device=device)
|
||||
# Shift so the cls/cam token at position 0 is reserved for "no diff".
|
||||
pos = pos + 1
|
||||
cls_pos = torch.zeros(B * S, self.patch_start_idx, 2, device=device, dtype=pos.dtype)
|
||||
# Per-view local: real grid positions for patches, 0 for cls token.
|
||||
pos_local = torch.cat([cls_pos, pos], dim=1)
|
||||
# Global (across views): same grid positions; cls token still at 0,
|
||||
# but patches share the same positions in every view.
|
||||
pos_global = torch.cat([cls_pos, torch.zeros_like(pos) + 1], dim=1)
|
||||
return pos_local, pos_global
|
||||
|
||||
def _inject_camera_token(self, x: torch.Tensor, B: int, S: int, cam_token: "torch.Tensor | None") -> torch.Tensor:
|
||||
# x: (B, S, N, C). Replace token at index 0 with the camera token.
|
||||
if cam_token is not None:
|
||||
inj = cam_token
|
||||
else:
|
||||
ct = comfy.model_management.cast_to_device(self.embeddings.camera_token, x.device, x.dtype)
|
||||
ref_token = ct[:, :1].expand(B, -1, -1)
|
||||
src_token = ct[:, 1:].expand(B, max(S - 1, 0), -1)
|
||||
inj = torch.cat([ref_token, src_token], dim=1)
|
||||
x = x.clone()
|
||||
x[:, :, 0] = inj
|
||||
return x
|
||||
|
||||
def get_intermediate_layers_da3(self, pixel_values, out_layers, cam_token=None, ref_view_strategy="saddle_balanced", export_feat_layers=None):
|
||||
"""Multi-view multi-layer feature extraction used by Depth Anything 3."""
|
||||
if pixel_values.ndim == 4:
|
||||
pixel_values = pixel_values.unsqueeze(1)
|
||||
assert pixel_values.ndim == 5 and pixel_values.shape[2] == 3, \
|
||||
f"expected (B,3,H,W) or (B,S,3,H,W); got {tuple(pixel_values.shape)}"
|
||||
B, S, _, H, W = pixel_values.shape
|
||||
|
||||
# Patch + cls + (interpolated) pos embed for each view.
|
||||
x = pixel_values.reshape(B * S, 3, H, W)
|
||||
x = self.embeddings(x) # (B*S, 1+N, C)
|
||||
x = x.reshape(B, S, x.shape[-2], x.shape[-1]) # (B, S, 1+N, C)
|
||||
|
||||
pos_local, pos_global = self._prepare_rope_positions(B, S, H, W, x.device)
|
||||
# optimized_attention is only used by blocks without QK-norm/RoPE
|
||||
# (vanilla DINOv2 path); enabling-aware blocks fall through to SDPA.
|
||||
optimized_attention = optimized_attention_for_device(x.device, False, small_input=True)
|
||||
|
||||
out_set = set(out_layers)
|
||||
export_set = set(export_feat_layers) if export_feat_layers else set()
|
||||
outputs: list[torch.Tensor] = []
|
||||
aux_outputs: list[torch.Tensor] = []
|
||||
local_x = x
|
||||
b_idx = None
|
||||
|
||||
|
||||
for i, blk in enumerate(self.encoder.layer):
|
||||
apply_rope = self.rope is not None and i >= self.rope_start
|
||||
block_rope = self.rope if apply_rope else None
|
||||
l_pos = pos_local if apply_rope else None
|
||||
g_pos = pos_global if apply_rope else None
|
||||
|
||||
# Reference-view selection threshold: matches the upstream constant
|
||||
# THRESH_FOR_REF_SELECTION = 3. Skipped when a user-supplied
|
||||
# cam_token is provided (camera info already pins the geometry).
|
||||
if (self.alt_start != -1 and i == self.alt_start - 1 and S >= THRESH_FOR_REF_SELECTION and cam_token is None):
|
||||
b_idx = select_reference_view(x, strategy=ref_view_strategy)
|
||||
x = reorder_by_reference(x, b_idx)
|
||||
local_x = reorder_by_reference(local_x, b_idx)
|
||||
|
||||
if self.alt_start != -1 and i == self.alt_start:
|
||||
x = self._inject_camera_token(x, B, S, cam_token)
|
||||
|
||||
if self.alt_start != -1 and i >= self.alt_start and (i % 2 == 1):
|
||||
# Global attention across views: flatten S into the seq dim.
|
||||
t = x.reshape(B, S * x.shape[-2], x.shape[-1])
|
||||
p = g_pos.reshape(B, S * g_pos.shape[-2], g_pos.shape[-1]) if g_pos is not None else None
|
||||
t = blk(t, optimized_attention=optimized_attention, pos=p, rope=block_rope)
|
||||
x = t.reshape(B, S, x.shape[-2], x.shape[-1])
|
||||
else:
|
||||
# Per-view local attention.
|
||||
t = x.reshape(B * S, x.shape[-2], x.shape[-1])
|
||||
p = l_pos.reshape(B * S, l_pos.shape[-2], l_pos.shape[-1]) if l_pos is not None else None
|
||||
t = blk(t, optimized_attention=optimized_attention, pos=p, rope=block_rope)
|
||||
x = t.reshape(B, S, x.shape[-2], x.shape[-1])
|
||||
local_x = x
|
||||
|
||||
if i in out_set:
|
||||
if self.cat_token:
|
||||
out_x = torch.cat([local_x, x], dim=-1)
|
||||
else:
|
||||
out_x = x
|
||||
# Restore original view order on the way out so heads see views
|
||||
# in the user's expected order.
|
||||
if b_idx is not None and self.alt_start != -1:
|
||||
out_x = restore_original_order(out_x, b_idx)
|
||||
outputs.append(out_x)
|
||||
|
||||
if i in export_set:
|
||||
aux = x
|
||||
if b_idx is not None and self.alt_start != -1:
|
||||
aux = restore_original_order(aux, b_idx)
|
||||
aux_outputs.append(aux)
|
||||
|
||||
# Apply final norm. When cat_token is set, only the right half
|
||||
# ("global" features) is normalised; the left half is left as-is to
|
||||
# match the upstream DA3 head signature.
|
||||
normed: list[torch.Tensor] = []
|
||||
cls_tokens: list[torch.Tensor] = []
|
||||
for out_x in outputs:
|
||||
cls_tokens.append(out_x[:, :, 0])
|
||||
if out_x.shape[-1] == self.embed_dim:
|
||||
normed.append(self.layernorm(out_x))
|
||||
elif out_x.shape[-1] == self.embed_dim * 2:
|
||||
left = out_x[..., :self.embed_dim]
|
||||
right = self.layernorm(out_x[..., self.embed_dim:])
|
||||
normed.append(torch.cat([left, right], dim=-1))
|
||||
else:
|
||||
raise ValueError(f"Unexpected token width: {out_x.shape[-1]}")
|
||||
|
||||
# Drop cls/cam token from the patch sequence.
|
||||
normed = [o[..., 1 + self.num_register_tokens:, :] for o in normed]
|
||||
|
||||
# Final layernorm + drop cls token from auxiliary features too.
|
||||
aux_normed = [self.layernorm(o)[..., 1 + self.num_register_tokens:, :]
|
||||
for o in aux_outputs]
|
||||
return list(zip(normed, cls_tokens)), aux_normed
|
||||
|
||||
@ -3,10 +3,31 @@ import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
import comfy.model_management
|
||||
import comfy.ops
|
||||
from comfy.ldm.modules.attention import optimized_attention_for_device
|
||||
from comfy.image_encoders.dino2 import LayerScale as DINOv3ViTLayerScale
|
||||
|
||||
|
||||
# DINOv3 ViT-H/16+ (SwiGLU)
|
||||
DINOV3_VITH_CONFIG = {
|
||||
"model_type": "dinov3",
|
||||
"num_hidden_layers": 32,
|
||||
"hidden_size": 1280,
|
||||
"num_attention_heads": 20,
|
||||
"num_register_tokens": 4,
|
||||
"intermediate_size": 5120,
|
||||
"layer_norm_eps": 1e-5,
|
||||
"num_channels": 3,
|
||||
"patch_size": 16,
|
||||
"rope_theta": 100.0,
|
||||
"use_gated_mlp": True,
|
||||
"gated_mlp_act": "silu",
|
||||
"image_size": 1024,
|
||||
"image_mean": [0.485, 0.456, 0.406],
|
||||
"image_std": [0.229, 0.224, 0.225],
|
||||
}
|
||||
|
||||
|
||||
class DINOv3ViTMLP(nn.Module):
|
||||
def __init__(self, hidden_size, intermediate_size, mlp_bias, device, dtype, operations):
|
||||
super().__init__()
|
||||
@ -19,10 +40,13 @@ class DINOv3ViTMLP(nn.Module):
|
||||
def forward(self, x):
|
||||
return self.down_proj(self.act_fn(self.up_proj(x)))
|
||||
|
||||
|
||||
def rotate_half(x):
|
||||
x1 = x[..., : x.shape[-1] // 2]
|
||||
x2 = x[..., x.shape[-1] // 2 :]
|
||||
return torch.cat((-x2, x1), dim=-1)
|
||||
|
||||
|
||||
def apply_rotary_pos_emb(q, k, cos, sin, **kwargs):
|
||||
num_tokens = q.shape[-2]
|
||||
num_patches = sin.shape[-2]
|
||||
@ -39,6 +63,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, **kwargs):
|
||||
|
||||
return q, k
|
||||
|
||||
|
||||
class DINOv3ViTAttention(nn.Module):
|
||||
def __init__(self, hidden_size, num_attention_heads, device, dtype, operations):
|
||||
super().__init__()
|
||||
@ -46,20 +71,12 @@ class DINOv3ViTAttention(nn.Module):
|
||||
self.num_heads = num_attention_heads
|
||||
self.head_dim = self.embed_dim // self.num_heads
|
||||
|
||||
self.k_proj = operations.Linear(self.embed_dim, self.embed_dim, bias=False, device=device, dtype=dtype) # key_bias = False
|
||||
self.k_proj = operations.Linear(self.embed_dim, self.embed_dim, bias=False, device=device, dtype=dtype) # key_bias = False
|
||||
self.v_proj = operations.Linear(self.embed_dim, self.embed_dim, bias=True, device=device, dtype=dtype)
|
||||
|
||||
self.q_proj = operations.Linear(self.embed_dim, self.embed_dim, bias=True, device=device, dtype=dtype)
|
||||
self.o_proj = operations.Linear(self.embed_dim, self.embed_dim, bias=True, device=device, dtype=dtype)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
attention_mask: torch.Tensor | None = None,
|
||||
position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
|
||||
**kwargs,
|
||||
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||
|
||||
def forward(self, hidden_states, attention_mask=None, position_embeddings=None, **kwargs):
|
||||
batch_size, patches, _ = hidden_states.size()
|
||||
|
||||
query_states = self.q_proj(hidden_states)
|
||||
@ -75,7 +92,6 @@ class DINOv3ViTAttention(nn.Module):
|
||||
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
|
||||
|
||||
attn = optimized_attention_for_device(query_states.device, mask=False)
|
||||
|
||||
attn_output = attn(
|
||||
query_states, key_states, value_states, self.num_heads, attention_mask,
|
||||
skip_reshape=True, skip_output_reshape=True, low_precision_attention=False,
|
||||
@ -84,27 +100,24 @@ class DINOv3ViTAttention(nn.Module):
|
||||
attn_output = attn_output.transpose(1, 2)
|
||||
attn_output = attn_output.reshape(batch_size, patches, -1).contiguous()
|
||||
attn_output = self.o_proj(attn_output)
|
||||
|
||||
return attn_output
|
||||
|
||||
|
||||
class DINOv3ViTGatedMLP(nn.Module):
|
||||
def __init__(self, hidden_size, intermediate_size, mlp_bias, device, dtype, operations):
|
||||
def __init__(self, hidden_size, intermediate_size, mlp_bias, device, dtype, operations, act="silu"):
|
||||
super().__init__()
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.gate_proj = operations.Linear(self.hidden_size, self.intermediate_size, bias=mlp_bias, device=device, dtype=dtype)
|
||||
self.up_proj = operations.Linear(self.hidden_size, self.intermediate_size, bias=mlp_bias, device=device, dtype=dtype)
|
||||
self.down_proj = operations.Linear(self.intermediate_size, self.hidden_size, bias=mlp_bias, device=device, dtype=dtype)
|
||||
self.act_fn = torch.nn.GELU()
|
||||
self.act_fn = torch.nn.SiLU() if act == "silu" else torch.nn.GELU()
|
||||
|
||||
def forward(self, x):
|
||||
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
|
||||
return down_proj
|
||||
return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
|
||||
|
||||
def get_patches_center_coordinates(
|
||||
num_patches_h: int, num_patches_w: int, dtype: torch.dtype, device: torch.device
|
||||
) -> torch.Tensor:
|
||||
|
||||
def get_patches_center_coordinates(num_patches_h, num_patches_w, dtype, device):
|
||||
coords_h = torch.arange(0.5, num_patches_h, dtype=dtype, device=device)
|
||||
coords_w = torch.arange(0.5, num_patches_w, dtype=dtype, device=device)
|
||||
coords_h = coords_h / num_patches_h
|
||||
@ -114,105 +127,79 @@ def get_patches_center_coordinates(
|
||||
coords = 2.0 * coords - 1.0
|
||||
return coords
|
||||
|
||||
|
||||
class DINOv3ViTRopePositionEmbedding(nn.Module):
|
||||
inv_freq: torch.Tensor
|
||||
|
||||
def __init__(self, rope_theta, hidden_size, num_attention_heads, image_size, patch_size, device, dtype):
|
||||
def __init__(self, rope_theta, hidden_size, num_attention_heads, patch_size, device, dtype):
|
||||
super().__init__()
|
||||
self.base = rope_theta
|
||||
self.head_dim = hidden_size // num_attention_heads
|
||||
self.num_patches_h = image_size // patch_size
|
||||
self.num_patches_w = image_size // patch_size
|
||||
self.patch_size = patch_size
|
||||
|
||||
inv_freq = 1 / self.base ** torch.arange(0, 1, 4 / self.head_dim, dtype=torch.float32, device=device)
|
||||
self.register_buffer("inv_freq", inv_freq, persistent=False)
|
||||
|
||||
def forward(self, pixel_values: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
def forward(self, pixel_values):
|
||||
_, _, height, width = pixel_values.shape
|
||||
num_patches_h = height // self.patch_size
|
||||
num_patches_w = width // self.patch_size
|
||||
|
||||
device = pixel_values.device
|
||||
device_type = device.type if isinstance(device.type, str) and device.type != "mps" else "cpu"
|
||||
with torch.amp.autocast(device_type = device_type, enabled=False):
|
||||
patch_coords = get_patches_center_coordinates(
|
||||
num_patches_h, num_patches_w, dtype=torch.float32, device=device
|
||||
)
|
||||
|
||||
self.inv_freq = self.inv_freq.to(device)
|
||||
angles = 2 * math.pi * patch_coords[:, :, None] * self.inv_freq[None, None, :]
|
||||
angles = angles.flatten(1, 2)
|
||||
angles = angles.tile(2)
|
||||
|
||||
cos = torch.cos(angles)
|
||||
sin = torch.sin(angles)
|
||||
|
||||
dtype = pixel_values.dtype
|
||||
return cos.to(dtype=dtype), sin.to(dtype=dtype)
|
||||
patch_coords = get_patches_center_coordinates(num_patches_h, num_patches_w, dtype=torch.float32, device=pixel_values.device)
|
||||
self.inv_freq = self.inv_freq.to(pixel_values.device)
|
||||
angles = 2 * math.pi * patch_coords[:, :, None] * self.inv_freq[None, None, :]
|
||||
angles = angles.flatten(1, 2)
|
||||
angles = angles.tile(2)
|
||||
cos = torch.cos(angles).to(dtype=pixel_values.dtype)
|
||||
sin = torch.sin(angles).to(dtype=pixel_values.dtype)
|
||||
return cos, sin
|
||||
|
||||
|
||||
class DINOv3ViTEmbeddings(nn.Module):
|
||||
def __init__(self, hidden_size, num_register_tokens, num_channels, patch_size, dtype, device, operations):
|
||||
super().__init__()
|
||||
self.cls_token = nn.Parameter(torch.randn(1, 1, hidden_size, device=device, dtype=dtype))
|
||||
self.mask_token = nn.Parameter(torch.zeros(1, 1, hidden_size, device=device, dtype=dtype))
|
||||
self.cls_token = nn.Parameter(torch.empty(1, 1, hidden_size, device=device, dtype=dtype))
|
||||
self.mask_token = nn.Parameter(torch.empty(1, 1, hidden_size, device=device, dtype=dtype))
|
||||
self.register_tokens = nn.Parameter(torch.empty(1, num_register_tokens, hidden_size, device=device, dtype=dtype))
|
||||
self.patch_embeddings = operations.Conv2d(
|
||||
num_channels, hidden_size, kernel_size=patch_size, stride=patch_size, device=device, dtype=dtype
|
||||
)
|
||||
|
||||
def forward(self, pixel_values: torch.Tensor, bool_masked_pos: torch.Tensor | None = None):
|
||||
def forward(self, pixel_values, bool_masked_pos=None):
|
||||
batch_size = pixel_values.shape[0]
|
||||
target_dtype = self.patch_embeddings.weight.dtype
|
||||
|
||||
patch_embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype))
|
||||
patch_embeddings = self.patch_embeddings(pixel_values)
|
||||
patch_embeddings = patch_embeddings.flatten(2).transpose(1, 2)
|
||||
|
||||
if bool_masked_pos is not None:
|
||||
mask_token = self.mask_token.to(patch_embeddings.dtype)
|
||||
mask_token = comfy.ops.cast_to_input(self.mask_token, patch_embeddings)
|
||||
patch_embeddings = torch.where(bool_masked_pos.unsqueeze(-1), mask_token, patch_embeddings)
|
||||
|
||||
cls_token = self.cls_token.expand(batch_size, -1, -1)
|
||||
register_tokens = self.register_tokens.expand(batch_size, -1, -1)
|
||||
device = patch_embeddings.device
|
||||
cls_token = cls_token.to(device)
|
||||
register_tokens = register_tokens.to(device)
|
||||
cls_token = comfy.ops.cast_to_input(self.cls_token.expand(batch_size, -1, -1), patch_embeddings)
|
||||
register_tokens = comfy.ops.cast_to_input(self.register_tokens.expand(batch_size, -1, -1), patch_embeddings)
|
||||
embeddings = torch.cat([cls_token, register_tokens, patch_embeddings], dim=1)
|
||||
|
||||
return embeddings
|
||||
|
||||
|
||||
class DINOv3ViTLayer(nn.Module):
|
||||
|
||||
def __init__(self, hidden_size, layer_norm_eps, use_gated_mlp, mlp_bias, intermediate_size, num_attention_heads,
|
||||
device, dtype, operations):
|
||||
def __init__(self, hidden_size, layer_norm_eps, use_gated_mlp, mlp_bias, intermediate_size,
|
||||
num_attention_heads, device, dtype, operations, gated_mlp_act="silu"):
|
||||
super().__init__()
|
||||
|
||||
self.norm1 = operations.LayerNorm(hidden_size, eps=layer_norm_eps, device=device, dtype=dtype)
|
||||
self.attention = DINOv3ViTAttention(hidden_size, num_attention_heads, device=device, dtype=dtype, operations=operations)
|
||||
self.layer_scale1 = DINOv3ViTLayerScale(hidden_size, device=device, dtype=dtype, operations=None)
|
||||
|
||||
self.norm2 = operations.LayerNorm(hidden_size, eps=layer_norm_eps, device=device, dtype=dtype)
|
||||
|
||||
if use_gated_mlp:
|
||||
self.mlp = DINOv3ViTGatedMLP(hidden_size, intermediate_size, mlp_bias, device=device, dtype=dtype, operations=operations)
|
||||
self.mlp = DINOv3ViTGatedMLP(hidden_size, intermediate_size, mlp_bias, device=device, dtype=dtype, operations=operations, act=gated_mlp_act)
|
||||
else:
|
||||
self.mlp = DINOv3ViTMLP(hidden_size, intermediate_size=intermediate_size, mlp_bias=mlp_bias, device=device, dtype=dtype, operations=operations)
|
||||
self.layer_scale2 = DINOv3ViTLayerScale(hidden_size, device=device, dtype=dtype, operations=None)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
attention_mask: torch.Tensor | None = None,
|
||||
position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
|
||||
) -> torch.Tensor:
|
||||
def forward(self, hidden_states, attention_mask=None, position_embeddings=None):
|
||||
residual = hidden_states
|
||||
hidden_states = self.norm1(hidden_states)
|
||||
hidden_states = self.attention(
|
||||
hidden_states,
|
||||
attention_mask=attention_mask,
|
||||
position_embeddings=position_embeddings,
|
||||
)
|
||||
hidden_states = self.attention(hidden_states, attention_mask=attention_mask, position_embeddings=position_embeddings)
|
||||
hidden_states = self.layer_scale1(hidden_states)
|
||||
hidden_states = hidden_states + residual
|
||||
|
||||
@ -221,18 +208,12 @@ class DINOv3ViTLayer(nn.Module):
|
||||
hidden_states = self.mlp(hidden_states)
|
||||
hidden_states = self.layer_scale2(hidden_states)
|
||||
hidden_states = hidden_states + residual
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
class DINOv3ViTModel(nn.Module):
|
||||
def __init__(self, config, dtype, device, operations):
|
||||
super().__init__()
|
||||
use_bf16 = comfy.model_management.should_use_bf16(device, prioritize_performance=True)
|
||||
if dtype == torch.float16 and use_bf16:
|
||||
dtype = torch.bfloat16
|
||||
elif dtype == torch.float16 and not use_bf16:
|
||||
dtype = torch.float32
|
||||
num_hidden_layers = config["num_hidden_layers"]
|
||||
hidden_size = config["hidden_size"]
|
||||
num_attention_heads = config["num_attention_heads"]
|
||||
@ -242,45 +223,37 @@ class DINOv3ViTModel(nn.Module):
|
||||
num_channels = config["num_channels"]
|
||||
patch_size = config["patch_size"]
|
||||
rope_theta = config["rope_theta"]
|
||||
use_gated_mlp = config.get("use_gated_mlp", False)
|
||||
gated_mlp_act = config.get("gated_mlp_act", "silu")
|
||||
|
||||
self.embeddings = DINOv3ViTEmbeddings(
|
||||
hidden_size, num_register_tokens, num_channels=num_channels, patch_size=patch_size, dtype=dtype, device=device, operations=operations
|
||||
hidden_size, num_register_tokens, num_channels=num_channels, patch_size=patch_size,
|
||||
dtype=dtype, device=device, operations=operations
|
||||
)
|
||||
self.rope_embeddings = DINOv3ViTRopePositionEmbedding(
|
||||
rope_theta, hidden_size, num_attention_heads, image_size=512, patch_size=patch_size, dtype=dtype, device=device
|
||||
rope_theta, hidden_size, num_attention_heads, patch_size=patch_size, dtype=dtype, device=device
|
||||
)
|
||||
self.layer = nn.ModuleList(
|
||||
[DINOv3ViTLayer(hidden_size, layer_norm_eps, use_gated_mlp=False, mlp_bias=True,
|
||||
intermediate_size=intermediate_size,num_attention_heads = num_attention_heads,
|
||||
dtype=dtype, device=device, operations=operations)
|
||||
self.layer = nn.ModuleList([
|
||||
DINOv3ViTLayer(hidden_size, layer_norm_eps, use_gated_mlp=use_gated_mlp, mlp_bias=True,
|
||||
intermediate_size=intermediate_size, num_attention_heads=num_attention_heads,
|
||||
dtype=dtype, device=device, operations=operations, gated_mlp_act=gated_mlp_act)
|
||||
for _ in range(num_hidden_layers)])
|
||||
self.norm = operations.LayerNorm(hidden_size, eps=layer_norm_eps, dtype=dtype, device=device)
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.embeddings.patch_embeddings
|
||||
|
||||
def forward(
|
||||
self,
|
||||
pixel_values: torch.Tensor,
|
||||
bool_masked_pos: torch.Tensor | None = None,
|
||||
**kwargs,
|
||||
):
|
||||
|
||||
pixel_values = pixel_values.to(self.embeddings.patch_embeddings.weight.dtype)
|
||||
def forward(self, pixel_values, bool_masked_pos=None, **kwargs):
|
||||
hidden_states = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
|
||||
position_embeddings = self.rope_embeddings(pixel_values)
|
||||
|
||||
for i, layer_module in enumerate(self.layer):
|
||||
hidden_states = layer_module(
|
||||
hidden_states,
|
||||
position_embeddings=position_embeddings,
|
||||
)
|
||||
for layer_module in self.layer:
|
||||
hidden_states = layer_module(hidden_states, position_embeddings=position_embeddings)
|
||||
|
||||
if kwargs.get("skip_norm_elementwise", False):
|
||||
sequence_output= F.layer_norm(hidden_states, hidden_states.shape[-1:])
|
||||
sequence_output = F.layer_norm(hidden_states, hidden_states.shape[-1:])
|
||||
else:
|
||||
norm = self.norm.to(hidden_states.device)
|
||||
sequence_output = norm(hidden_states)
|
||||
pooled_output = sequence_output[:, 0, :]
|
||||
|
||||
return sequence_output, None, pooled_output, None
|
||||
|
||||
@ -239,6 +239,16 @@ class Flux2(LatentFormat):
|
||||
def process_out(self, latent):
|
||||
return latent
|
||||
|
||||
class TripoSplat(LatentFormat):
|
||||
# Sequence latent (B, 8192, 16) the camera token rides alongside as a second nested latent
|
||||
latent_channels = 16
|
||||
|
||||
def process_in(self, latent):
|
||||
return latent
|
||||
|
||||
def process_out(self, latent):
|
||||
return latent
|
||||
|
||||
class Mochi(LatentFormat):
|
||||
latent_channels = 12
|
||||
latent_dimensions = 3
|
||||
@ -802,13 +812,15 @@ class ZImagePixelSpace(ChromaRadiance):
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class HiDreamO1Pixel(ChromaRadiance):
|
||||
"""Pixel-space latent format for HiDream-O1.
|
||||
No VAE — model patches/unpatches raw RGB internally with patch_size=32.
|
||||
"""
|
||||
pass
|
||||
|
||||
class PixelDiTPixel(ChromaRadiance):
|
||||
pass
|
||||
|
||||
class CogVideoX(LatentFormat):
|
||||
"""Latent format for CogVideoX-2b (THUDM/CogVideoX-2b).
|
||||
|
||||
|
||||
@ -433,11 +433,11 @@ class Attention(nn.Module):
|
||||
if self.differential:
|
||||
q, q_diff = q.unbind(dim=1)
|
||||
k, k_diff = k.unbind(dim=1)
|
||||
out = optimized_attention(q, k, v, h, skip_reshape=True, transformer_options=transformer_options)
|
||||
out_diff = optimized_attention(q_diff, k_diff, v, h, skip_reshape=True, transformer_options=transformer_options)
|
||||
out = optimized_attention(q, k, v, h, skip_reshape=True, low_precision_attention=False, transformer_options=transformer_options)
|
||||
out_diff = optimized_attention(q_diff, k_diff, v, h, skip_reshape=True, low_precision_attention=False, transformer_options=transformer_options)
|
||||
out = out - out_diff
|
||||
else:
|
||||
out = optimized_attention(q, k, v, h, skip_reshape=True, transformer_options=transformer_options)
|
||||
out = optimized_attention(q, k, v, h, skip_reshape=True, low_precision_attention=False, transformer_options=transformer_options)
|
||||
|
||||
out = self.to_out(out)
|
||||
|
||||
|
||||
@ -138,11 +138,11 @@ class Attention(nn.Module):
|
||||
k_diff = _apply_rotary_pos_emb(k_diff.float(), freqs).to(k_dtype)
|
||||
|
||||
if self.differential:
|
||||
out = (optimized_attention(q, k, v, h, mask=mask, skip_reshape=True)
|
||||
- optimized_attention(q_diff, k_diff, v, h, mask=mask, skip_reshape=True))
|
||||
out = (optimized_attention(q, k, v, h, mask=mask, skip_reshape=True, low_precision_attention=False)
|
||||
- optimized_attention(q_diff, k_diff, v, h, mask=mask, skip_reshape=True, low_precision_attention=False))
|
||||
del q, k, v, q_diff, k_diff
|
||||
else:
|
||||
out = optimized_attention(q, k, v, h, mask=mask, skip_reshape=True)
|
||||
out = optimized_attention(q, k, v, h, mask=mask, skip_reshape=True, low_precision_attention=False)
|
||||
del q, k, v
|
||||
|
||||
return self.to_out(out)
|
||||
|
||||
@ -38,6 +38,8 @@ class ChromaRadianceParams(ChromaParams):
|
||||
# None means use the same dtype as the model.
|
||||
nerf_embedder_dtype: Optional[torch.dtype]
|
||||
use_x0: bool
|
||||
# Use sequential txt_ids instead of zeros
|
||||
use_sequential_txt_ids: bool
|
||||
|
||||
class ChromaRadiance(Chroma):
|
||||
"""
|
||||
@ -162,6 +164,9 @@ class ChromaRadiance(Chroma):
|
||||
if params.use_x0:
|
||||
self.register_buffer("__x0__", torch.tensor([]))
|
||||
|
||||
if params.use_sequential_txt_ids:
|
||||
self.register_buffer("__sequential__", torch.tensor([]))
|
||||
|
||||
@property
|
||||
def _nerf_final_layer(self) -> nn.Module:
|
||||
if self.params.nerf_final_head_type == "linear":
|
||||
@ -313,6 +318,9 @@ class ChromaRadiance(Chroma):
|
||||
img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0)
|
||||
img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)
|
||||
txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
|
||||
# Radiance after 2026-05-22 uses sequential txt_ids instead of zeros
|
||||
if params.use_sequential_txt_ids:
|
||||
txt_ids[:, :, 0] = torch.arange(context.shape[1], device=x.device, dtype=x.dtype).unsqueeze(0).expand(bs, -1)
|
||||
|
||||
img_out = self.forward_orig(
|
||||
img,
|
||||
|
||||
25
comfy/ldm/colormap.py
Normal file
25
comfy/ldm/colormap.py
Normal file
@ -0,0 +1,25 @@
|
||||
"""Colormap utilities for depth and geometry visualisation."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
def turbo(x: torch.Tensor) -> torch.Tensor:
|
||||
"""Anton Mikhailov polynomial approximation of the Turbo colormap.
|
||||
|
||||
Args:
|
||||
x: Float tensor with values in [0, 1].
|
||||
|
||||
Returns:
|
||||
RGB tensor of the same shape as ``x`` with a trailing size-3 dimension.
|
||||
"""
|
||||
x = x.clamp(0.0, 1.0)
|
||||
x2 = x * x
|
||||
x3 = x2 * x
|
||||
x4 = x2 * x2
|
||||
x5 = x4 * x
|
||||
r = 0.13572138 + 4.61539260*x - 42.66032258*x2 + 132.13108234*x3 - 152.94239396*x4 + 59.28637943*x5
|
||||
g = 0.09140261 + 2.19418839*x + 4.84296658*x2 - 14.18503333*x3 + 4.27729857*x4 + 2.82956604*x5
|
||||
b = 0.10667330 + 12.64194608*x - 60.58204836*x2 + 110.36276771*x3 - 89.90310912*x4 + 27.34824973*x5
|
||||
return torch.stack([r, g, b], dim=-1).clamp(0.0, 1.0)
|
||||
@ -14,15 +14,7 @@ from torchvision import transforms
|
||||
import comfy.patcher_extension
|
||||
from comfy.ldm.modules.attention import optimized_attention
|
||||
import comfy.ldm.common_dit
|
||||
|
||||
def apply_rotary_pos_emb(
|
||||
t: torch.Tensor,
|
||||
freqs: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
t_ = t.reshape(*t.shape[:-1], 2, -1).movedim(-2, -1).unsqueeze(-2).float()
|
||||
t_out = freqs[..., 0] * t_[..., 0] + freqs[..., 1] * t_[..., 1]
|
||||
t_out = t_out.movedim(-1, -2).reshape(*t.shape).type_as(t)
|
||||
return t_out
|
||||
import comfy.quant_ops
|
||||
|
||||
|
||||
# ---------------------- Feed Forward Network -----------------------
|
||||
@ -173,8 +165,7 @@ class Attention(nn.Module):
|
||||
k = self.k_norm(k)
|
||||
v = self.v_norm(v)
|
||||
if self.is_selfattn and rope_emb is not None: # only apply to self-attention!
|
||||
q = apply_rotary_pos_emb(q, rope_emb)
|
||||
k = apply_rotary_pos_emb(k, rope_emb)
|
||||
q, k = comfy.quant_ops.ck.apply_rope_split_half(q, k, rope_emb)
|
||||
return q, k, v
|
||||
|
||||
q, k, v = apply_norm_and_rotary_pos_emb(q, k, v, rope_emb)
|
||||
|
||||
177
comfy/ldm/depth_anything_3/camera.py
Normal file
177
comfy/ldm/depth_anything_3/camera.py
Normal file
@ -0,0 +1,177 @@
|
||||
"""Camera-token encoder and decoder for Depth Anything 3."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from comfy.ldm.modules.attention import optimized_attention_for_device
|
||||
from .transform import affine_inverse, extri_intri_to_pose_encoding
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------
|
||||
# Building blocks (mirror depth_anything_3.model.utils.{attention,block})
|
||||
# -----------------------------------------------------------------------
|
||||
|
||||
|
||||
class _Mlp(nn.Module):
|
||||
"""Standard 2-layer MLP with GELU. Matches upstream ``utils.attention.Mlp``."""
|
||||
|
||||
def __init__(self, in_features, hidden_features=None, out_features=None, *, device=None, dtype=None, operations=None):
|
||||
super().__init__()
|
||||
out_features = out_features or in_features
|
||||
hidden_features = hidden_features or in_features
|
||||
self.fc1 = operations.Linear(in_features, hidden_features, bias=True, device=device, dtype=dtype)
|
||||
self.fc2 = operations.Linear(hidden_features, out_features, bias=True, device=device, dtype=dtype)
|
||||
|
||||
def forward(self, x):
|
||||
return self.fc2(F.gelu(self.fc1(x)))
|
||||
|
||||
|
||||
class _LayerScale(nn.Module):
|
||||
"""Per-channel learnable scaling. Matches upstream LayerScale."""
|
||||
|
||||
def __init__(self, dim, *, device=None, dtype=None):
|
||||
super().__init__()
|
||||
self.gamma = nn.Parameter(torch.empty(dim, device=device, dtype=dtype))
|
||||
|
||||
def forward(self, x):
|
||||
return x * self.gamma.to(dtype=x.dtype, device=x.device)
|
||||
|
||||
|
||||
class _Attention(nn.Module):
|
||||
""" Self-attention with fused QKV projection. Mirrors upstream utils.attention.Attention;
|
||||
Layout matches the HF safetensors (attn.qkv.{weight,bias} and attn.proj.{weight,bias})."""
|
||||
|
||||
def __init__(self, dim, num_heads, *, device=None, dtype=None, operations=None):
|
||||
super().__init__()
|
||||
assert dim % num_heads == 0
|
||||
self.num_heads = num_heads
|
||||
self.head_dim = dim // num_heads
|
||||
self.qkv = operations.Linear(dim, dim * 3, bias=True, device=device, dtype=dtype)
|
||||
self.proj = operations.Linear(dim, dim, bias=True, device=device, dtype=dtype)
|
||||
|
||||
def forward(self, x):
|
||||
B, N, C = x.shape
|
||||
qkv = self.qkv(x).reshape(B, N, 3, C)
|
||||
q, k, v = qkv.unbind(2) # each (B, N, C)
|
||||
attn_fn = optimized_attention_for_device(x.device, small_input=True)
|
||||
out = attn_fn(q, k, v, heads=self.num_heads)
|
||||
return self.proj(out)
|
||||
|
||||
|
||||
class _Block(nn.Module):
|
||||
"""Pre-norm transformer block with LayerScale. Used by :class:CameraEnc. Layout follows upstream utils.block.Block."""
|
||||
|
||||
def __init__(self, dim, num_heads, mlp_ratio=4, init_values=0.01, *, device=None, dtype=None, operations=None):
|
||||
super().__init__()
|
||||
self.norm1 = operations.LayerNorm(dim, device=device, dtype=dtype)
|
||||
self.attn = _Attention(dim, num_heads, device=device, dtype=dtype, operations=operations)
|
||||
self.ls1 = _LayerScale(dim, device=device, dtype=dtype) if init_values else nn.Identity()
|
||||
self.norm2 = operations.LayerNorm(dim, device=device, dtype=dtype)
|
||||
self.mlp = _Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), device=device, dtype=dtype, operations=operations)
|
||||
self.ls2 = _LayerScale(dim, device=device, dtype=dtype) if init_values else nn.Identity()
|
||||
|
||||
def forward(self, x):
|
||||
x = x + self.ls1(self.attn(self.norm1(x)))
|
||||
x = x + self.ls2(self.mlp(self.norm2(x)))
|
||||
return x
|
||||
|
||||
|
||||
class CameraEnc(nn.Module):
|
||||
"""Encode per-view (extrinsics, intrinsics) into a camera token.
|
||||
|
||||
Maps a 9-D pose-encoding vector through a small MLP up to the backbone's
|
||||
``embed_dim``, then runs ``trunk_depth`` transformer blocks. The output
|
||||
has shape ``(B, S, embed_dim)`` and is injected at block ``alt_start``
|
||||
of the DINOv2 backbone in place of the cls token.
|
||||
|
||||
Parameters mirror the upstream ``cam_enc.py`` so HF weights load directly.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
dim_out: int = 1024,
|
||||
dim_in: int = 9,
|
||||
trunk_depth: int = 4,
|
||||
target_dim: int = 9,
|
||||
num_heads: int = 16,
|
||||
mlp_ratio: int = 4,
|
||||
init_values: float = 0.01,
|
||||
*,
|
||||
device=None, dtype=None, operations=None,
|
||||
**_kwargs,
|
||||
):
|
||||
super().__init__()
|
||||
self.target_dim = target_dim
|
||||
self.trunk_depth = trunk_depth
|
||||
self.trunk = nn.Sequential(*[
|
||||
_Block(dim_out, num_heads=num_heads, mlp_ratio=mlp_ratio,
|
||||
init_values=init_values,
|
||||
device=device, dtype=dtype, operations=operations)
|
||||
for _ in range(trunk_depth)
|
||||
])
|
||||
self.token_norm = operations.LayerNorm(dim_out, device=device, dtype=dtype)
|
||||
self.trunk_norm = operations.LayerNorm(dim_out, device=device, dtype=dtype)
|
||||
self.pose_branch = _Mlp(
|
||||
in_features=dim_in,
|
||||
hidden_features=dim_out // 2,
|
||||
out_features=dim_out,
|
||||
device=device, dtype=dtype, operations=operations,
|
||||
)
|
||||
|
||||
def forward(self, extrinsics: torch.Tensor, intrinsics: torch.Tensor,
|
||||
image_size_hw) -> torch.Tensor:
|
||||
"""Encode camera parameters into ``(B, S, dim_out)`` tokens."""
|
||||
c2ws = affine_inverse(extrinsics)
|
||||
pose_encoding = extri_intri_to_pose_encoding(c2ws, intrinsics, image_size_hw)
|
||||
tokens = self.pose_branch(pose_encoding.to(self.pose_branch.fc1.weight.dtype))
|
||||
tokens = self.token_norm(tokens)
|
||||
tokens = self.trunk(tokens)
|
||||
tokens = self.trunk_norm(tokens)
|
||||
return tokens
|
||||
|
||||
|
||||
class CameraDec(nn.Module):
|
||||
"""Decode the final cam token into a 9-D pose encoding.
|
||||
|
||||
Output layout: ``[T(3), quat_xyzw(4), fov_h, fov_w]``. The translation is
|
||||
always predicted by the network; the quaternion and FoV can either be
|
||||
predicted or supplied via ``camera_encoding`` (used at training time
|
||||
when GT cameras are available -- not exercised at inference here).
|
||||
|
||||
Parameters mirror the upstream ``cam_dec.py`` so HF weights load directly.
|
||||
"""
|
||||
|
||||
def __init__(self, dim_in: int = 1536,
|
||||
*, device=None, dtype=None, operations=None, **_kwargs):
|
||||
super().__init__()
|
||||
d = dim_in
|
||||
self.backbone = nn.Sequential(
|
||||
operations.Linear(d, d, device=device, dtype=dtype),
|
||||
nn.ReLU(),
|
||||
operations.Linear(d, d, device=device, dtype=dtype),
|
||||
nn.ReLU(),
|
||||
)
|
||||
self.fc_t = operations.Linear(d, 3, device=device, dtype=dtype)
|
||||
self.fc_qvec = operations.Linear(d, 4, device=device, dtype=dtype)
|
||||
self.fc_fov = nn.Sequential(
|
||||
operations.Linear(d, 2, device=device, dtype=dtype),
|
||||
nn.ReLU(),
|
||||
)
|
||||
|
||||
def forward(self, feat: torch.Tensor,
|
||||
camera_encoding: "torch.Tensor | None" = None) -> torch.Tensor:
|
||||
"""Decode ``(B, N, dim_in)`` cam tokens into ``(B, N, 9)`` pose enc."""
|
||||
B, N = feat.shape[:2]
|
||||
feat = feat.reshape(B * N, -1)
|
||||
feat = self.backbone(feat)
|
||||
out_t = self.fc_t(feat.float()).reshape(B, N, 3)
|
||||
if camera_encoding is None:
|
||||
out_qvec = self.fc_qvec(feat.float()).reshape(B, N, 4)
|
||||
out_fov = self.fc_fov(feat.float()).reshape(B, N, 2)
|
||||
else:
|
||||
out_qvec = camera_encoding[..., 3:7]
|
||||
out_fov = camera_encoding[..., -2:]
|
||||
return torch.cat([out_t, out_qvec, out_fov], dim=-1)
|
||||
489
comfy/ldm/depth_anything_3/dpt.py
Normal file
489
comfy/ldm/depth_anything_3/dpt.py
Normal file
@ -0,0 +1,489 @@
|
||||
"""DPT / DualDPT heads for Depth Anything 3."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import List, Optional, Sequence, Tuple
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
class Permute(nn.Module):
|
||||
def __init__(self, dims: Tuple[int, ...]):
|
||||
super().__init__()
|
||||
self.dims = dims
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
return x.permute(*self.dims)
|
||||
|
||||
|
||||
def _custom_interpolate(
|
||||
x: torch.Tensor,
|
||||
size: Optional[Tuple[int, int]] = None,
|
||||
scale_factor: Optional[float] = None,
|
||||
mode: str = "bilinear",
|
||||
align_corners: bool = True,
|
||||
) -> torch.Tensor:
|
||||
if size is None:
|
||||
assert scale_factor is not None
|
||||
size = (int(x.shape[-2] * scale_factor), int(x.shape[-1] * scale_factor))
|
||||
INT_MAX = 1610612736
|
||||
total = size[0] * size[1] * x.shape[0] * x.shape[1]
|
||||
if total > INT_MAX:
|
||||
chunks = torch.chunk(x, chunks=(total // INT_MAX) + 1, dim=0)
|
||||
outs = [F.interpolate(c, size=size, mode=mode, align_corners=align_corners) for c in chunks]
|
||||
return torch.cat(outs, dim=0).contiguous()
|
||||
return F.interpolate(x, size=size, mode=mode, align_corners=align_corners)
|
||||
|
||||
|
||||
def _create_uv_grid(width: int, height: int, aspect_ratio: float, dtype, device) -> torch.Tensor:
|
||||
"""Normalised UV grid spanning (-x_span, -y_span)..(x_span, y_span)."""
|
||||
diag_factor = (aspect_ratio ** 2 + 1.0) ** 0.5
|
||||
span_x = aspect_ratio / diag_factor
|
||||
span_y = 1.0 / diag_factor
|
||||
left_x = -span_x * (width - 1) / width
|
||||
right_x = span_x * (width - 1) / width
|
||||
top_y = -span_y * (height - 1) / height
|
||||
bottom_y = span_y * (height - 1) / height
|
||||
x_coords = torch.linspace(left_x, right_x, steps=width, dtype=dtype, device=device)
|
||||
y_coords = torch.linspace(top_y, bottom_y, steps=height, dtype=dtype, device=device)
|
||||
uu, vv = torch.meshgrid(x_coords, y_coords, indexing="xy")
|
||||
return torch.stack((uu, vv), dim=-1) # (H, W, 2)
|
||||
|
||||
|
||||
def _make_sincos_pos_embed(embed_dim: int, pos: torch.Tensor, omega_0: float = 100.0) -> torch.Tensor:
|
||||
omega = torch.arange(embed_dim // 2, dtype=torch.float32, device=pos.device)
|
||||
omega = 1.0 / omega_0 ** (omega / (embed_dim / 2.0))
|
||||
pos = pos.reshape(-1)
|
||||
out = torch.einsum("m,d->md", pos, omega)
|
||||
return torch.cat([out.sin(), out.cos()], dim=1).float()
|
||||
|
||||
|
||||
def _position_grid_to_embed(pos_grid: torch.Tensor, embed_dim: int, omega_0: float = 100.0) -> torch.Tensor:
|
||||
H, W, _ = pos_grid.shape
|
||||
pos_flat = pos_grid.reshape(-1, 2)
|
||||
emb_x = _make_sincos_pos_embed(embed_dim // 2, pos_flat[:, 0], omega_0=omega_0)
|
||||
emb_y = _make_sincos_pos_embed(embed_dim // 2, pos_flat[:, 1], omega_0=omega_0)
|
||||
emb = torch.cat([emb_x, emb_y], dim=-1)
|
||||
return emb.view(H, W, embed_dim)
|
||||
|
||||
|
||||
def _add_pos_embed(x: torch.Tensor, W: int, H: int, ratio: float = 0.1) -> torch.Tensor:
|
||||
"""Stateless UV positional embedding added to a feature map (B, C, h, w)."""
|
||||
pw, ph = x.shape[-1], x.shape[-2]
|
||||
pe = _create_uv_grid(pw, ph, aspect_ratio=W / H, dtype=x.dtype, device=x.device)
|
||||
pe = _position_grid_to_embed(pe, x.shape[1]) * ratio
|
||||
pe = pe.permute(2, 0, 1)[None].expand(x.shape[0], -1, -1, -1).to(dtype=x.dtype)
|
||||
return x + pe
|
||||
|
||||
|
||||
def _apply_activation(x: torch.Tensor, activation: str) -> torch.Tensor:
|
||||
act = (activation or "linear").lower()
|
||||
if act == "exp":
|
||||
return torch.exp(x)
|
||||
if act == "expp1":
|
||||
return torch.exp(x) + 1
|
||||
if act == "expm1":
|
||||
return torch.expm1(x)
|
||||
if act == "relu":
|
||||
return torch.relu(x)
|
||||
if act == "sigmoid":
|
||||
return torch.sigmoid(x)
|
||||
if act == "softplus":
|
||||
return F.softplus(x)
|
||||
if act == "tanh":
|
||||
return torch.tanh(x)
|
||||
return x
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Fusion building blocks
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class ResidualConvUnit(nn.Module):
|
||||
def __init__(self, features: int, device=None, dtype=None, operations=None):
|
||||
super().__init__()
|
||||
self.conv1 = operations.Conv2d(features, features, 3, 1, 1, bias=True, device=device, dtype=dtype)
|
||||
self.conv2 = operations.Conv2d(features, features, 3, 1, 1, bias=True, device=device, dtype=dtype)
|
||||
self.activation = nn.ReLU(inplace=False)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
out = self.activation(x)
|
||||
out = self.conv1(out)
|
||||
out = self.activation(out)
|
||||
out = self.conv2(out)
|
||||
return out + x
|
||||
|
||||
|
||||
class FeatureFusionBlock(nn.Module):
|
||||
def __init__(self, features: int, has_residual: bool = True, align_corners: bool = True, device=None, dtype=None, operations=None):
|
||||
super().__init__()
|
||||
self.align_corners = align_corners
|
||||
self.has_residual = has_residual
|
||||
if has_residual:
|
||||
self.resConfUnit1 = ResidualConvUnit(features, device=device, dtype=dtype, operations=operations)
|
||||
else:
|
||||
self.resConfUnit1 = None
|
||||
self.resConfUnit2 = ResidualConvUnit(features, device=device, dtype=dtype, operations=operations)
|
||||
self.out_conv = operations.Conv2d(features, features, 1, 1, 0, bias=True, device=device, dtype=dtype)
|
||||
|
||||
def forward(self, *xs: torch.Tensor, size: Optional[Tuple[int, int]] = None) -> torch.Tensor:
|
||||
y = xs[0]
|
||||
if self.has_residual and len(xs) > 1 and self.resConfUnit1 is not None:
|
||||
y = y + self.resConfUnit1(xs[1])
|
||||
y = self.resConfUnit2(y)
|
||||
if size is None:
|
||||
up_kwargs = {"scale_factor": 2.0}
|
||||
else:
|
||||
up_kwargs = {"size": size}
|
||||
y = _custom_interpolate(y, **up_kwargs, mode="bilinear", align_corners=self.align_corners)
|
||||
y = self.out_conv(y)
|
||||
return y
|
||||
|
||||
|
||||
class _Scratch(nn.Module):
|
||||
"""Container that mirrors upstream ``scratch`` attribute layout."""
|
||||
|
||||
|
||||
def _make_scratch(in_shape: List[int], out_shape: int, device=None, dtype=None, operations=None) -> _Scratch:
|
||||
scratch = _Scratch()
|
||||
scratch.layer1_rn = operations.Conv2d(in_shape[0], out_shape, 3, 1, 1, bias=False, device=device, dtype=dtype)
|
||||
scratch.layer2_rn = operations.Conv2d(in_shape[1], out_shape, 3, 1, 1, bias=False, device=device, dtype=dtype)
|
||||
scratch.layer3_rn = operations.Conv2d(in_shape[2], out_shape, 3, 1, 1, bias=False, device=device, dtype=dtype)
|
||||
scratch.layer4_rn = operations.Conv2d(in_shape[3], out_shape, 3, 1, 1, bias=False, device=device, dtype=dtype)
|
||||
return scratch
|
||||
|
||||
|
||||
def _make_fusion_block(features: int, has_residual: bool = True, device=None, dtype=None, operations=None) -> FeatureFusionBlock:
|
||||
return FeatureFusionBlock(features, has_residual=has_residual, align_corners=True, device=device, dtype=dtype, operations=operations)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# DPT (single head + optional sky head) -- used by DA3Mono/Metric
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class DPT(nn.Module):
|
||||
"""Single-head DPT used by DA3Mono-Large and DA3Metric-Large."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
dim_in: int,
|
||||
patch_size: int = 14,
|
||||
output_dim: int = 1,
|
||||
activation: str = "exp",
|
||||
conf_activation: str = "expp1",
|
||||
features: int = 256,
|
||||
out_channels: Sequence[int] = (256, 512, 1024, 1024),
|
||||
pos_embed: bool = False,
|
||||
down_ratio: int = 1,
|
||||
head_name: str = "depth",
|
||||
use_sky_head: bool = True,
|
||||
sky_name: str = "sky",
|
||||
sky_activation: str = "relu",
|
||||
norm_type: str = "idt",
|
||||
device=None, dtype=None, operations=None,
|
||||
):
|
||||
super().__init__()
|
||||
self.patch_size = patch_size
|
||||
self.activation = activation
|
||||
self.conf_activation = conf_activation
|
||||
self.pos_embed = pos_embed
|
||||
self.down_ratio = down_ratio
|
||||
self.head_main = head_name
|
||||
self.sky_name = sky_name
|
||||
self.out_dim = output_dim
|
||||
self.has_conf = output_dim > 1
|
||||
self.use_sky_head = use_sky_head
|
||||
self.sky_activation = sky_activation
|
||||
self.intermediate_layer_idx: Tuple[int, int, int, int] = (0, 1, 2, 3)
|
||||
|
||||
if norm_type == "layer":
|
||||
self.norm = operations.LayerNorm(dim_in, device=device, dtype=dtype)
|
||||
else:
|
||||
self.norm = nn.Identity()
|
||||
|
||||
out_channels = list(out_channels)
|
||||
self.projects = nn.ModuleList([
|
||||
operations.Conv2d(dim_in, oc, kernel_size=1, stride=1, padding=0, device=device, dtype=dtype)
|
||||
for oc in out_channels
|
||||
])
|
||||
self.resize_layers = nn.ModuleList([
|
||||
operations.ConvTranspose2d(out_channels[0], out_channels[0], kernel_size=4, stride=4, padding=0, device=device, dtype=dtype),
|
||||
operations.ConvTranspose2d(out_channels[1], out_channels[1], kernel_size=2, stride=2, padding=0, device=device, dtype=dtype),
|
||||
nn.Identity(),
|
||||
operations.Conv2d(out_channels[3], out_channels[3], kernel_size=3, stride=2, padding=1, device=device, dtype=dtype),
|
||||
])
|
||||
|
||||
self.scratch = _make_scratch(out_channels, features, device=device, dtype=dtype, operations=operations)
|
||||
self.scratch.refinenet1 = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
|
||||
self.scratch.refinenet2 = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
|
||||
self.scratch.refinenet3 = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
|
||||
self.scratch.refinenet4 = _make_fusion_block(features, has_residual=False, device=device, dtype=dtype, operations=operations)
|
||||
|
||||
head_features_1 = features
|
||||
head_features_2 = 32
|
||||
self.scratch.output_conv1 = operations.Conv2d(
|
||||
head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1,
|
||||
device=device, dtype=dtype,
|
||||
)
|
||||
self.scratch.output_conv2 = nn.Sequential(
|
||||
operations.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1, device=device, dtype=dtype),
|
||||
nn.ReLU(inplace=False),
|
||||
operations.Conv2d(head_features_2, output_dim, kernel_size=1, stride=1, padding=0, device=device, dtype=dtype),
|
||||
)
|
||||
|
||||
if self.use_sky_head:
|
||||
self.scratch.sky_output_conv2 = nn.Sequential(
|
||||
operations.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1, device=device, dtype=dtype),
|
||||
nn.ReLU(inplace=False),
|
||||
operations.Conv2d(head_features_2, 1, kernel_size=1, stride=1, padding=0, device=device, dtype=dtype),
|
||||
)
|
||||
|
||||
def forward(self, feats: List[torch.Tensor], H: int, W: int, patch_start_idx: int = 0, **_kwargs) -> dict:
|
||||
# feats[i][0] is the patch-token tensor with shape (B, S, N_patch, C)
|
||||
B, S, N, C = feats[0][0].shape
|
||||
feats_flat = [feat[0].reshape(B * S, N, C) for feat in feats]
|
||||
|
||||
ph, pw = H // self.patch_size, W // self.patch_size
|
||||
resized = []
|
||||
for stage_idx, take_idx in enumerate(self.intermediate_layer_idx):
|
||||
x = feats_flat[take_idx][:, patch_start_idx:]
|
||||
x = self.norm(x)
|
||||
x = x.permute(0, 2, 1).contiguous().reshape(B * S, C, ph, pw)
|
||||
x = self.projects[stage_idx](x)
|
||||
if self.pos_embed:
|
||||
x = _add_pos_embed(x, W, H)
|
||||
x = self.resize_layers[stage_idx](x)
|
||||
resized.append(x)
|
||||
|
||||
l1_rn = self.scratch.layer1_rn(resized[0])
|
||||
l2_rn = self.scratch.layer2_rn(resized[1])
|
||||
l3_rn = self.scratch.layer3_rn(resized[2])
|
||||
l4_rn = self.scratch.layer4_rn(resized[3])
|
||||
|
||||
out = self.scratch.refinenet4(l4_rn, size=l3_rn.shape[2:])
|
||||
out = self.scratch.refinenet3(out, l3_rn, size=l2_rn.shape[2:])
|
||||
out = self.scratch.refinenet2(out, l2_rn, size=l1_rn.shape[2:])
|
||||
out = self.scratch.refinenet1(out, l1_rn)
|
||||
|
||||
h_out = int(ph * self.patch_size / self.down_ratio)
|
||||
w_out = int(pw * self.patch_size / self.down_ratio)
|
||||
|
||||
fused = self.scratch.output_conv1(out)
|
||||
fused = _custom_interpolate(fused, (h_out, w_out), mode="bilinear", align_corners=True)
|
||||
if self.pos_embed:
|
||||
fused = _add_pos_embed(fused, W, H)
|
||||
feat = fused
|
||||
|
||||
main_logits = self.scratch.output_conv2(feat)
|
||||
outs = {}
|
||||
if self.has_conf:
|
||||
fmap = main_logits.permute(0, 2, 3, 1)
|
||||
pred = _apply_activation(fmap[..., :-1], self.activation)
|
||||
conf = _apply_activation(fmap[..., -1], self.conf_activation)
|
||||
outs[self.head_main] = pred.squeeze(-1).view(B, S, *pred.shape[1:-1])
|
||||
outs[f"{self.head_main}_conf"] = conf.view(B, S, *conf.shape[1:])
|
||||
else:
|
||||
pred = _apply_activation(main_logits, self.activation)
|
||||
outs[self.head_main] = pred.squeeze(1).view(B, S, *pred.shape[2:])
|
||||
|
||||
if self.use_sky_head:
|
||||
sky_logits = self.scratch.sky_output_conv2(feat)
|
||||
if self.sky_activation.lower() == "sigmoid":
|
||||
sky = torch.sigmoid(sky_logits)
|
||||
elif self.sky_activation.lower() == "relu":
|
||||
sky = F.relu(sky_logits)
|
||||
else:
|
||||
sky = sky_logits
|
||||
outs[self.sky_name] = sky.squeeze(1).view(B, S, *sky.shape[2:])
|
||||
|
||||
return outs
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# DualDPT (depth + auxiliary "ray" head) -- used by DA3-Small / DA3-Base
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class DualDPT(nn.Module):
|
||||
"""Two-head DPT used by DA3-Small / DA3-Base."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
dim_in: int,
|
||||
patch_size: int = 14,
|
||||
output_dim: int = 2,
|
||||
activation: str = "exp",
|
||||
conf_activation: str = "expp1",
|
||||
features: int = 256,
|
||||
out_channels: Sequence[int] = (256, 512, 1024, 1024),
|
||||
pos_embed: bool = True,
|
||||
down_ratio: int = 1,
|
||||
aux_pyramid_levels: int = 4,
|
||||
aux_out1_conv_num: int = 5,
|
||||
head_names: Tuple[str, str] = ("depth", "ray"),
|
||||
device=None, dtype=None, operations=None,
|
||||
):
|
||||
super().__init__()
|
||||
self.patch_size = patch_size
|
||||
self.activation = activation
|
||||
self.conf_activation = conf_activation
|
||||
self.pos_embed = pos_embed
|
||||
self.down_ratio = down_ratio
|
||||
self.aux_levels = aux_pyramid_levels
|
||||
self.aux_out1_conv_num = aux_out1_conv_num
|
||||
self.head_main, self.head_aux = head_names
|
||||
self.intermediate_layer_idx: Tuple[int, int, int, int] = (0, 1, 2, 3)
|
||||
# Toggle the auxiliary ray branch at runtime. Default off (mono path).
|
||||
# DepthAnything3Net flips this on when running multi-view + ray-pose.
|
||||
self.enable_aux: bool = False
|
||||
|
||||
self.norm = operations.LayerNorm(dim_in, device=device, dtype=dtype)
|
||||
out_channels = list(out_channels)
|
||||
self.projects = nn.ModuleList([
|
||||
operations.Conv2d(dim_in, oc, kernel_size=1, stride=1, padding=0, device=device, dtype=dtype)
|
||||
for oc in out_channels
|
||||
])
|
||||
self.resize_layers = nn.ModuleList([
|
||||
operations.ConvTranspose2d(out_channels[0], out_channels[0], kernel_size=4, stride=4, padding=0, device=device, dtype=dtype),
|
||||
operations.ConvTranspose2d(out_channels[1], out_channels[1], kernel_size=2, stride=2, padding=0, device=device, dtype=dtype),
|
||||
nn.Identity(),
|
||||
operations.Conv2d(out_channels[3], out_channels[3], kernel_size=3, stride=2, padding=1, device=device, dtype=dtype),
|
||||
])
|
||||
|
||||
self.scratch = _make_scratch(out_channels, features, device=device, dtype=dtype, operations=operations)
|
||||
# Main fusion chain
|
||||
self.scratch.refinenet1 = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
|
||||
self.scratch.refinenet2 = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
|
||||
self.scratch.refinenet3 = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
|
||||
self.scratch.refinenet4 = _make_fusion_block(features, has_residual=False, device=device, dtype=dtype, operations=operations)
|
||||
# Auxiliary fusion chain (separate copies)
|
||||
self.scratch.refinenet1_aux = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
|
||||
self.scratch.refinenet2_aux = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
|
||||
self.scratch.refinenet3_aux = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
|
||||
self.scratch.refinenet4_aux = _make_fusion_block(features, has_residual=False, device=device, dtype=dtype, operations=operations)
|
||||
|
||||
head_features_1 = features
|
||||
head_features_2 = 32
|
||||
|
||||
# Main head neck + final projection
|
||||
self.scratch.output_conv1 = operations.Conv2d(
|
||||
head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1,
|
||||
device=device, dtype=dtype,
|
||||
)
|
||||
self.scratch.output_conv2 = nn.Sequential(
|
||||
operations.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1, device=device, dtype=dtype),
|
||||
nn.ReLU(inplace=False),
|
||||
operations.Conv2d(head_features_2, output_dim, kernel_size=1, stride=1, padding=0, device=device, dtype=dtype),
|
||||
)
|
||||
|
||||
# Aux pre-head per level (multi-level pyramid)
|
||||
self.scratch.output_conv1_aux = nn.ModuleList([
|
||||
self._make_aux_out1_block(head_features_1, device=device, dtype=dtype, operations=operations)
|
||||
for _ in range(self.aux_levels)
|
||||
])
|
||||
|
||||
# Aux final projection per level (includes LayerNorm permute path).
|
||||
ln_seq = [Permute((0, 2, 3, 1)),
|
||||
operations.LayerNorm(head_features_2, device=device, dtype=dtype),
|
||||
Permute((0, 3, 1, 2))]
|
||||
self.scratch.output_conv2_aux = nn.ModuleList([
|
||||
nn.Sequential(
|
||||
operations.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1, device=device, dtype=dtype),
|
||||
*ln_seq,
|
||||
nn.ReLU(inplace=False),
|
||||
operations.Conv2d(head_features_2, 7, kernel_size=1, stride=1, padding=0, device=device, dtype=dtype),
|
||||
)
|
||||
for _ in range(self.aux_levels)
|
||||
])
|
||||
|
||||
@staticmethod
|
||||
def _make_aux_out1_block(in_ch: int, *, device=None, dtype=None, operations=None) -> nn.Sequential:
|
||||
# aux_out1_conv_num=5 in all Apache-2.0 variants.
|
||||
return nn.Sequential(
|
||||
operations.Conv2d(in_ch, in_ch // 2, 3, 1, 1, device=device, dtype=dtype),
|
||||
operations.Conv2d(in_ch // 2, in_ch, 3, 1, 1, device=device, dtype=dtype),
|
||||
operations.Conv2d(in_ch, in_ch // 2, 3, 1, 1, device=device, dtype=dtype),
|
||||
operations.Conv2d(in_ch // 2, in_ch, 3, 1, 1, device=device, dtype=dtype),
|
||||
operations.Conv2d(in_ch, in_ch // 2, 3, 1, 1, device=device, dtype=dtype),
|
||||
)
|
||||
|
||||
def forward(self, feats: List[torch.Tensor], H: int, W: int, patch_start_idx: int = 0, **_kwargs) -> dict:
|
||||
B, S, N, C = feats[0][0].shape
|
||||
feats_flat = [feat[0].reshape(B * S, N, C) for feat in feats]
|
||||
|
||||
ph, pw = H // self.patch_size, W // self.patch_size
|
||||
resized = []
|
||||
for stage_idx, take_idx in enumerate(self.intermediate_layer_idx):
|
||||
x = feats_flat[take_idx][:, patch_start_idx:]
|
||||
x = self.norm(x)
|
||||
x = x.permute(0, 2, 1).contiguous().reshape(B * S, C, ph, pw)
|
||||
x = self.projects[stage_idx](x)
|
||||
if self.pos_embed:
|
||||
x = _add_pos_embed(x, W, H)
|
||||
x = self.resize_layers[stage_idx](x)
|
||||
resized.append(x)
|
||||
|
||||
l1_rn = self.scratch.layer1_rn(resized[0])
|
||||
l2_rn = self.scratch.layer2_rn(resized[1])
|
||||
l3_rn = self.scratch.layer3_rn(resized[2])
|
||||
l4_rn = self.scratch.layer4_rn(resized[3])
|
||||
|
||||
# Main pyramid (output_conv1 is applied inside the upstream `_fuse`,
|
||||
# before interpolation -- replicate that order here).
|
||||
m = self.scratch.refinenet4(l4_rn, size=l3_rn.shape[2:])
|
||||
if self.enable_aux:
|
||||
a4 = self.scratch.refinenet4_aux(l4_rn, size=l3_rn.shape[2:])
|
||||
aux_pyr = [a4]
|
||||
m = self.scratch.refinenet3(m, l3_rn, size=l2_rn.shape[2:])
|
||||
if self.enable_aux:
|
||||
aux_pyr.append(self.scratch.refinenet3_aux(aux_pyr[-1], l3_rn, size=l2_rn.shape[2:]))
|
||||
m = self.scratch.refinenet2(m, l2_rn, size=l1_rn.shape[2:])
|
||||
if self.enable_aux:
|
||||
aux_pyr.append(self.scratch.refinenet2_aux(aux_pyr[-1], l2_rn, size=l1_rn.shape[2:]))
|
||||
m = self.scratch.refinenet1(m, l1_rn)
|
||||
if self.enable_aux:
|
||||
aux_pyr.append(self.scratch.refinenet1_aux(aux_pyr[-1], l1_rn))
|
||||
m = self.scratch.output_conv1(m)
|
||||
|
||||
h_out = int(ph * self.patch_size / self.down_ratio)
|
||||
w_out = int(pw * self.patch_size / self.down_ratio)
|
||||
|
||||
m = _custom_interpolate(m, (h_out, w_out), mode="bilinear", align_corners=True)
|
||||
if self.pos_embed:
|
||||
m = _add_pos_embed(m, W, H)
|
||||
main_logits = self.scratch.output_conv2(m)
|
||||
fmap = main_logits.permute(0, 2, 3, 1)
|
||||
depth_pred = _apply_activation(fmap[..., :-1], self.activation)
|
||||
depth_conf = _apply_activation(fmap[..., -1], self.conf_activation)
|
||||
|
||||
outs = {
|
||||
self.head_main: depth_pred.squeeze(-1).view(B, S, *depth_pred.shape[1:-1]),
|
||||
f"{self.head_main}_conf": depth_conf.view(B, S, *depth_conf.shape[1:]),
|
||||
}
|
||||
|
||||
if self.enable_aux:
|
||||
# Auxiliary "ray" head (multi-level inside) -- only the last level
|
||||
# is returned. Mirrors upstream ``DualDPT._fuse`` + ``_forward_impl``:
|
||||
# each aux pyramid level goes through ``output_conv1_aux[i]``
|
||||
# (5-layer conv stack that ends at ``features // 2`` channels),
|
||||
# then the last level optionally gets a pos-embed and finally
|
||||
# ``output_conv2_aux[-1]``.
|
||||
aux_processed = [
|
||||
self.scratch.output_conv1_aux[i](a) for i, a in enumerate(aux_pyr)
|
||||
]
|
||||
last_aux = aux_processed[-1]
|
||||
if self.pos_embed:
|
||||
last_aux = _add_pos_embed(last_aux, W, H)
|
||||
last_aux_logits = self.scratch.output_conv2_aux[-1](last_aux)
|
||||
fmap_last = last_aux_logits.permute(0, 2, 3, 1)
|
||||
# Channels: [ray(6), ray_conf(1)]; ray uses 'linear' activation.
|
||||
aux_pred = fmap_last[..., :-1]
|
||||
aux_conf = _apply_activation(fmap_last[..., -1], self.conf_activation)
|
||||
outs[self.head_aux] = aux_pred.view(B, S, *aux_pred.shape[1:])
|
||||
outs[f"{self.head_aux}_conf"] = aux_conf.view(B, S, *aux_conf.shape[1:])
|
||||
|
||||
return outs
|
||||
236
comfy/ldm/depth_anything_3/model.py
Normal file
236
comfy/ldm/depth_anything_3/model.py
Normal file
@ -0,0 +1,236 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Dict, Optional, Sequence
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from comfy.image_encoders.dino2 import Dinov2Model
|
||||
|
||||
from .camera import CameraDec, CameraEnc
|
||||
from .dpt import DPT, DualDPT
|
||||
from .ray_pose import get_extrinsic_from_camray
|
||||
from .transform import affine_inverse, pose_encoding_to_extri_intri
|
||||
|
||||
|
||||
_HEAD_REGISTRY = {
|
||||
"dpt": DPT,
|
||||
"dualdpt": DualDPT,
|
||||
}
|
||||
|
||||
|
||||
# Backbone presets (mirror the upstream DINOv2 ViT variants).
|
||||
_BACKBONE_PRESETS = {
|
||||
"vits": dict(hidden_size=384, num_hidden_layers=12, num_attention_heads=6, use_swiglu_ffn=False),
|
||||
"vitb": dict(hidden_size=768, num_hidden_layers=12, num_attention_heads=12, use_swiglu_ffn=False),
|
||||
"vitl": dict(hidden_size=1024, num_hidden_layers=24, num_attention_heads=16, use_swiglu_ffn=False),
|
||||
"vitg": dict(hidden_size=1536, num_hidden_layers=40, num_attention_heads=24, use_swiglu_ffn=True),
|
||||
}
|
||||
|
||||
|
||||
def _build_backbone_config(
|
||||
backbone_name: str,
|
||||
*,
|
||||
alt_start: int,
|
||||
qknorm_start: int,
|
||||
rope_start: int,
|
||||
cat_token: bool,
|
||||
) -> dict:
|
||||
if backbone_name not in _BACKBONE_PRESETS:
|
||||
raise ValueError(f"Unknown DINOv2 backbone variant: {backbone_name!r}")
|
||||
cfg = dict(_BACKBONE_PRESETS[backbone_name])
|
||||
cfg.update(dict(
|
||||
layer_norm_eps=1e-6,
|
||||
patch_size=14,
|
||||
image_size=518,
|
||||
# No mask_token in DA3 weights; omit param to avoid load warnings.
|
||||
use_mask_token=False,
|
||||
alt_start=alt_start,
|
||||
qknorm_start=qknorm_start,
|
||||
rope_start=rope_start,
|
||||
cat_token=cat_token,
|
||||
rope_freq=100.0,
|
||||
))
|
||||
return cfg
|
||||
|
||||
|
||||
class DepthAnything3Net(nn.Module):
|
||||
|
||||
PATCH_SIZE = 14
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
# --- Backbone ---
|
||||
backbone_name: str = "vitl",
|
||||
out_layers: Sequence[int] = (4, 11, 17, 23),
|
||||
alt_start: int = -1,
|
||||
qknorm_start: int = -1,
|
||||
rope_start: int = -1,
|
||||
cat_token: bool = False,
|
||||
# --- Head ---
|
||||
head_type: str = "dpt", # dpt or dualdpt
|
||||
head_dim_in: int = 1024,
|
||||
head_output_dim: int = 1, # 1 = depth only, 2 = depth+conf
|
||||
head_features: int = 256,
|
||||
head_out_channels: Sequence[int] = (256, 512, 1024, 1024),
|
||||
head_use_sky_head: bool = True, # ignored by DualDPT
|
||||
head_pos_embed: Optional[bool] = None, # default: True for DualDPT, False for DPT
|
||||
# --- Camera (multi-view) ---
|
||||
has_cam_enc: bool = False,
|
||||
has_cam_dec: bool = False,
|
||||
cam_dim_out: Optional[int] = None, # CameraEnc dim_out (defaults to embed_dim)
|
||||
cam_dec_dim_in: Optional[int] = None, # CameraDec dim_in (defaults to 2*embed_dim with cat_token)
|
||||
# ComfyUI plumbing
|
||||
device=None, dtype=None, operations=None,
|
||||
**_ignored,
|
||||
):
|
||||
super().__init__()
|
||||
head_cls = _HEAD_REGISTRY[head_type.lower()]
|
||||
self.head_type = head_type.lower()
|
||||
self.has_sky = (self.head_type == "dpt") and head_use_sky_head
|
||||
self.has_conf = head_output_dim > 1
|
||||
self.out_layers = list(out_layers)
|
||||
|
||||
backbone_cfg = _build_backbone_config(
|
||||
backbone_name,
|
||||
alt_start=alt_start,
|
||||
qknorm_start=qknorm_start,
|
||||
rope_start=rope_start,
|
||||
cat_token=cat_token,
|
||||
)
|
||||
self.backbone = Dinov2Model(backbone_cfg, dtype, device, operations)
|
||||
|
||||
head_kwargs = dict(
|
||||
dim_in=head_dim_in,
|
||||
patch_size=self.PATCH_SIZE,
|
||||
output_dim=head_output_dim,
|
||||
features=head_features,
|
||||
out_channels=tuple(head_out_channels),
|
||||
device=device, dtype=dtype, operations=operations,
|
||||
)
|
||||
if self.head_type == "dpt":
|
||||
head_kwargs.update(
|
||||
use_sky_head=head_use_sky_head,
|
||||
pos_embed=(False if head_pos_embed is None else head_pos_embed),
|
||||
)
|
||||
else: # dualdpt
|
||||
head_kwargs.update(
|
||||
pos_embed=(True if head_pos_embed is None else head_pos_embed),
|
||||
)
|
||||
self.head = head_cls(**head_kwargs)
|
||||
|
||||
# Built only if checkpoint has weights; cam_enc output dim == embed_dim.
|
||||
embed_dim = backbone_cfg["hidden_size"]
|
||||
if has_cam_enc:
|
||||
self.cam_enc = CameraEnc(
|
||||
dim_out=cam_dim_out if cam_dim_out is not None else embed_dim,
|
||||
num_heads=max(1, embed_dim // 64),
|
||||
device=device, dtype=dtype, operations=operations,
|
||||
)
|
||||
else:
|
||||
self.cam_enc = None
|
||||
if has_cam_dec:
|
||||
default_dim = embed_dim * (2 if cat_token else 1)
|
||||
self.cam_dec = CameraDec(
|
||||
dim_in=cam_dec_dim_in if cam_dec_dim_in is not None else default_dim,
|
||||
device=device, dtype=dtype, operations=operations,
|
||||
)
|
||||
else:
|
||||
self.cam_dec = None
|
||||
|
||||
self.dtype = dtype
|
||||
|
||||
def forward(
|
||||
self,
|
||||
image: torch.Tensor,
|
||||
extrinsics: Optional[torch.Tensor] = None,
|
||||
intrinsics: Optional[torch.Tensor] = None,
|
||||
*,
|
||||
use_ray_pose: bool = False,
|
||||
ref_view_strategy: str = "saddle_balanced",
|
||||
export_feat_layers: Optional[Sequence[int]] = None,
|
||||
**_unused,
|
||||
) -> Dict[str, torch.Tensor]:
|
||||
"""Run depth and optionally pose prediction."""
|
||||
if image.ndim == 4:
|
||||
image = image.unsqueeze(1) # (B, 1, 3, H, W)
|
||||
assert image.ndim == 5 and image.shape[2] == 3, \
|
||||
f"image must be (B,3,H,W) or (B,S,3,H,W); got {tuple(image.shape)}"
|
||||
|
||||
B, S, _, H, W = image.shape
|
||||
assert H % self.PATCH_SIZE == 0 and W % self.PATCH_SIZE == 0, \
|
||||
f"image H,W must be multiples of {self.PATCH_SIZE}; got {(H, W)}"
|
||||
|
||||
# Camera-token preparation (multi-view path).
|
||||
cam_token = None
|
||||
if extrinsics is not None and intrinsics is not None and self.cam_enc is not None:
|
||||
cam_token = self.cam_enc(extrinsics, intrinsics, (H, W))
|
||||
|
||||
# Toggle aux ray output on/off depending on what the caller asked for.
|
||||
if isinstance(self.head, DualDPT):
|
||||
self.head.enable_aux = bool(use_ray_pose)
|
||||
|
||||
feats, aux_feats = self.backbone.get_intermediate_layers_da3(
|
||||
image, self.out_layers, cam_token=cam_token,
|
||||
ref_view_strategy=ref_view_strategy,
|
||||
export_feat_layers=export_feat_layers,
|
||||
)
|
||||
head_out = self.head(feats, H=H, W=W, patch_start_idx=0)
|
||||
|
||||
# Pose prediction.
|
||||
out: Dict[str, torch.Tensor] = {}
|
||||
if use_ray_pose and "ray" in head_out and "ray_conf" in head_out:
|
||||
ray = head_out["ray"]
|
||||
ray_conf = head_out["ray_conf"]
|
||||
extr_c2w, focal, pp = get_extrinsic_from_camray(
|
||||
ray, ray_conf, ray.shape[-3], ray.shape[-2],
|
||||
)
|
||||
# Match the upstream output: w2c, drop the homogeneous row.
|
||||
extr_w2c = affine_inverse(extr_c2w)[:, :, :3, :]
|
||||
# Build pixel-space intrinsics from the normalised focal/pp output.
|
||||
intr = torch.eye(3, device=ray.device, dtype=ray.dtype)
|
||||
intr = intr[None, None].expand(extr_c2w.shape[0], extr_c2w.shape[1], 3, 3).clone()
|
||||
intr[:, :, 0, 0] = focal[:, :, 0] / 2 * W
|
||||
intr[:, :, 1, 1] = focal[:, :, 1] / 2 * H
|
||||
intr[:, :, 0, 2] = pp[:, :, 0] * W * 0.5
|
||||
intr[:, :, 1, 2] = pp[:, :, 1] * H * 0.5
|
||||
out["extrinsics"] = extr_w2c
|
||||
out["intrinsics"] = intr
|
||||
elif self.cam_dec is not None and S > 1:
|
||||
# Decode the cam-token of the final out_layer into a pose encoding.
|
||||
cam_feat = feats[-1][1] # (B, S, dim_in_to_cam_dec)
|
||||
pose_enc = self.cam_dec(cam_feat)
|
||||
c2w_3x4, intr = pose_encoding_to_extri_intri(pose_enc, (H, W))
|
||||
# Match the upstream output convention: w2c (world->camera), 3x4.
|
||||
c2w_4x4 = torch.cat([
|
||||
c2w_3x4,
|
||||
torch.tensor([0, 0, 0, 1], device=c2w_3x4.device, dtype=c2w_3x4.dtype)
|
||||
.view(1, 1, 1, 4).expand(B, S, 1, 4),
|
||||
], dim=-2)
|
||||
out["extrinsics"] = affine_inverse(c2w_4x4)[:, :, :3, :]
|
||||
out["intrinsics"] = intr
|
||||
|
||||
# Flatten the views axis for per-pixel outputs (depth/conf/sky) so the
|
||||
# per-image consumer keeps its (B*S, H, W) interface.
|
||||
for k, v in head_out.items():
|
||||
if k in ("ray", "ray_conf"):
|
||||
# Keep multi-view shape for downstream pose work.
|
||||
out[k] = v
|
||||
elif v.ndim >= 3 and v.shape[0] == B and v.shape[1] == S:
|
||||
out[k] = v.reshape(B * S, *v.shape[2:])
|
||||
else:
|
||||
out[k] = v
|
||||
|
||||
if export_feat_layers:
|
||||
out["aux_features"] = self._reshape_aux_features(aux_feats, H, W)
|
||||
return out
|
||||
|
||||
def _reshape_aux_features(self, aux_feats, H: int, W: int):
|
||||
"""Reshape (B, S, N, C) aux features into (B, S, h_p, w_p, C)."""
|
||||
ph, pw = H // self.PATCH_SIZE, W // self.PATCH_SIZE
|
||||
out = []
|
||||
for f in aux_feats:
|
||||
B, S, N, C = f.shape
|
||||
assert N == ph * pw, f"aux feature seq mismatch: {N} != {ph}*{pw}"
|
||||
out.append(f.reshape(B, S, ph, pw, C))
|
||||
return out
|
||||
128
comfy/ldm/depth_anything_3/preprocess.py
Normal file
128
comfy/ldm/depth_anything_3/preprocess.py
Normal file
@ -0,0 +1,128 @@
|
||||
"""Input/output preprocessing helpers for Depth Anything 3."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Tuple
|
||||
|
||||
import torch
|
||||
|
||||
import comfy.utils
|
||||
|
||||
PATCH_SIZE = 14
|
||||
|
||||
# ImageNet normalization constants used during DA3 training.
|
||||
_IMAGENET_MEAN = torch.tensor([0.485, 0.456, 0.406])
|
||||
_IMAGENET_STD = torch.tensor([0.229, 0.224, 0.225])
|
||||
|
||||
|
||||
def _round_to_patch(x: int, patch: int = PATCH_SIZE) -> int:
|
||||
down = (x // patch) * patch
|
||||
up = down + patch
|
||||
return up if abs(up - x) <= abs(x - down) else down
|
||||
|
||||
|
||||
def compute_target_size(orig_h: int, orig_w: int, process_res: int, method: str = "upper_bound_resize") -> Tuple[int, int]:
|
||||
"""Compute (target_h, target_w) for a single image.
|
||||
upper_bound_resize: scale longest side to process_res, then round each dim to nearest multiple of 14 (default upstream method).
|
||||
lower_bound_resize: scale shortest side to process_res, then round."""
|
||||
|
||||
if method == "upper_bound_resize":
|
||||
longest = max(orig_h, orig_w)
|
||||
scale = process_res / float(longest)
|
||||
elif method == "lower_bound_resize":
|
||||
shortest = min(orig_h, orig_w)
|
||||
scale = process_res / float(shortest)
|
||||
else:
|
||||
raise ValueError(f"Unsupported process_res_method: {method}")
|
||||
|
||||
new_w = max(1, _round_to_patch(int(round(orig_w * scale))))
|
||||
new_h = max(1, _round_to_patch(int(round(orig_h * scale))))
|
||||
return new_h, new_w
|
||||
|
||||
|
||||
def preprocess_image(image: torch.Tensor, process_res: int = 504, method: str = "upper_bound_resize") -> torch.Tensor:
|
||||
assert image.ndim == 4 and image.shape[-1] == 3, f"expected (B,H,W,3) IMAGE; got {tuple(image.shape)}"
|
||||
B, H, W, _ = image.shape
|
||||
target_h, target_w = compute_target_size(H, W, process_res, method)
|
||||
|
||||
# (B, H, W, 3) -> (B, 3, H, W)
|
||||
x = image.movedim(-1, 1).contiguous()
|
||||
if (target_h, target_w) != (H, W):
|
||||
# Upstream uses cv2 INTER_CUBIC (upscale) / INTER_AREA (downscale).
|
||||
# Lanczos in ``common_upscale`` is anti-aliased and produces the
|
||||
# closest pixel-wise match in a sweep across {bilinear, bicubic,
|
||||
# area, lanczos, bislerp}. Used in both directions for simplicity.
|
||||
x = comfy.utils.common_upscale(x.float(), target_w, target_h, "lanczos", "disabled",)
|
||||
x = x.clamp(0.0, 1.0)
|
||||
|
||||
mean = _IMAGENET_MEAN.to(device=x.device, dtype=x.dtype).view(1, 3, 1, 1)
|
||||
std = _IMAGENET_STD.to(device=x.device, dtype=x.dtype).view(1, 3, 1, 1)
|
||||
x = (x - mean) / std
|
||||
return x
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Output post-processing (sky-aware clipping for Mono/Metric variants)
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def compute_non_sky_mask(sky_prediction: torch.Tensor, threshold: float = 0.3) -> torch.Tensor:
|
||||
"""Boolean mask: True for non-sky pixels (sky probability < threshold)."""
|
||||
return sky_prediction < threshold
|
||||
|
||||
|
||||
def apply_sky_aware_clip(depth: torch.Tensor, sky: torch.Tensor, threshold: float = 0.3, quantile: float = 0.99) -> torch.Tensor:
|
||||
"""Clips sky regions to the 99th percentile of non-sky depth. Returns a new depth tensor."""
|
||||
non_sky = compute_non_sky_mask(sky, threshold=threshold)
|
||||
if non_sky.sum() <= 10 or (~non_sky).sum() <= 10:
|
||||
return depth.clone()
|
||||
|
||||
non_sky_depth = depth[non_sky]
|
||||
if non_sky_depth.numel() > 100_000:
|
||||
idx = torch.randint(0, non_sky_depth.numel(), (100_000,), device=non_sky_depth.device)
|
||||
sampled = non_sky_depth[idx]
|
||||
else:
|
||||
sampled = non_sky_depth
|
||||
|
||||
max_depth = torch.quantile(sampled, quantile)
|
||||
out = depth.clone()
|
||||
out[~non_sky] = max_depth
|
||||
return out
|
||||
|
||||
|
||||
def normalize_depth_v2_style(depth: torch.Tensor, sky: torch.Tensor | None = None, low_quantile: float = 0.01, high_quantile: float = 0.99) -> torch.Tensor:
|
||||
"""V2-style normalization computes percentile bounds over non-sky pixels (when available), then maps depth into [0, 1] with near = white (1.0)."""
|
||||
if sky is not None:
|
||||
mask = compute_non_sky_mask(sky)
|
||||
if mask.any():
|
||||
valid = depth[mask]
|
||||
else:
|
||||
valid = depth.flatten()
|
||||
else:
|
||||
valid = depth.flatten()
|
||||
|
||||
if valid.numel() > 100_000:
|
||||
idx = torch.randint(0, valid.numel(), (100_000,), device=valid.device)
|
||||
sample = valid[idx]
|
||||
else:
|
||||
sample = valid
|
||||
|
||||
lo = torch.quantile(sample, low_quantile)
|
||||
hi = torch.quantile(sample, high_quantile)
|
||||
rng = (hi - lo).clamp(min=1e-6)
|
||||
norm = ((depth - lo) / rng).clamp(0.0, 1.0)
|
||||
# Nearer pixels are brighter (1.0)
|
||||
norm = 1.0 - norm
|
||||
if sky is not None:
|
||||
# Sky pixels become black (far / unknown)
|
||||
sky_mask = ~compute_non_sky_mask(sky)
|
||||
norm = torch.where(sky_mask, torch.zeros_like(norm), norm)
|
||||
return norm
|
||||
|
||||
|
||||
def normalize_depth_min_max(depth: torch.Tensor) -> torch.Tensor:
|
||||
"""Simple per-frame min/max normalization with near=1.0 convention."""
|
||||
lo = depth.amin(dim=(-2, -1), keepdim=True)
|
||||
hi = depth.amax(dim=(-2, -1), keepdim=True)
|
||||
rng = (hi - lo).clamp(min=1e-6)
|
||||
return 1.0 - ((depth - lo) / rng).clamp(0.0, 1.0)
|
||||
272
comfy/ldm/depth_anything_3/ray_pose.py
Normal file
272
comfy/ldm/depth_anything_3/ray_pose.py
Normal file
@ -0,0 +1,272 @@
|
||||
"""Ray-to-pose conversion for the multi-view path of Depth Anything 3."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
# qr/svd use fp32: CUDA often has no fp16/bf16 kernels for these ops.
|
||||
|
||||
|
||||
def _ql_decomposition(A: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
"""Decompose A = Q @ L with Q orthogonal and L lower-triangular.
|
||||
Implemented in terms of QR by reversing the columns/rows; the standard
|
||||
trick from the upstream reference. Inputs A are (3, 3)."""
|
||||
P = torch.tensor([[0, 0, 1], [0, 1, 0], [1, 0, 0]], device=A.device, dtype=A.dtype)
|
||||
A_tilde = A @ P
|
||||
# CUDA QR is not implemented for fp16/bf16; upcast just for this call.
|
||||
Q_tilde, R_tilde = torch.linalg.qr(A_tilde.float())
|
||||
Q_tilde = Q_tilde.to(A.dtype)
|
||||
R_tilde = R_tilde.to(A.dtype)
|
||||
Q = Q_tilde @ P
|
||||
L = P @ R_tilde @ P
|
||||
d = torch.diag(L)
|
||||
sign = torch.sign(d)
|
||||
Q = Q * sign[None, :] # scale columns of Q
|
||||
L = L * sign[:, None] # scale rows of L
|
||||
return Q, L
|
||||
|
||||
|
||||
def _homogenize_points(points: torch.Tensor) -> torch.Tensor:
|
||||
return torch.cat([points, torch.ones_like(points[..., :1])], dim=-1)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Weighted-LSQ + RANSAC homography (batched)
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _find_homography_weighted_lsq(src_pts: torch.Tensor, dst_pts: torch.Tensor, confident_weight: torch.Tensor,) -> torch.Tensor:
|
||||
"""Solve a single H with weighted least-squares (DLT)."""
|
||||
N = src_pts.shape[0]
|
||||
if N < 4:
|
||||
raise ValueError("At least 4 points are required to compute a homography.")
|
||||
w = confident_weight.sqrt().unsqueeze(1) # (N, 1)
|
||||
x = src_pts[:, 0:1]
|
||||
y = src_pts[:, 1:2]
|
||||
u = dst_pts[:, 0:1]
|
||||
v = dst_pts[:, 1:2]
|
||||
zeros = torch.zeros_like(x)
|
||||
A1 = torch.cat([-x * w, -y * w, -w, zeros, zeros, zeros, x * u * w, y * u * w, u * w], dim=1)
|
||||
A2 = torch.cat([zeros, zeros, zeros, -x * w, -y * w, -w, x * v * w, y * v * w, v * w], dim=1)
|
||||
A = torch.cat([A1, A2], dim=0) # (2N, 9)
|
||||
# CUDA SVD is not implemented for fp16/bf16; upcast just for this call.
|
||||
_, _, Vh = torch.linalg.svd(A.float())
|
||||
Vh = Vh.to(A.dtype)
|
||||
H = Vh[-1].reshape(3, 3)
|
||||
return H / H[-1, -1]
|
||||
|
||||
|
||||
def _find_homography_weighted_lsq_batched(src_pts_batch: torch.Tensor, dst_pts_batch: torch.Tensor, confident_weight_batch: torch.Tensor) -> torch.Tensor:
|
||||
"""Batched DLT solver. Inputs (B, K, 2) / (B, K); output (B, 3, 3)."""
|
||||
B, K, _ = src_pts_batch.shape
|
||||
w = confident_weight_batch.sqrt().unsqueeze(2)
|
||||
x = src_pts_batch[:, :, 0:1]
|
||||
y = src_pts_batch[:, :, 1:2]
|
||||
u = dst_pts_batch[:, :, 0:1]
|
||||
v = dst_pts_batch[:, :, 1:2]
|
||||
zeros = torch.zeros_like(x)
|
||||
A1 = torch.cat([-x * w, -y * w, -w, zeros, zeros, zeros, x * u * w, y * u * w, u * w], dim=2)
|
||||
A2 = torch.cat([zeros, zeros, zeros, -x * w, -y * w, -w, x * v * w, y * v * w, v * w], dim=2)
|
||||
A = torch.cat([A1, A2], dim=1) # (B, 2K, 9)
|
||||
# CUDA SVD is not implemented for fp16/bf16; upcast just for this call.
|
||||
_, _, Vh = torch.linalg.svd(A.float())
|
||||
Vh = Vh.to(A.dtype)
|
||||
H = Vh[:, -1].reshape(B, 3, 3)
|
||||
return H / H[:, 2:3, 2:3]
|
||||
|
||||
|
||||
def _ransac_find_homography_weighted_batched(
|
||||
src_pts: torch.Tensor, # (B, N, 2)
|
||||
dst_pts: torch.Tensor, # (B, N, 2)
|
||||
confident_weight: torch.Tensor, # (B, N)
|
||||
n_sample: int,
|
||||
n_iter: int = 100,
|
||||
reproj_threshold: float = 3.0,
|
||||
num_sample_for_ransac: int = 8,
|
||||
max_inlier_num: int = 10000,
|
||||
rand_sample_iters_idx: Optional[torch.Tensor] = None,
|
||||
) -> torch.Tensor:
|
||||
"""Batched weighted-RANSAC homography estimator. Returns (B, 3, 3) homography matrices."""
|
||||
B, N, _ = src_pts.shape
|
||||
assert N >= 4
|
||||
device = src_pts.device
|
||||
|
||||
sorted_idx = torch.argsort(confident_weight, descending=True, dim=1)
|
||||
candidate_idx = sorted_idx[:, :n_sample] # (B, n_sample)
|
||||
|
||||
if rand_sample_iters_idx is None:
|
||||
rand_sample_iters_idx = torch.stack(
|
||||
[torch.randperm(n_sample, device=device)[:num_sample_for_ransac]
|
||||
for _ in range(n_iter)],
|
||||
dim=0,
|
||||
)
|
||||
|
||||
rand_idx = candidate_idx[:, rand_sample_iters_idx] # (B, n_iter, k)
|
||||
b_idx = (
|
||||
torch.arange(B, device=device)
|
||||
.view(B, 1, 1)
|
||||
.expand(B, n_iter, num_sample_for_ransac)
|
||||
)
|
||||
src_b = src_pts[b_idx, rand_idx]
|
||||
dst_b = dst_pts[b_idx, rand_idx]
|
||||
w_b = confident_weight[b_idx, rand_idx]
|
||||
|
||||
cB, cN = src_b.shape[:2]
|
||||
H_batch = _find_homography_weighted_lsq_batched(
|
||||
src_b.flatten(0, 1), dst_b.flatten(0, 1), w_b.flatten(0, 1),
|
||||
).unflatten(0, (cB, cN)) # (B, n_iter, 3, 3)
|
||||
|
||||
src_homo = torch.cat([src_pts, torch.ones(B, N, 1, device=device, dtype=src_pts.dtype)], dim=2)
|
||||
proj = torch.bmm(
|
||||
src_homo.unsqueeze(1).expand(B, n_iter, N, 3).reshape(-1, N, 3),
|
||||
H_batch.reshape(-1, 3, 3).transpose(1, 2),
|
||||
) # (B*n_iter, N, 3)
|
||||
proj_xy = (proj[:, :, :2] / proj[:, :, 2:3]).reshape(B, n_iter, N, 2)
|
||||
err = ((proj_xy - dst_pts.unsqueeze(1)) ** 2).sum(-1).sqrt() # (B, n_iter, N)
|
||||
inlier_mask = err < reproj_threshold
|
||||
score = (inlier_mask * confident_weight.unsqueeze(1)).sum(dim=2)
|
||||
best_idx = torch.argmax(score, dim=1)
|
||||
best_inlier_mask = inlier_mask[torch.arange(B, device=device), best_idx]
|
||||
|
||||
# Refit with the inlier set (per-batch, since the inlier counts vary).
|
||||
H_inlier_list = []
|
||||
for b in range(B):
|
||||
mask = best_inlier_mask[b]
|
||||
in_src = src_pts[b][mask]
|
||||
in_dst = dst_pts[b][mask]
|
||||
in_w = confident_weight[b][mask]
|
||||
if in_src.shape[0] < 4:
|
||||
# Fall back to identity when RANSAC fails to find enough inliers.
|
||||
H_inlier_list.append(torch.eye(3, device=device, dtype=src_pts.dtype))
|
||||
continue
|
||||
sorted_w = torch.argsort(in_w, descending=True)
|
||||
if len(sorted_w) > max_inlier_num:
|
||||
keep = max(int(len(sorted_w) * 0.95), max_inlier_num)
|
||||
sorted_w = sorted_w[:keep][torch.randperm(keep, device=device)[:max_inlier_num]]
|
||||
H_inlier_list.append(
|
||||
_find_homography_weighted_lsq(in_src[sorted_w], in_dst[sorted_w], in_w[sorted_w])
|
||||
)
|
||||
return torch.stack(H_inlier_list, dim=0)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Camera-ray utilities
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _unproject_identity(num_y: int, num_x: int, B: int, S: int, device, dtype) -> torch.Tensor:
|
||||
"""Camera-space unit rays for an identity intrinsic on a 2x2 image plane."""
|
||||
dx = 1.0 / num_x
|
||||
dy = 1.0 / num_y
|
||||
# Centered camera-space coords directly (skip the K^-1 step since it's
|
||||
# just a translation by -1 on x and y when K is identity-with-center=1).
|
||||
y = torch.linspace(-(1 - dy), (1 - dy), num_y, device=device, dtype=dtype)
|
||||
x = torch.linspace(-(1 - dx), (1 - dx), num_x, device=device, dtype=dtype)
|
||||
yy, xx = torch.meshgrid(y, x, indexing="ij")
|
||||
grid = torch.stack((xx, yy), dim=-1) # (h, w, 2)
|
||||
grid = grid.unsqueeze(0).unsqueeze(0).expand(B, S, num_y, num_x, 2)
|
||||
return torch.cat([grid, torch.ones_like(grid[..., :1])], dim=-1)
|
||||
|
||||
|
||||
def _camray_to_caminfo(
|
||||
camray: torch.Tensor, # (B, S, h, w, 6)
|
||||
confidence: Optional[torch.Tensor] = None, # (B, S, h, w)
|
||||
reproj_threshold: float = 0.2,
|
||||
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
"""Convert per-pixel camera rays to per-view (R, T, focal, principal)."""
|
||||
if confidence is None:
|
||||
confidence = torch.ones_like(camray[..., 0])
|
||||
B, S, h, w, _ = camray.shape
|
||||
device = camray.device
|
||||
dtype = camray.dtype
|
||||
|
||||
rays_target = camray[..., :3] # (B, S, h, w, 3)
|
||||
rays_origin = _unproject_identity(h, w, B, S, device, dtype)
|
||||
|
||||
# Flatten (B*S, h*w, *) for the RANSAC routine.
|
||||
rays_target = rays_target.flatten(0, 1).flatten(1, 2)
|
||||
rays_origin = rays_origin.flatten(0, 1).flatten(1, 2)
|
||||
weights = confidence.flatten(0, 1).flatten(1, 2).clone()
|
||||
|
||||
# Project to 2D in homogeneous form (the upstream calls this "perspective division").
|
||||
z_thresh = 1e-4
|
||||
mask = (rays_target[:, :, 2].abs() > z_thresh) & (rays_origin[:, :, 2].abs() > z_thresh)
|
||||
weights = torch.where(mask, weights, torch.zeros_like(weights))
|
||||
src = rays_origin.clone()
|
||||
dst = rays_target.clone()
|
||||
src[..., 0] = torch.where(mask, src[..., 0] / src[..., 2], src[..., 0])
|
||||
src[..., 1] = torch.where(mask, src[..., 1] / src[..., 2], src[..., 1])
|
||||
dst[..., 0] = torch.where(mask, dst[..., 0] / dst[..., 2], dst[..., 0])
|
||||
dst[..., 1] = torch.where(mask, dst[..., 1] / dst[..., 2], dst[..., 1])
|
||||
src = src[..., :2]
|
||||
dst = dst[..., :2]
|
||||
|
||||
N = src.shape[1]
|
||||
n_iter = 100
|
||||
sample_ratio = 0.3
|
||||
num_sample_for_ransac = 8
|
||||
n_sample = max(num_sample_for_ransac, int(N * sample_ratio))
|
||||
rand_idx = torch.stack(
|
||||
[torch.randperm(n_sample, device=device)[:num_sample_for_ransac] for _ in range(n_iter)],
|
||||
dim=0,
|
||||
)
|
||||
|
||||
# Chunk along the view axis to keep peak memory predictable.
|
||||
chunk = 2
|
||||
A_list = []
|
||||
for i in range(0, src.shape[0], chunk):
|
||||
A = _ransac_find_homography_weighted_batched(
|
||||
src[i:i + chunk], dst[i:i + chunk], weights[i:i + chunk],
|
||||
n_sample=n_sample, n_iter=n_iter,
|
||||
num_sample_for_ransac=num_sample_for_ransac,
|
||||
reproj_threshold=reproj_threshold,
|
||||
rand_sample_iters_idx=rand_idx,
|
||||
max_inlier_num=8000,
|
||||
)
|
||||
# Flip sign on dets that come out < 0 (so that the QL produces a
|
||||
# right-handed rotation). ``det`` lacks fp16/bf16 CUDA kernels, so
|
||||
# do the comparison in fp32.
|
||||
flip = torch.linalg.det(A.float()) < 0
|
||||
A = torch.where(flip[:, None, None], -A, A)
|
||||
A_list.append(A)
|
||||
A = torch.cat(A_list, dim=0) # (B*S, 3, 3)
|
||||
|
||||
R_list, f_list, pp_list = [], [], []
|
||||
for i in range(A.shape[0]):
|
||||
R, L = _ql_decomposition(A[i])
|
||||
L = L / L[2][2]
|
||||
f_list.append(torch.stack((L[0][0], L[1][1])))
|
||||
pp_list.append(torch.stack((L[2][0], L[2][1])))
|
||||
R_list.append(R)
|
||||
R = torch.stack(R_list).reshape(B, S, 3, 3)
|
||||
focal = torch.stack(f_list).reshape(B, S, 2)
|
||||
pp = torch.stack(pp_list).reshape(B, S, 2)
|
||||
|
||||
# Translation: confidence-weighted average of camray direction(s).
|
||||
cf = confidence.flatten(0, 1).flatten(1, 2)
|
||||
T = (camray.flatten(0, 1).flatten(1, 2)[..., 3:] * cf.unsqueeze(-1)).sum(dim=1)
|
||||
T = T / cf.sum(dim=-1, keepdim=True)
|
||||
T = T.reshape(B, S, 3)
|
||||
|
||||
# Match upstream output convention: focal -> 1/focal, pp + 1.
|
||||
return R, T, 1.0 / focal, pp + 1.0
|
||||
|
||||
|
||||
def get_extrinsic_from_camray(
|
||||
camray: torch.Tensor, # (B, S, h, w, 6)
|
||||
conf: torch.Tensor, # (B, S, h, w, 1) or (B, S, h, w)
|
||||
patch_size_y: int,
|
||||
patch_size_x: int,
|
||||
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
"""Wrap a 4x4 extrinsic + per-view focal + principal-point output."""
|
||||
if conf.ndim == 5 and conf.shape[-1] == 1:
|
||||
conf = conf.squeeze(-1)
|
||||
R, T, focal, pp = _camray_to_caminfo(camray, confidence=conf)
|
||||
extr = torch.cat([R, T.unsqueeze(-1)], dim=-1) # (B, S, 3, 4)
|
||||
homo_row = torch.tensor([0, 0, 0, 1], dtype=R.dtype, device=R.device)
|
||||
homo_row = homo_row.view(1, 1, 1, 4).expand(R.shape[0], R.shape[1], 1, 4)
|
||||
extr = torch.cat([extr, homo_row], dim=-2) # (B, S, 4, 4)
|
||||
return extr, focal, pp
|
||||
87
comfy/ldm/depth_anything_3/reference_view_selector.py
Normal file
87
comfy/ldm/depth_anything_3/reference_view_selector.py
Normal file
@ -0,0 +1,87 @@
|
||||
"""Reference-view selection for the multi-view path of Depth Anything 3."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Literal
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
RefViewStrategy = Literal["first", "middle", "saddle_balanced", "saddle_sim_range"]
|
||||
|
||||
|
||||
# Per the upstream constants module: ``THRESH_FOR_REF_SELECTION = 3``.
|
||||
# Reference selection only runs when there are at least this many views.
|
||||
THRESH_FOR_REF_SELECTION: int = 3
|
||||
|
||||
|
||||
def select_reference_view(x: torch.Tensor, strategy: RefViewStrategy = "saddle_balanced") -> torch.Tensor:
|
||||
"""Pick a reference view index per batch element."""
|
||||
B, S, _, _ = x.shape
|
||||
if S <= 1:
|
||||
return torch.zeros(B, dtype=torch.long, device=x.device)
|
||||
if strategy == "first":
|
||||
return torch.zeros(B, dtype=torch.long, device=x.device)
|
||||
if strategy == "middle":
|
||||
return torch.full((B,), S // 2, dtype=torch.long, device=x.device)
|
||||
|
||||
# Feature-based strategies: normalised cls/cam token per view.
|
||||
img_class_feat = x[:, :, 0] / x[:, :, 0].norm(dim=-1, keepdim=True) # (B,S,C)
|
||||
|
||||
if strategy == "saddle_balanced":
|
||||
sim = torch.matmul(img_class_feat, img_class_feat.transpose(1, 2)) # (B,S,S)
|
||||
sim_no_diag = sim - torch.eye(S, device=sim.device).unsqueeze(0)
|
||||
sim_score = sim_no_diag.sum(dim=-1) / (S - 1) # (B,S)
|
||||
feat_norm = x[:, :, 0].norm(dim=-1) # (B,S)
|
||||
feat_var = img_class_feat.var(dim=-1) # (B,S)
|
||||
|
||||
def _normalize(metric):
|
||||
mn = metric.min(dim=1, keepdim=True).values
|
||||
mx = metric.max(dim=1, keepdim=True).values
|
||||
return (metric - mn) / (mx - mn + 1e-8)
|
||||
|
||||
sim_n, norm_n, var_n = _normalize(sim_score), _normalize(feat_norm), _normalize(feat_var)
|
||||
balance = (sim_n - 0.5).abs() + (norm_n - 0.5).abs() + (var_n - 0.5).abs()
|
||||
return balance.argmin(dim=1)
|
||||
|
||||
if strategy == "saddle_sim_range":
|
||||
sim = torch.matmul(img_class_feat, img_class_feat.transpose(1, 2))
|
||||
sim_no_diag = sim - torch.eye(S, device=sim.device).unsqueeze(0)
|
||||
sim_max = sim_no_diag.max(dim=-1).values
|
||||
sim_min = sim_no_diag.min(dim=-1).values
|
||||
return (sim_max - sim_min).argmax(dim=1)
|
||||
|
||||
raise ValueError(
|
||||
f"Unknown reference view selection strategy: {strategy!r}. "
|
||||
f"Must be one of: 'first', 'middle', 'saddle_balanced', 'saddle_sim_range'"
|
||||
)
|
||||
|
||||
|
||||
def reorder_by_reference(x: torch.Tensor, b_idx: torch.Tensor) -> torch.Tensor:
|
||||
"""Reorder x so the reference view is at position 0 in axis S."""
|
||||
B, S = x.shape[0], x.shape[1]
|
||||
if S <= 1:
|
||||
return x
|
||||
positions = torch.arange(S, device=x.device).unsqueeze(0).expand(B, -1)
|
||||
b_idx_exp = b_idx.unsqueeze(1)
|
||||
reorder = torch.where(
|
||||
(positions > 0) & (positions <= b_idx_exp),
|
||||
positions - 1,
|
||||
positions,
|
||||
)
|
||||
reorder[:, 0] = b_idx
|
||||
batch = torch.arange(B, device=x.device).unsqueeze(1)
|
||||
return x[batch, reorder]
|
||||
|
||||
|
||||
def restore_original_order(x: torch.Tensor, b_idx: torch.Tensor) -> torch.Tensor:
|
||||
"""Inverse of reorder_by_reference."""
|
||||
B, S = x.shape[0], x.shape[1]
|
||||
if S <= 1:
|
||||
return x
|
||||
target_positions = torch.arange(S, device=x.device).unsqueeze(0).expand(B, -1)
|
||||
b_idx_exp = b_idx.unsqueeze(1)
|
||||
restore = torch.where(target_positions < b_idx_exp, target_positions + 1, target_positions)
|
||||
restore = torch.scatter(restore, dim=1, index=b_idx_exp, src=torch.zeros_like(b_idx_exp))
|
||||
batch = torch.arange(B, device=x.device).unsqueeze(1)
|
||||
return x[batch, restore]
|
||||
160
comfy/ldm/depth_anything_3/transform.py
Normal file
160
comfy/ldm/depth_anything_3/transform.py
Normal file
@ -0,0 +1,160 @@
|
||||
"""Geometry / camera transform helpers for Depth Anything 3."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Tuple
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Affine 4x4 helpers
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def as_homogeneous(ext: torch.Tensor) -> torch.Tensor:
|
||||
"""Promote (...,3,4) extrinsics to (...,4,4) homogeneous form. No-op when the input is already ``(...,4,4)``."""
|
||||
if ext.shape[-2:] == (4, 4):
|
||||
return ext
|
||||
if ext.shape[-2:] == (3, 4):
|
||||
ones = torch.zeros_like(ext[..., :1, :4])
|
||||
ones[..., 0, 3] = 1.0
|
||||
return torch.cat([ext, ones], dim=-2)
|
||||
raise ValueError(f"Invalid affine shape: {ext.shape}")
|
||||
|
||||
|
||||
def affine_inverse(A: torch.Tensor) -> torch.Tensor:
|
||||
"""Inverse of an affine matrix ``[R|T; 0 0 0 1]``."""
|
||||
R = A[..., :3, :3]
|
||||
T = A[..., :3, 3:]
|
||||
P = A[..., 3:, :]
|
||||
return torch.cat([torch.cat([R.mT, -R.mT @ T], dim=-1), P], dim=-2)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Quaternion <-> rotation matrix (xyzw / scalar-last)
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _sqrt_positive_part(x: torch.Tensor) -> torch.Tensor:
|
||||
"""sqrt(max(0, x)) with a zero subgradient where x == 0."""
|
||||
ret = torch.zeros_like(x)
|
||||
positive_mask = x > 0
|
||||
if torch.is_grad_enabled():
|
||||
ret[positive_mask] = torch.sqrt(x[positive_mask])
|
||||
else:
|
||||
ret = torch.where(positive_mask, torch.sqrt(x), ret)
|
||||
return ret
|
||||
|
||||
|
||||
def standardize_quaternion(quaternions: torch.Tensor) -> torch.Tensor:
|
||||
"""Force the real part of a unit quaternion (xyzw) to be non-negative."""
|
||||
return torch.where(quaternions[..., 3:4] < 0, -quaternions, quaternions)
|
||||
|
||||
|
||||
def quat_to_mat(quaternions: torch.Tensor) -> torch.Tensor:
|
||||
"""Convert quaternions (xyzw) to (...,3,3) rotation matrices."""
|
||||
i, j, k, r = torch.unbind(quaternions, -1)
|
||||
two_s = 2.0 / (quaternions * quaternions).sum(-1)
|
||||
o = torch.stack(
|
||||
(
|
||||
1 - two_s * (j * j + k * k),
|
||||
two_s * (i * j - k * r),
|
||||
two_s * (i * k + j * r),
|
||||
two_s * (i * j + k * r),
|
||||
1 - two_s * (i * i + k * k),
|
||||
two_s * (j * k - i * r),
|
||||
two_s * (i * k - j * r),
|
||||
two_s * (j * k + i * r),
|
||||
1 - two_s * (i * i + j * j),
|
||||
),
|
||||
-1,
|
||||
)
|
||||
return o.reshape(quaternions.shape[:-1] + (3, 3))
|
||||
|
||||
|
||||
def mat_to_quat(matrix: torch.Tensor) -> torch.Tensor:
|
||||
"""Convert (...,3,3) rotation matrices to quaternions (xyzw)."""
|
||||
if matrix.size(-1) != 3 or matrix.size(-2) != 3:
|
||||
raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")
|
||||
|
||||
batch_dim = matrix.shape[:-2]
|
||||
m00, m01, m02, m10, m11, m12, m20, m21, m22 = torch.unbind(
|
||||
matrix.reshape(batch_dim + (9,)), dim=-1
|
||||
)
|
||||
|
||||
q_abs = _sqrt_positive_part(
|
||||
torch.stack(
|
||||
[
|
||||
1.0 + m00 + m11 + m22,
|
||||
1.0 + m00 - m11 - m22,
|
||||
1.0 - m00 + m11 - m22,
|
||||
1.0 - m00 - m11 + m22,
|
||||
],
|
||||
dim=-1,
|
||||
)
|
||||
)
|
||||
|
||||
quat_by_rijk = torch.stack(
|
||||
[
|
||||
torch.stack([q_abs[..., 0] ** 2, m21 - m12, m02 - m20, m10 - m01], dim=-1),
|
||||
torch.stack([m21 - m12, q_abs[..., 1] ** 2, m10 + m01, m02 + m20], dim=-1),
|
||||
torch.stack([m02 - m20, m10 + m01, q_abs[..., 2] ** 2, m12 + m21], dim=-1),
|
||||
torch.stack([m10 - m01, m20 + m02, m21 + m12, q_abs[..., 3] ** 2], dim=-1),
|
||||
],
|
||||
dim=-2,
|
||||
)
|
||||
|
||||
flr = torch.tensor(0.1).to(dtype=q_abs.dtype, device=q_abs.device)
|
||||
quat_candidates = quat_by_rijk / (2.0 * q_abs[..., None].max(flr))
|
||||
|
||||
out = quat_candidates[F.one_hot(q_abs.argmax(dim=-1), num_classes=4) > 0.5, :].reshape(
|
||||
batch_dim + (4,)
|
||||
)
|
||||
# Reorder rijk -> xyzw (i.e. ijkr).
|
||||
out = out[..., [1, 2, 3, 0]]
|
||||
return standardize_quaternion(out)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Pose-encoding <-> extrinsics + intrinsics
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def extri_intri_to_pose_encoding(extrinsics: torch.Tensor, intrinsics: torch.Tensor, image_size_hw: Tuple[int, int]) -> torch.Tensor:
|
||||
"""Pack (extr, intr, image_size) into the 9-D pose-encoding vector.
|
||||
extrinsics: camera-to-world (c2w) (B,S,4,4) matrices,
|
||||
intrinsics: pixel-space (B,S,3,3) matrices,
|
||||
image_size_hw: is a (H, W) pair.
|
||||
"""
|
||||
R = extrinsics[..., :3, :3]
|
||||
T = extrinsics[..., :3, 3]
|
||||
quat = mat_to_quat(R)
|
||||
H, W = image_size_hw
|
||||
fov_h = 2 * torch.atan((H / 2) / intrinsics[..., 1, 1])
|
||||
fov_w = 2 * torch.atan((W / 2) / intrinsics[..., 0, 0])
|
||||
return torch.cat([T, quat, fov_h[..., None], fov_w[..., None]], dim=-1).float()
|
||||
|
||||
|
||||
def pose_encoding_to_extri_intri(pose_encoding: torch.Tensor, image_size_hw: Tuple[int, int]) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
"""Inverse of extri_intri_to_pose_encoding."""
|
||||
T = pose_encoding[..., :3]
|
||||
quat = pose_encoding[..., 3:7]
|
||||
fov_h = pose_encoding[..., 7]
|
||||
fov_w = pose_encoding[..., 8]
|
||||
# Normalize to unit quaternion. CameraDec outputs raw values; a near-zero
|
||||
# quaternion causes two_s = 2/norm² → inf in quat_to_mat → NaN extrinsics.
|
||||
quat = quat / quat.norm(dim=-1, keepdim=True).clamp(min=1e-6)
|
||||
R = quat_to_mat(quat)
|
||||
extrinsics = torch.cat([R, T[..., None]], dim=-1)
|
||||
H, W = image_size_hw
|
||||
fy = (H / 2.0) / torch.clamp(torch.tan(fov_h / 2.0), 1e-6)
|
||||
fx = (W / 2.0) / torch.clamp(torch.tan(fov_w / 2.0), 1e-6)
|
||||
intrinsics = torch.zeros(pose_encoding.shape[:2] + (3, 3), device=pose_encoding.device, dtype=pose_encoding.dtype)
|
||||
intrinsics[..., 0, 0] = fx
|
||||
intrinsics[..., 1, 1] = fy
|
||||
intrinsics[..., 0, 2] = W / 2
|
||||
intrinsics[..., 1, 2] = H / 2
|
||||
intrinsics[..., 2, 2] = 1.0
|
||||
return extrinsics, intrinsics
|
||||
@ -5,6 +5,7 @@ import torch.nn.functional as F
|
||||
|
||||
from comfy.ldm.modules.attention import optimized_attention
|
||||
import comfy.model_management
|
||||
import comfy.quant_ops
|
||||
|
||||
def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
|
||||
assert dim % 2 == 0
|
||||
@ -19,15 +20,6 @@ def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
|
||||
out = torch.stack([torch.cos(out), torch.sin(out)], dim=0)
|
||||
return out.to(dtype=torch.float32, device=pos.device)
|
||||
|
||||
def apply_rotary_emb(x_in: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
|
||||
rot_dim = freqs_cis.shape[-1]
|
||||
x, x_pass = x_in[..., :rot_dim], x_in[..., rot_dim:]
|
||||
cos_ = freqs_cis[0]
|
||||
sin_ = freqs_cis[1]
|
||||
x1, x2 = x.chunk(2, dim=-1)
|
||||
x_rotated = torch.cat((-x2, x1), dim=-1)
|
||||
return torch.cat((x * cos_ + x_rotated * sin_, x_pass), dim=-1)
|
||||
|
||||
class ErnieImageEmbedND3(nn.Module):
|
||||
def __init__(self, dim: int, theta: int, axes_dim: tuple):
|
||||
super().__init__()
|
||||
@ -37,8 +29,16 @@ class ErnieImageEmbedND3(nn.Module):
|
||||
|
||||
def forward(self, ids: torch.Tensor) -> torch.Tensor:
|
||||
emb = torch.cat([rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(3)], dim=-1)
|
||||
emb = emb.unsqueeze(3) # [2, B, S, 1, head_dim//2]
|
||||
return torch.stack([emb, emb], dim=-1).reshape(*emb.shape[:-1], -1) # [B, S, 1, head_dim]
|
||||
cos_ = emb[0]
|
||||
sin_ = emb[1]
|
||||
N = cos_.shape[-1]
|
||||
half = N // 2
|
||||
cos_top = cos_[..., :half].repeat_interleave(2, dim=-1)
|
||||
sin_top = sin_[..., :half].repeat_interleave(2, dim=-1)
|
||||
cos_bot = cos_[..., half:].repeat_interleave(2, dim=-1)
|
||||
sin_bot = sin_[..., half:].repeat_interleave(2, dim=-1)
|
||||
rot = torch.stack([cos_top, -sin_top, sin_bot, cos_bot], dim=-1)
|
||||
return rot.reshape(*rot.shape[:-1], 2, 2).unsqueeze(2)
|
||||
|
||||
class ErnieImagePatchEmbedDynamic(nn.Module):
|
||||
def __init__(self, in_channels: int, embed_dim: int, patch_size: int, operations, device=None, dtype=None):
|
||||
@ -115,8 +115,7 @@ class ErnieImageAttention(nn.Module):
|
||||
key = self.norm_k(key)
|
||||
|
||||
if image_rotary_emb is not None:
|
||||
query = apply_rotary_emb(query, image_rotary_emb)
|
||||
key = apply_rotary_emb(key, image_rotary_emb)
|
||||
query, key = comfy.quant_ops.ck.apply_rope_split_half(query, key, image_rotary_emb)
|
||||
|
||||
q_flat = query.reshape(B, S, -1)
|
||||
k_flat = key.reshape(B, S, -1)
|
||||
@ -274,7 +273,7 @@ class ErnieImageModel(nn.Module):
|
||||
|
||||
image_ids = image_ids.view(1, N_img, 3).expand(B, -1, -1)
|
||||
|
||||
rotary_pos_emb = self.pos_embed(torch.cat([image_ids, text_ids], dim=1)).to(x.dtype)
|
||||
rotary_pos_emb = self.pos_embed(torch.cat([image_ids, text_ids], dim=1))
|
||||
del image_ids, text_ids
|
||||
|
||||
sample = self.time_proj(timesteps).to(dtype)
|
||||
|
||||
@ -4,7 +4,7 @@ from torch import Tensor
|
||||
|
||||
from comfy.ldm.modules.attention import optimized_attention
|
||||
import comfy.model_management
|
||||
import logging
|
||||
import comfy.quant_ops
|
||||
|
||||
|
||||
def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None, transformer_options={}) -> Tensor:
|
||||
@ -44,21 +44,15 @@ def _apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor):
|
||||
return apply_rope1(xq, freqs_cis), apply_rope1(xk, freqs_cis)
|
||||
|
||||
|
||||
try:
|
||||
import comfy.quant_ops
|
||||
q_apply_rope = comfy.quant_ops.ck.apply_rope
|
||||
q_apply_rope1 = comfy.quant_ops.ck.apply_rope1
|
||||
def apply_rope(xq, xk, freqs_cis):
|
||||
if comfy.model_management.in_training:
|
||||
return _apply_rope(xq, xk, freqs_cis)
|
||||
else:
|
||||
return apply_rope1(xq, freqs_cis), apply_rope1(xk, freqs_cis)
|
||||
def apply_rope1(x, freqs_cis):
|
||||
if comfy.model_management.in_training:
|
||||
return _apply_rope1(x, freqs_cis)
|
||||
else:
|
||||
return q_apply_rope1(x, freqs_cis)
|
||||
except:
|
||||
logging.warning("No comfy kitchen, using old apply_rope functions.")
|
||||
apply_rope = _apply_rope
|
||||
apply_rope1 = _apply_rope1
|
||||
def apply_rope(xq, xk, freqs_cis):
|
||||
if comfy.model_management.in_training:
|
||||
return _apply_rope(xq, xk, freqs_cis)
|
||||
else:
|
||||
return comfy.quant_ops.ck.apply_rope(xq, xk, freqs_cis)
|
||||
|
||||
|
||||
def apply_rope1(x, freqs_cis):
|
||||
if comfy.model_management.in_training:
|
||||
return _apply_rope1(x, freqs_cis)
|
||||
else:
|
||||
return comfy.quant_ops.ck.apply_rope1(x, freqs_cis)
|
||||
|
||||
@ -607,9 +607,13 @@ class HunYuanDiTPlain(nn.Module):
|
||||
def forward(self, x, t, context, transformer_options = {}, **kwargs):
|
||||
|
||||
x = x.movedim(-1, -2)
|
||||
if context.shape[0] >= 2:
|
||||
uncond_emb, cond_emb = context.chunk(2, dim = 0)
|
||||
context = torch.cat([cond_emb, uncond_emb], dim = 0)
|
||||
|
||||
swap_cfg_halves = context.shape[0] >= 2
|
||||
|
||||
if swap_cfg_halves:
|
||||
first_half, second_half = context.chunk(2, dim = 0)
|
||||
context = torch.cat([second_half, first_half], dim = 0)
|
||||
|
||||
main_condition = context
|
||||
|
||||
t = 1.0 - t
|
||||
@ -657,8 +661,8 @@ class HunYuanDiTPlain(nn.Module):
|
||||
output = self.final_layer(combined)
|
||||
output = output.movedim(-2, -1) * (-1.0)
|
||||
|
||||
if output.shape[0] >= 2:
|
||||
cond_emb, uncond_emb = output.chunk(2, dim = 0)
|
||||
return torch.cat([uncond_emb, cond_emb])
|
||||
else:
|
||||
return output
|
||||
if swap_cfg_halves:
|
||||
first_half, second_half = output.chunk(2, dim = 0)
|
||||
output = torch.cat([second_half, first_half], dim = 0)
|
||||
|
||||
return output
|
||||
|
||||
297
comfy/ldm/ideogram4/model.py
Normal file
297
comfy/ldm/ideogram4/model.py
Normal file
@ -0,0 +1,297 @@
|
||||
"""
|
||||
The Ideogram 4 transformer is a NextDiT/Lumina2-family single-stream model
|
||||
consumes Qwen3-VL hidden-state features (concatenated from 13 layers -> 53248 dims)
|
||||
packs ``[text tokens, image tokens]`` into one sequence with block-diagonal segment attention and 3D interleaved MRoPE.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
import comfy.patcher_extension
|
||||
from comfy.ldm.lumina.model import FeedForward
|
||||
from comfy.ldm.modules.attention import optimized_attention_masked
|
||||
from comfy.text_encoders.llama import apply_rope, precompute_freqs_cis
|
||||
|
||||
# Per-token role indicators
|
||||
SEQUENCE_PADDING_INDICATOR = -1
|
||||
OUTPUT_IMAGE_INDICATOR = 2
|
||||
LLM_TOKEN_INDICATOR = 3
|
||||
# Image grid coordinates are offset so they never collide with text positions
|
||||
IMAGE_POSITION_OFFSET = 65536
|
||||
|
||||
|
||||
class Ideogram4Attention(nn.Module):
|
||||
def __init__(self, hidden_size, num_heads, eps=1e-5, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
self.num_heads = num_heads
|
||||
self.head_dim = hidden_size // num_heads
|
||||
self.hidden_size = hidden_size
|
||||
|
||||
self.qkv = operations.Linear(hidden_size, hidden_size * 3, bias=False, dtype=dtype, device=device)
|
||||
self.norm_q = operations.RMSNorm(self.head_dim, eps=eps, elementwise_affine=True, dtype=dtype, device=device)
|
||||
self.norm_k = operations.RMSNorm(self.head_dim, eps=eps, elementwise_affine=True, dtype=dtype, device=device)
|
||||
self.o = operations.Linear(hidden_size, hidden_size, bias=False, dtype=dtype, device=device)
|
||||
|
||||
def forward(self, x, attn_mask, freqs_cis, transformer_options={}):
|
||||
batch_size, seq_len, _ = x.shape
|
||||
qkv = self.qkv(x).view(batch_size, seq_len, 3, self.num_heads, self.head_dim)
|
||||
q, k, v = qkv.unbind(dim=2)
|
||||
|
||||
q = self.norm_q(q)
|
||||
k = self.norm_k(k)
|
||||
|
||||
# (B, heads, L, head_dim)
|
||||
q = q.transpose(1, 2)
|
||||
k = k.transpose(1, 2)
|
||||
v = v.transpose(1, 2)
|
||||
|
||||
q, k = apply_rope(q, k, freqs_cis)
|
||||
|
||||
out = optimized_attention_masked(q, k, v, self.num_heads, attn_mask, skip_reshape=True, transformer_options=transformer_options)
|
||||
return self.o(out)
|
||||
|
||||
|
||||
class Ideogram4TransformerBlock(nn.Module):
|
||||
def __init__(self, hidden_size, intermediate_size, num_heads, norm_eps, adaln_dim, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
self.attention = Ideogram4Attention(hidden_size, num_heads, eps=1e-5, dtype=dtype, device=device, operations=operations)
|
||||
self.feed_forward = FeedForward(
|
||||
dim=hidden_size, hidden_dim=intermediate_size, multiple_of=1, ffn_dim_multiplier=None,
|
||||
operation_settings={"operations": operations, "dtype": dtype, "device": device},
|
||||
)
|
||||
|
||||
self.attention_norm1 = operations.RMSNorm(hidden_size, eps=norm_eps, elementwise_affine=True, dtype=dtype, device=device)
|
||||
self.ffn_norm1 = operations.RMSNorm(hidden_size, eps=norm_eps, elementwise_affine=True, dtype=dtype, device=device)
|
||||
self.attention_norm2 = operations.RMSNorm(hidden_size, eps=norm_eps, elementwise_affine=True, dtype=dtype, device=device)
|
||||
self.ffn_norm2 = operations.RMSNorm(hidden_size, eps=norm_eps, elementwise_affine=True, dtype=dtype, device=device)
|
||||
|
||||
self.adaln_modulation = operations.Linear(adaln_dim, 4 * hidden_size, bias=True, dtype=dtype, device=device)
|
||||
|
||||
def forward(self, x, attn_mask, freqs_cis, adaln_input, transformer_options={}):
|
||||
mod = self.adaln_modulation(adaln_input)
|
||||
scale_msa, gate_msa, scale_mlp, gate_mlp = mod.chunk(4, dim=-1)
|
||||
gate_msa = torch.tanh(gate_msa)
|
||||
gate_mlp = torch.tanh(gate_mlp)
|
||||
scale_msa = 1.0 + scale_msa
|
||||
scale_mlp = 1.0 + scale_mlp
|
||||
|
||||
attn_out = self.attention(self.attention_norm1(x) * scale_msa, attn_mask, freqs_cis, transformer_options=transformer_options)
|
||||
x = x + gate_msa * self.attention_norm2(attn_out)
|
||||
x = x + gate_mlp * self.ffn_norm2(self.feed_forward(self.ffn_norm1(x) * scale_mlp))
|
||||
return x
|
||||
|
||||
|
||||
def _sinusoidal_embedding(t, dim, scale=1e4):
|
||||
t = t.to(torch.float32)
|
||||
half = dim // 2
|
||||
freq = math.log(scale) / (half - 1)
|
||||
freq = torch.exp(torch.arange(half, dtype=torch.float32, device=t.device) * -freq)
|
||||
emb = t.unsqueeze(-1) * freq
|
||||
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
|
||||
if dim % 2 == 1:
|
||||
emb = F.pad(emb, (0, 1))
|
||||
return emb
|
||||
|
||||
|
||||
class Ideogram4EmbedScalar(nn.Module):
|
||||
def __init__(self, dim, input_range=(0.0, 1.0), dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
self.dim = dim
|
||||
self.range_min, self.range_max = input_range
|
||||
self.mlp_in = operations.Linear(dim, dim, bias=True, dtype=dtype, device=device)
|
||||
self.mlp_out = operations.Linear(dim, dim, bias=True, dtype=dtype, device=device)
|
||||
|
||||
def forward(self, x):
|
||||
x = x.to(torch.float32)
|
||||
scaled = 1e4 * (x - self.range_min) / (self.range_max - self.range_min)
|
||||
emb = _sinusoidal_embedding(scaled, self.dim)
|
||||
emb = emb.to(self.mlp_in.weight.dtype)
|
||||
emb = F.silu(self.mlp_in(emb))
|
||||
return self.mlp_out(emb)
|
||||
|
||||
|
||||
class Ideogram4FinalLayer(nn.Module):
|
||||
def __init__(self, hidden_size, out_channels, adaln_dim, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
self.norm_final = operations.LayerNorm(hidden_size, eps=1e-6, elementwise_affine=False, dtype=dtype, device=device)
|
||||
self.linear = operations.Linear(hidden_size, out_channels, bias=True, dtype=dtype, device=device)
|
||||
self.adaln_modulation = operations.Linear(adaln_dim, hidden_size, bias=True, dtype=dtype, device=device)
|
||||
|
||||
def forward(self, x, c):
|
||||
scale = 1.0 + self.adaln_modulation(F.silu(c))
|
||||
return self.linear(self.norm_final(x) * scale)
|
||||
|
||||
|
||||
class Ideogram4Transformer(nn.Module):
|
||||
"""A single Ideogram 4 backbone operating on a packed token sequence."""
|
||||
|
||||
def __init__(self, emb_dim, num_layers, num_heads, intermediate_size, adaln_dim,
|
||||
in_channels, llm_features_dim, rope_theta, mrope_section, norm_eps,
|
||||
dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
self.head_dim = emb_dim // num_heads
|
||||
self.rope_theta = rope_theta
|
||||
self.mrope_section = tuple(mrope_section)
|
||||
|
||||
self.input_proj = operations.Linear(in_channels, emb_dim, bias=True, dtype=dtype, device=device)
|
||||
self.llm_cond_norm = operations.RMSNorm(llm_features_dim, eps=1e-6, elementwise_affine=True, dtype=dtype, device=device)
|
||||
self.llm_cond_proj = operations.Linear(llm_features_dim, emb_dim, bias=True, dtype=dtype, device=device)
|
||||
self.t_embedding = Ideogram4EmbedScalar(emb_dim, input_range=(0.0, 1.0), dtype=dtype, device=device, operations=operations)
|
||||
self.adaln_proj = operations.Linear(emb_dim, adaln_dim, bias=True, dtype=dtype, device=device)
|
||||
|
||||
self.embed_image_indicator = operations.Embedding(2, emb_dim, dtype=dtype, device=device)
|
||||
|
||||
self.layers = nn.ModuleList([
|
||||
Ideogram4TransformerBlock(emb_dim, intermediate_size, num_heads, norm_eps, adaln_dim,
|
||||
dtype=dtype, device=device, operations=operations)
|
||||
for _ in range(num_layers)
|
||||
])
|
||||
|
||||
self.final_layer = Ideogram4FinalLayer(emb_dim, in_channels, adaln_dim, dtype=dtype, device=device, operations=operations)
|
||||
|
||||
def _backbone(self, llm_features, x, t, position_ids, attn_mask, indicator, transformer_options={}):
|
||||
indicator = indicator.to(torch.long)
|
||||
output_image_mask = (indicator == OUTPUT_IMAGE_INDICATOR).to(x.dtype).unsqueeze(-1)
|
||||
|
||||
x = x * output_image_mask
|
||||
h = self.input_proj(x) * output_image_mask
|
||||
|
||||
t_cond = self.t_embedding(t)
|
||||
if t.dim() == 1:
|
||||
t_cond = t_cond.unsqueeze(1)
|
||||
adaln_input = F.silu(self.adaln_proj(t_cond))
|
||||
|
||||
# h is zero on the text rows (content lives only on image rows), add writes the text features in place
|
||||
if llm_features is not None:
|
||||
L_text = llm_features.shape[1]
|
||||
text_mask = (indicator[:, :L_text] == LLM_TOKEN_INDICATOR).to(x.dtype).unsqueeze(-1)
|
||||
llm = self.llm_cond_norm(llm_features * text_mask)
|
||||
llm = self.llm_cond_proj(llm) * text_mask
|
||||
h[:, :L_text] = h[:, :L_text] + llm
|
||||
|
||||
h = h + self.embed_image_indicator((indicator == OUTPUT_IMAGE_INDICATOR).to(torch.long), out_dtype=h.dtype)
|
||||
|
||||
# Qwen3-VL interleaved MRoPE; position_ids (B, L, 3) -> (3, L) (same across batch).
|
||||
freqs_cis = precompute_freqs_cis(
|
||||
self.head_dim, position_ids[0].transpose(0, 1), self.rope_theta,
|
||||
rope_dims=self.mrope_section, interleaved_mrope=True, device=position_ids.device,
|
||||
)
|
||||
|
||||
if attn_mask is not None and attn_mask.dtype == torch.bool:
|
||||
attn_mask = torch.zeros_like(attn_mask, dtype=h.dtype).masked_fill_(~attn_mask, -torch.finfo(h.dtype).max)
|
||||
|
||||
for layer in self.layers:
|
||||
h = layer(h, attn_mask, freqs_cis, adaln_input, transformer_options=transformer_options)
|
||||
|
||||
return self.final_layer(h, adaln_input)
|
||||
|
||||
|
||||
class Ideogram4Transformer2DModel(Ideogram4Transformer):
|
||||
"""Ideogram 4 single-stream DiT.
|
||||
|
||||
Runs a packed ``[text, image]`` sequence when text context is supplied, or an image-only sequence when ``context is None``.
|
||||
"""
|
||||
|
||||
def __init__(self, image_model=None, in_channels=128, num_layers=34, num_attention_heads=18, attention_head_dim=256, intermediate_size=12288,
|
||||
adaln_dim=512, llm_features_dim=53248, rope_theta=5000000, mrope_section=(24, 20, 20), norm_eps=1e-5,
|
||||
dtype=None, device=None, operations=None, **kwargs):
|
||||
emb_dim = num_attention_heads * attention_head_dim
|
||||
super().__init__(
|
||||
emb_dim=emb_dim, num_layers=num_layers, num_heads=num_attention_heads,
|
||||
intermediate_size=intermediate_size, adaln_dim=adaln_dim, in_channels=in_channels,
|
||||
llm_features_dim=llm_features_dim, rope_theta=rope_theta, mrope_section=mrope_section,
|
||||
norm_eps=norm_eps, dtype=dtype, device=device, operations=operations)
|
||||
self.dtype = dtype
|
||||
self.in_channels = in_channels
|
||||
self.out_channels = in_channels
|
||||
# 128-dim token = patch (2x2) * ae_channels (32).
|
||||
self.patch_size = 2
|
||||
self.ae_channels = in_channels // (self.patch_size * self.patch_size)
|
||||
|
||||
def _img_to_tokens(self, x):
|
||||
B, C, gh, gw = x.shape
|
||||
x = x.view(B, self.ae_channels, self.patch_size, self.patch_size, gh, gw)
|
||||
x = x.permute(0, 4, 5, 2, 3, 1) # (B, gh, gw, pi, pj, c)
|
||||
return x.reshape(B, gh * gw, C)
|
||||
|
||||
def _tokens_to_img(self, tokens, gh, gw):
|
||||
B = tokens.shape[0]
|
||||
C = tokens.shape[-1]
|
||||
x = tokens.reshape(B, gh, gw, self.patch_size, self.patch_size, self.ae_channels)
|
||||
x = x.permute(0, 5, 3, 4, 1, 2) # (B, c, pi, pj, gh, gw)
|
||||
return x.reshape(B, C, gh, gw)
|
||||
|
||||
def _image_position_ids(self, gh, gw, device):
|
||||
h_idx = torch.arange(gh, device=device).view(-1, 1).expand(gh, gw).reshape(-1)
|
||||
w_idx = torch.arange(gw, device=device).view(1, -1).expand(gh, gw).reshape(-1)
|
||||
t_idx = torch.zeros_like(h_idx)
|
||||
return torch.stack([t_idx, h_idx, w_idx], dim=1) + IMAGE_POSITION_OFFSET # (L_img, 3)
|
||||
|
||||
def _run_conditional(self, x_chunk, context_chunk, attn_mask_chunk, t_chunk, gh, gw, transformer_options):
|
||||
B = x_chunk.shape[0]
|
||||
device = x_chunk.device
|
||||
img_tokens = self._img_to_tokens(x_chunk)
|
||||
L_img = img_tokens.shape[1]
|
||||
L_text = context_chunk.shape[1]
|
||||
L = L_text + L_img
|
||||
latent_dim = img_tokens.shape[-1]
|
||||
|
||||
x_full = torch.zeros(B, L, latent_dim, dtype=img_tokens.dtype, device=device)
|
||||
x_full[:, L_text:] = img_tokens
|
||||
|
||||
text_pos = torch.arange(L_text, device=device).view(-1, 1).expand(L_text, 3)
|
||||
img_pos = self._image_position_ids(gh, gw, device)
|
||||
position_ids = torch.cat([text_pos, img_pos], dim=0).unsqueeze(0).expand(B, L, 3)
|
||||
|
||||
indicator = torch.empty(B, L, dtype=torch.long, device=device)
|
||||
indicator[:, :L_text] = LLM_TOKEN_INDICATOR
|
||||
indicator[:, L_text:] = OUTPUT_IMAGE_INDICATOR
|
||||
|
||||
attn_mask = None
|
||||
if attn_mask_chunk is not None:
|
||||
segment_ids = torch.ones(B, L, dtype=torch.long, device=device)
|
||||
pad = (attn_mask_chunk == 0)
|
||||
segment_ids[:, :L_text][pad] = SEQUENCE_PADDING_INDICATOR
|
||||
indicator[:, :L_text][pad] = 0
|
||||
# Block-diagonal mask from segment ids: (B, 1, L, L), True = attend.
|
||||
attn_mask = (segment_ids.unsqueeze(2) == segment_ids.unsqueeze(1)).unsqueeze(1)
|
||||
|
||||
out = self._backbone(context_chunk, x_full, t_chunk, position_ids, attn_mask, indicator,
|
||||
transformer_options=transformer_options)
|
||||
return self._tokens_to_img(out[:, L_text:], gh, gw)
|
||||
|
||||
def _run_image_only(self, x_chunk, t_chunk, gh, gw, transformer_options):
|
||||
B = x_chunk.shape[0]
|
||||
device = x_chunk.device
|
||||
img_tokens = self._img_to_tokens(x_chunk)
|
||||
L_img = img_tokens.shape[1]
|
||||
|
||||
position_ids = self._image_position_ids(gh, gw, device).unsqueeze(0).expand(B, L_img, 3)
|
||||
indicator = torch.full((B, L_img), OUTPUT_IMAGE_INDICATOR, dtype=torch.long, device=device)
|
||||
|
||||
# Image-only sequence is a single segment -> no mask, full attention, no LLM context.
|
||||
out = self._backbone(None, img_tokens, t_chunk, position_ids, None, indicator, transformer_options=transformer_options)
|
||||
return self._tokens_to_img(out, gh, gw)
|
||||
|
||||
def forward(self, x, timesteps, context=None, attention_mask=None, transformer_options={}, **kwargs):
|
||||
return comfy.patcher_extension.WrapperExecutor.new_class_executor(
|
||||
self._forward,
|
||||
self,
|
||||
comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options),
|
||||
).execute(x, timesteps, context, attention_mask, transformer_options, **kwargs)
|
||||
|
||||
def _forward(self, x, timesteps, context=None, attention_mask=None, transformer_options={}, **kwargs):
|
||||
bs, c, gh, gw = x.shape
|
||||
|
||||
timesteps = 1.0 - timesteps
|
||||
|
||||
# unconditional pass
|
||||
if context is None:
|
||||
return -self._run_image_only(x, timesteps, gh, gw, transformer_options)
|
||||
|
||||
return -self._run_conditional(x, context, attention_mask, timesteps, gh, gw, transformer_options)
|
||||
510
comfy/ldm/lens/model.py
Normal file
510
comfy/ldm/lens/model.py
Normal file
@ -0,0 +1,510 @@
|
||||
"""Lens denoising transformer (DiT)"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict, Optional, Tuple
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
import comfy.ldm.flux.layers
|
||||
import comfy.patcher_extension
|
||||
from comfy.ldm.flux.layers import EmbedND
|
||||
from comfy.ldm.flux.math import apply_rope
|
||||
from comfy.ldm.modules.attention import optimized_attention
|
||||
|
||||
|
||||
def _lens_time_proj(t: torch.Tensor, dim: int = 256) -> torch.Tensor:
|
||||
return comfy.ldm.flux.layers.timestep_embedding(t, dim)
|
||||
|
||||
|
||||
def _lens_position_ids(
|
||||
frame: int, height: int, width: int, text_seq_len: int,
|
||||
scale_rope: bool = True, device=None,
|
||||
) -> torch.Tensor:
|
||||
"""Lens axial (frame, h, w) position ids for joint image + text sequence.
|
||||
|
||||
With ``scale_rope=True`` h/w are centered around 0 (negative + positive
|
||||
halves) and text starts at ``max(h//2, w//2)``. Result shape ``[seq, 3]``;
|
||||
caller adds a batch dim for ``EmbedND``.
|
||||
"""
|
||||
if scale_rope:
|
||||
h_pos = torch.cat([torch.arange(-(height - height // 2), 0, device=device),
|
||||
torch.arange(0, height // 2, device=device)])
|
||||
w_pos = torch.cat([torch.arange(-(width - width // 2), 0, device=device),
|
||||
torch.arange(0, width // 2, device=device)])
|
||||
text_start = max(height // 2, width // 2)
|
||||
else:
|
||||
h_pos = torch.arange(height, device=device)
|
||||
w_pos = torch.arange(width, device=device)
|
||||
text_start = max(height, width)
|
||||
|
||||
f_pos = torch.arange(frame, device=device)
|
||||
img_ids = torch.zeros(frame, height, width, 3, device=device)
|
||||
img_ids[..., 0] = f_pos[:, None, None]
|
||||
img_ids[..., 1] = h_pos[None, :, None]
|
||||
img_ids[..., 2] = w_pos[None, None, :]
|
||||
img_ids = img_ids.reshape(-1, 3)
|
||||
|
||||
# Text positions replicate across all 3 axes (matches original packing).
|
||||
txt_pos = torch.arange(text_start, text_start + text_seq_len, device=device).float()
|
||||
txt_ids = txt_pos[:, None].expand(text_seq_len, 3)
|
||||
|
||||
return torch.cat([img_ids, txt_ids], dim=0)
|
||||
|
||||
|
||||
class _TimestepEmbedder(nn.Module):
|
||||
def __init__(self, in_channels: int, time_embed_dim: int, dtype=None, device=None, operations=None) -> None:
|
||||
super().__init__()
|
||||
self.linear_1 = operations.Linear(in_channels, time_embed_dim, dtype=dtype, device=device)
|
||||
self.linear_2 = operations.Linear(time_embed_dim, time_embed_dim, dtype=dtype, device=device)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
x = self.linear_1(x)
|
||||
x = F.silu(x)
|
||||
return self.linear_2(x)
|
||||
|
||||
|
||||
class LensTimestepProjEmbeddings(nn.Module):
|
||||
def __init__(self, embedding_dim: int, dtype=None, device=None, operations=None) -> None:
|
||||
super().__init__()
|
||||
self.timestep_embedder = _TimestepEmbedder(256, embedding_dim, dtype=dtype, device=device, operations=operations)
|
||||
|
||||
def forward(self, timestep: torch.Tensor, hidden_states: torch.Tensor) -> torch.Tensor:
|
||||
proj = _lens_time_proj(timestep, 256)
|
||||
return self.timestep_embedder(proj.to(dtype=hidden_states.dtype))
|
||||
|
||||
|
||||
class GateMLP(nn.Module):
|
||||
"""SwiGLU MLP."""
|
||||
|
||||
def __init__(self, dim: int, hidden_dim: int, dtype=None, device=None, operations=None) -> None:
|
||||
super().__init__()
|
||||
self.w1 = operations.Linear(dim, hidden_dim, bias=False, dtype=dtype, device=device)
|
||||
self.w2 = operations.Linear(hidden_dim, dim, bias=False, dtype=dtype, device=device)
|
||||
self.w3 = operations.Linear(dim, hidden_dim, bias=False, dtype=dtype, device=device)
|
||||
|
||||
def forward(self, x):
|
||||
return self.w2(F.silu(self.w1(x), inplace=True).mul_(self.w3(x)))
|
||||
|
||||
|
||||
class LensJointAttention(nn.Module):
|
||||
"""Joint image+text attention with fused QKV per stream."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
query_dim: int,
|
||||
added_kv_proj_dim: int,
|
||||
dim_head: int = 64,
|
||||
heads: int = 8,
|
||||
out_dim: Optional[int] = None,
|
||||
eps: float = 1e-5,
|
||||
dtype=None,
|
||||
device=None,
|
||||
operations=None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self.inner_dim = out_dim if out_dim is not None else dim_head * heads
|
||||
self.heads = self.inner_dim // dim_head
|
||||
self.dim_head = dim_head
|
||||
self.out_dim = out_dim if out_dim is not None else query_dim
|
||||
|
||||
self.norm_q = operations.RMSNorm(dim_head, eps=eps, dtype=dtype, device=device)
|
||||
self.norm_k = operations.RMSNorm(dim_head, eps=eps, dtype=dtype, device=device)
|
||||
self.norm_added_q = operations.RMSNorm(dim_head, eps=eps, dtype=dtype, device=device)
|
||||
self.norm_added_k = operations.RMSNorm(dim_head, eps=eps, dtype=dtype, device=device)
|
||||
|
||||
self.img_qkv = operations.Linear(query_dim, 3 * self.inner_dim, bias=True, dtype=dtype, device=device)
|
||||
self.txt_qkv = operations.Linear(added_kv_proj_dim, 3 * self.inner_dim, bias=True, dtype=dtype, device=device)
|
||||
|
||||
# ModuleList([Linear, Identity]) for state-dict key compatibility.
|
||||
self.to_out = nn.ModuleList([
|
||||
operations.Linear(self.inner_dim, self.out_dim, bias=True, dtype=dtype, device=device),
|
||||
nn.Identity(),
|
||||
])
|
||||
self.to_add_out = operations.Linear(self.inner_dim, query_dim, bias=True, dtype=dtype, device=device)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
encoder_hidden_states: torch.Tensor,
|
||||
freqs_cis: torch.Tensor,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
transformer_options: Optional[Dict[str, Any]] = None,
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
bsz, seq_img, _ = hidden_states.shape
|
||||
seq_txt = encoder_hidden_states.shape[1]
|
||||
|
||||
# image stream
|
||||
img_qkv = self.img_qkv(hidden_states).view(bsz, seq_img, 3, self.heads, self.dim_head)
|
||||
img_q, img_k, img_v = img_qkv.unbind(dim=2)
|
||||
img_q = self.norm_q(img_q)
|
||||
img_k = self.norm_k(img_k)
|
||||
del img_qkv
|
||||
|
||||
# text stream
|
||||
txt_qkv = self.txt_qkv(encoder_hidden_states).view(bsz, seq_txt, 3, self.heads, self.dim_head)
|
||||
txt_q, txt_k, txt_v = txt_qkv.unbind(dim=2)
|
||||
txt_q = self.norm_added_q(txt_q)
|
||||
txt_k = self.norm_added_k(txt_k)
|
||||
|
||||
# [B, S, H, D] → [B, H, S, D] for attention, dels to avoid VRAM peaks
|
||||
q = torch.cat([img_q, txt_q], dim=1).transpose(1, 2)
|
||||
del img_q, txt_q
|
||||
k = torch.cat([img_k, txt_k], dim=1).transpose(1, 2)
|
||||
del img_k, txt_k
|
||||
v = torch.cat([img_v, txt_v], dim=1).transpose(1, 2)
|
||||
del img_v, txt_v
|
||||
|
||||
q, k = apply_rope(q, k, freqs_cis)
|
||||
|
||||
if attention_mask is not None:
|
||||
expected = (bsz, 1, 1, seq_img + seq_txt)
|
||||
if attention_mask.shape != expected:
|
||||
raise ValueError(
|
||||
f"attention_mask must be {expected}, got {tuple(attention_mask.shape)}"
|
||||
)
|
||||
attention_mask = attention_mask.to(q.dtype)
|
||||
|
||||
out = optimized_attention(
|
||||
q, k, v, self.heads, mask=attention_mask, skip_reshape=True,
|
||||
transformer_options=transformer_options,
|
||||
)
|
||||
|
||||
img_out = self.to_out[1](self.to_out[0](out[:, :seq_img, :]))
|
||||
txt_out = self.to_add_out(out[:, seq_img:, :])
|
||||
return img_out, txt_out
|
||||
|
||||
|
||||
class LensTransformerBlock(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
dim: int,
|
||||
num_attention_heads: int,
|
||||
attention_head_dim: int,
|
||||
eps: float = 1e-6,
|
||||
rms_norm: bool = True,
|
||||
dtype=None,
|
||||
device=None,
|
||||
operations=None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.attn = LensJointAttention(
|
||||
query_dim=dim,
|
||||
added_kv_proj_dim=dim,
|
||||
dim_head=attention_head_dim,
|
||||
heads=num_attention_heads,
|
||||
out_dim=dim,
|
||||
eps=1e-5,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
operations=operations,
|
||||
)
|
||||
|
||||
if rms_norm:
|
||||
NormCls = operations.RMSNorm
|
||||
norm_kwargs = {}
|
||||
else:
|
||||
NormCls = operations.LayerNorm
|
||||
norm_kwargs = {"elementwise_affine": False}
|
||||
|
||||
mlp_hidden = int(dim / 3 * 8)
|
||||
|
||||
# Sequential(SiLU, Linear) so state-dict lands at img_mod.1.{weight,bias}.
|
||||
self.img_mod = nn.Sequential(
|
||||
nn.SiLU(),
|
||||
operations.Linear(dim, 6 * dim, bias=True, dtype=dtype, device=device),
|
||||
)
|
||||
self.img_norm1 = NormCls(dim, eps=eps, dtype=dtype, device=device, **norm_kwargs)
|
||||
self.img_norm2 = NormCls(dim, eps=eps, dtype=dtype, device=device, **norm_kwargs)
|
||||
self.img_mlp = GateMLP(dim, mlp_hidden, dtype=dtype, device=device, operations=operations)
|
||||
|
||||
self.txt_mod = nn.Sequential(
|
||||
nn.SiLU(),
|
||||
operations.Linear(dim, 6 * dim, bias=True, dtype=dtype, device=device),
|
||||
)
|
||||
self.txt_norm1 = NormCls(dim, eps=eps, dtype=dtype, device=device, **norm_kwargs)
|
||||
self.txt_norm2 = NormCls(dim, eps=eps, dtype=dtype, device=device, **norm_kwargs)
|
||||
self.txt_mlp = GateMLP(dim, mlp_hidden, dtype=dtype, device=device, operations=operations)
|
||||
|
||||
@staticmethod
|
||||
def _modulate(x: torch.Tensor, mod_params: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
shift, scale, gate = mod_params.chunk(3, dim=-1)
|
||||
return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1), gate.unsqueeze(1)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
encoder_hidden_states: torch.Tensor,
|
||||
temb: torch.Tensor,
|
||||
freqs_cis: torch.Tensor,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
transformer_options: Optional[Dict[str, Any]] = None,
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
img_mod1, img_mod2 = self.img_mod(temb).chunk(2, dim=-1)
|
||||
txt_mod1, txt_mod2 = self.txt_mod(temb).chunk(2, dim=-1)
|
||||
|
||||
img_modulated, img_gate1 = self._modulate(self.img_norm1(hidden_states), img_mod1)
|
||||
txt_modulated, txt_gate1 = self._modulate(self.txt_norm1(encoder_hidden_states), txt_mod1)
|
||||
|
||||
img_attn, txt_attn = self.attn(
|
||||
hidden_states=img_modulated,
|
||||
encoder_hidden_states=txt_modulated,
|
||||
freqs_cis=freqs_cis,
|
||||
attention_mask=attention_mask,
|
||||
transformer_options=transformer_options,
|
||||
)
|
||||
|
||||
hidden_states = hidden_states + img_gate1 * img_attn
|
||||
encoder_hidden_states = encoder_hidden_states + txt_gate1 * txt_attn
|
||||
|
||||
img_modulated2, img_gate2 = self._modulate(self.img_norm2(hidden_states), img_mod2)
|
||||
hidden_states = hidden_states + img_gate2 * self.img_mlp(img_modulated2)
|
||||
|
||||
txt_modulated2, txt_gate2 = self._modulate(self.txt_norm2(encoder_hidden_states), txt_mod2)
|
||||
encoder_hidden_states = encoder_hidden_states + txt_gate2 * self.txt_mlp(txt_modulated2)
|
||||
|
||||
return encoder_hidden_states, hidden_states
|
||||
|
||||
|
||||
class _AdaLayerNormContinuousNoAffine(nn.Module):
|
||||
"""AdaLayerNormContinuous(elementwise_affine=False).
|
||||
|
||||
The reference uses ``scale, shift = chunk(2)`` (scale first) — opposite
|
||||
to Flux's ``LastLayer``.
|
||||
"""
|
||||
|
||||
def __init__(self, embedding_dim: int, conditioning_embedding_dim: int, eps: float = 1e-6,
|
||||
dtype=None, device=None, operations=None) -> None:
|
||||
super().__init__()
|
||||
self.linear = operations.Linear(
|
||||
conditioning_embedding_dim, embedding_dim * 2, bias=True, dtype=dtype, device=device
|
||||
)
|
||||
self.eps = eps
|
||||
self.embedding_dim = embedding_dim
|
||||
|
||||
def forward(self, x: torch.Tensor, conditioning: torch.Tensor) -> torch.Tensor:
|
||||
emb = self.linear(F.silu(conditioning))
|
||||
scale, shift = torch.chunk(emb, 2, dim=-1)
|
||||
x = F.layer_norm(x, (self.embedding_dim,), None, None, self.eps)
|
||||
return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
|
||||
|
||||
|
||||
class LensTransformer2DModel(nn.Module):
|
||||
"""Lens dual-stream MMDiT (48 blocks, inner_dim=1536, multi-layer text)."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
patch_size: int = 2,
|
||||
in_channels: int = 128,
|
||||
out_channels: Optional[int] = 32,
|
||||
num_layers: int = 48,
|
||||
attention_head_dim: int = 64,
|
||||
num_attention_heads: int = 24,
|
||||
enc_hidden_dim: int = 2880,
|
||||
axes_dims_rope: Tuple[int, int, int] = (8, 28, 28),
|
||||
rms_norm: bool = True,
|
||||
multi_layer_encoder_feature: bool = True,
|
||||
selected_layer_index: Tuple[int, ...] = (5, 11, 17, 23),
|
||||
image_model=None, # unused; accepted for detection-side configs.
|
||||
dtype=None,
|
||||
device=None,
|
||||
operations=None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self.patch_size = patch_size
|
||||
self.in_channels = in_channels
|
||||
self.out_channels = out_channels if out_channels is not None else in_channels
|
||||
self.inner_dim = num_attention_heads * attention_head_dim
|
||||
self.multi_layer_encoder_feature = multi_layer_encoder_feature
|
||||
self.selected_layer_index = list(selected_layer_index)
|
||||
self.dtype = dtype
|
||||
|
||||
self.pos_embed = EmbedND(dim=attention_head_dim, theta=10000, axes_dim=list(axes_dims_rope))
|
||||
self.time_text_embed = LensTimestepProjEmbeddings(
|
||||
embedding_dim=self.inner_dim, dtype=dtype, device=device, operations=operations
|
||||
)
|
||||
|
||||
if self.multi_layer_encoder_feature:
|
||||
self.txt_norm = nn.ModuleList(
|
||||
[operations.RMSNorm(enc_hidden_dim, eps=1e-5, dtype=dtype, device=device)
|
||||
for _ in self.selected_layer_index]
|
||||
)
|
||||
self.txt_in = operations.Linear(
|
||||
enc_hidden_dim * len(self.selected_layer_index),
|
||||
self.inner_dim, bias=True, dtype=dtype, device=device,
|
||||
)
|
||||
else:
|
||||
self.txt_norm = operations.RMSNorm(enc_hidden_dim, eps=1e-5, dtype=dtype, device=device)
|
||||
self.txt_in = operations.Linear(enc_hidden_dim, self.inner_dim, bias=True, dtype=dtype, device=device)
|
||||
|
||||
self.img_in = operations.Linear(in_channels, self.inner_dim, bias=True, dtype=dtype, device=device)
|
||||
|
||||
self.transformer_blocks = nn.ModuleList([
|
||||
LensTransformerBlock(
|
||||
dim=self.inner_dim,
|
||||
num_attention_heads=num_attention_heads,
|
||||
attention_head_dim=attention_head_dim,
|
||||
eps=1e-6,
|
||||
rms_norm=rms_norm,
|
||||
dtype=dtype, device=device, operations=operations,
|
||||
)
|
||||
for _ in range(num_layers)
|
||||
])
|
||||
|
||||
self.norm_out = _AdaLayerNormContinuousNoAffine(
|
||||
self.inner_dim, self.inner_dim, eps=1e-6,
|
||||
dtype=dtype, device=device, operations=operations,
|
||||
)
|
||||
self.proj_out = operations.Linear(
|
||||
self.inner_dim, patch_size * patch_size * self.out_channels, bias=True,
|
||||
dtype=dtype, device=device,
|
||||
)
|
||||
|
||||
def forward(self, x: torch.Tensor, timestep: torch.Tensor, context: torch.Tensor, attention_mask: Optional[torch.Tensor] = None,
|
||||
transformer_options: Optional[Dict[str, Any]] = None, **kwargs) -> torch.Tensor:
|
||||
if transformer_options is None:
|
||||
transformer_options = {}
|
||||
return comfy.patcher_extension.WrapperExecutor.new_class_executor(
|
||||
self._forward, self,
|
||||
comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options),
|
||||
).execute(x, timestep, context, attention_mask, transformer_options, **kwargs)
|
||||
|
||||
def _forward(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
timestep: torch.Tensor,
|
||||
context: torch.Tensor,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
transformer_options: Optional[Dict[str, Any]] = None,
|
||||
control: Optional[Dict[str, Any]] = None,
|
||||
**kwargs,
|
||||
) -> torch.Tensor:
|
||||
"""ComfyUI bridge: ``(x[B,128,h,w], t[B], context[B,S,L*H], mask[B,S])``."""
|
||||
if transformer_options is None:
|
||||
transformer_options = {}
|
||||
transformer_options = transformer_options.copy()
|
||||
patches = transformer_options.get("patches", {})
|
||||
patches_replace = transformer_options.get("patches_replace", {})
|
||||
blocks_replace = patches_replace.get("dit", {})
|
||||
|
||||
B, C, h, w = x.shape
|
||||
hidden_states = x.permute(0, 2, 3, 1).reshape(B, h * w, C)
|
||||
|
||||
if self.multi_layer_encoder_feature:
|
||||
L = len(self.selected_layer_index)
|
||||
enc_dim = context.shape[-1] // L
|
||||
encoder_hidden_states = list(
|
||||
context.reshape(B, -1, L, enc_dim).unbind(dim=2)
|
||||
)
|
||||
text_seq_len = encoder_hidden_states[0].shape[1]
|
||||
else:
|
||||
encoder_hidden_states = context
|
||||
text_seq_len = context.shape[1]
|
||||
|
||||
if attention_mask is None:
|
||||
attention_mask = torch.ones(
|
||||
(B, text_seq_len), dtype=torch.bool, device=x.device
|
||||
)
|
||||
|
||||
img_len = h * w
|
||||
joint_mask = self._build_joint_attention_mask(attention_mask, img_len)
|
||||
|
||||
hidden_states = self.img_in(hidden_states)
|
||||
timestep = timestep.to(hidden_states.dtype)
|
||||
|
||||
if self.multi_layer_encoder_feature:
|
||||
normed = [self.txt_norm[i](encoder_hidden_states[i]) for i in range(L)]
|
||||
encoder_hidden_states = torch.cat(normed, dim=-1)
|
||||
else:
|
||||
encoder_hidden_states = self.txt_norm(encoder_hidden_states)
|
||||
encoder_hidden_states = self.txt_in(encoder_hidden_states)
|
||||
|
||||
if "post_input" in patches:
|
||||
for p in patches["post_input"]:
|
||||
out = p({
|
||||
"img": hidden_states,
|
||||
"txt": encoder_hidden_states,
|
||||
"transformer_options": transformer_options,
|
||||
})
|
||||
hidden_states = out["img"]
|
||||
encoder_hidden_states = out["txt"]
|
||||
|
||||
temb = self.time_text_embed(timestep, hidden_states)
|
||||
ids = _lens_position_ids(1, h, w, text_seq_len, device=hidden_states.device).unsqueeze(0)
|
||||
freqs_cis = self.pos_embed(ids)
|
||||
|
||||
transformer_options["total_blocks"] = len(self.transformer_blocks)
|
||||
transformer_options["block_type"] = "double"
|
||||
for i, block in enumerate(self.transformer_blocks):
|
||||
transformer_options["block_index"] = i
|
||||
if ("double_block", i) in blocks_replace:
|
||||
def block_wrap(args):
|
||||
out = {}
|
||||
out["txt"], out["img"] = block(
|
||||
hidden_states=args["img"],
|
||||
encoder_hidden_states=args["txt"],
|
||||
temb=args["vec"],
|
||||
freqs_cis=args["pe"],
|
||||
attention_mask=args.get("attn_mask"),
|
||||
transformer_options=args.get("transformer_options"),
|
||||
)
|
||||
return out
|
||||
out = blocks_replace[("double_block", i)](
|
||||
{
|
||||
"img": hidden_states,
|
||||
"txt": encoder_hidden_states,
|
||||
"vec": temb,
|
||||
"pe": freqs_cis,
|
||||
"attn_mask": joint_mask,
|
||||
"transformer_options": transformer_options,
|
||||
},
|
||||
{"original_block": block_wrap},
|
||||
)
|
||||
encoder_hidden_states = out["txt"]
|
||||
hidden_states = out["img"]
|
||||
else:
|
||||
encoder_hidden_states, hidden_states = block(
|
||||
hidden_states=hidden_states,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
temb=temb,
|
||||
freqs_cis=freqs_cis,
|
||||
attention_mask=joint_mask,
|
||||
transformer_options=transformer_options,
|
||||
)
|
||||
|
||||
if "double_block" in patches:
|
||||
for p in patches["double_block"]:
|
||||
out = p({
|
||||
"img": hidden_states,
|
||||
"txt": encoder_hidden_states,
|
||||
"x": x,
|
||||
"block_index": i,
|
||||
"transformer_options": transformer_options,
|
||||
})
|
||||
hidden_states = out["img"]
|
||||
encoder_hidden_states = out["txt"]
|
||||
|
||||
if control is not None:
|
||||
control_i = control.get("input")
|
||||
if control_i is not None and i < len(control_i):
|
||||
add = control_i[i]
|
||||
if add is not None:
|
||||
hidden_states[:, :add.shape[1]] += add
|
||||
|
||||
hidden_states = self.norm_out(hidden_states, temb)
|
||||
out = self.proj_out(hidden_states)
|
||||
return out.reshape(B, h, w, C).permute(0, 3, 1, 2).contiguous()
|
||||
|
||||
@staticmethod
|
||||
def _build_joint_attention_mask(text_mask: torch.Tensor, img_len: int) -> torch.Tensor:
|
||||
if text_mask.dtype != torch.bool:
|
||||
text_mask = text_mask.bool()
|
||||
bsz = text_mask.shape[0]
|
||||
img_ones = torch.ones((bsz, img_len), dtype=torch.bool, device=text_mask.device)
|
||||
joint = torch.cat([img_ones, text_mask], dim=1)
|
||||
additive = torch.zeros_like(joint, dtype=torch.float32)
|
||||
additive.masked_fill_(~joint, torch.finfo(torch.float32).min)
|
||||
return additive[:, None, None, :]
|
||||
@ -767,25 +767,25 @@ class LTXAVModel(LTXVModel):
|
||||
|
||||
# Cross-attention timesteps - compress these too
|
||||
av_ca_audio_scale_shift_timestep, _ = self.av_ca_audio_scale_shift_adaln_single(
|
||||
timestep.max().expand_as(a_timestep_flat),
|
||||
a_timestep_flat,
|
||||
{"resolution": None, "aspect_ratio": None},
|
||||
batch_size=batch_size,
|
||||
hidden_dtype=hidden_dtype,
|
||||
)
|
||||
av_ca_video_scale_shift_timestep, _ = self.av_ca_video_scale_shift_adaln_single(
|
||||
a_timestep.max().expand_as(timestep_flat),
|
||||
timestep_flat,
|
||||
{"resolution": None, "aspect_ratio": None},
|
||||
batch_size=batch_size,
|
||||
hidden_dtype=hidden_dtype,
|
||||
)
|
||||
av_ca_a2v_gate_noise_timestep, _ = self.av_ca_a2v_gate_adaln_single(
|
||||
a_timestep.max().expand_as(timestep_flat) * av_ca_factor,
|
||||
a_timestep_scaled.max().expand_as(timestep_flat) * av_ca_factor,
|
||||
{"resolution": None, "aspect_ratio": None},
|
||||
batch_size=batch_size,
|
||||
hidden_dtype=hidden_dtype,
|
||||
)
|
||||
av_ca_v2a_gate_noise_timestep, _ = self.av_ca_v2a_gate_adaln_single(
|
||||
timestep.max().expand_as(a_timestep_flat) * av_ca_factor,
|
||||
timestep_scaled.max().expand_as(a_timestep_flat) * av_ca_factor,
|
||||
{"resolution": None, "aspect_ratio": None},
|
||||
batch_size=batch_size,
|
||||
hidden_dtype=hidden_dtype,
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
from __future__ import annotations
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
from __future__ import annotations
|
||||
import threading
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
# Code from: https://github.com/Alpha-VLLM/Lumina-Image-2.0/blob/main/models/model.py
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
|
||||
@ -810,12 +810,12 @@ optimized_attention = attention_basic
|
||||
if model_management.sage_attention_enabled():
|
||||
logging.info("Using sage attention")
|
||||
optimized_attention = attention_sage
|
||||
elif model_management.xformers_enabled():
|
||||
logging.info("Using xformers attention")
|
||||
optimized_attention = attention_xformers
|
||||
elif model_management.flash_attention_enabled():
|
||||
logging.info("Using Flash Attention")
|
||||
optimized_attention = attention_flash
|
||||
elif model_management.xformers_enabled():
|
||||
logging.info("Using xformers attention")
|
||||
optimized_attention = attention_xformers
|
||||
elif model_management.pytorch_attention_enabled():
|
||||
logging.info("Using pytorch attention")
|
||||
optimized_attention = attention_pytorch
|
||||
|
||||
@ -211,7 +211,7 @@ class TimestepEmbedder(nn.Module):
|
||||
Embeds scalar timesteps into vector representations.
|
||||
"""
|
||||
|
||||
def __init__(self, hidden_size, frequency_embedding_size=256, output_size=None, dtype=None, device=None, operations=None):
|
||||
def __init__(self, hidden_size, frequency_embedding_size=256, output_size=None, dtype=None, device=None, operations=None, max_period=10000):
|
||||
super().__init__()
|
||||
if output_size is None:
|
||||
output_size = hidden_size
|
||||
@ -221,9 +221,10 @@ class TimestepEmbedder(nn.Module):
|
||||
operations.Linear(hidden_size, output_size, bias=True, dtype=dtype, device=device),
|
||||
)
|
||||
self.frequency_embedding_size = frequency_embedding_size
|
||||
self.max_period = max_period
|
||||
|
||||
def forward(self, t, dtype, **kwargs):
|
||||
t_freq = timestep_embedding(t, self.frequency_embedding_size).to(dtype)
|
||||
t_freq = timestep_embedding(t, self.frequency_embedding_size, max_period=self.max_period).to(dtype)
|
||||
t_emb = self.mlp(t_freq)
|
||||
return t_emb
|
||||
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
"""Pure-torch + scipy geometry helpers for MoGe inference and mesh export."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional, Tuple
|
||||
|
||||
|
||||
@ -4,7 +4,6 @@ V1: DINOv2 backbone + multi-output head (points, mask).
|
||||
V2: DINOv2 encoder + neck + per-output heads (points, mask, normal, optional metric-scale MLP).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from numbers import Number
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
"""Building blocks for MoGe: residual conv stack, resamplers, MLP, DINOv2 encoder, v1 head."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import List, Optional, Sequence, Tuple, Union
|
||||
|
||||
|
||||
@ -6,7 +6,6 @@ equirect distance map via a multi-scale Poisson + gradient sparse solve.
|
||||
Image sampling uses F.grid_sample (GPU); the sparse solve uses lsmr (CPU).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Callable, List, Optional, Tuple
|
||||
|
||||
|
||||
239
comfy/ldm/pixeldit/model.py
Normal file
239
comfy/ldm/pixeldit/model.py
Normal file
@ -0,0 +1,239 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
import comfy.ldm.common_dit
|
||||
import comfy.patcher_extension
|
||||
from comfy.ldm.flux.math import apply_rope, rope
|
||||
from comfy.ldm.hidream.model import FeedForwardSwiGLU
|
||||
from comfy.ldm.modules.attention import optimized_attention
|
||||
from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder
|
||||
|
||||
from .modules import (
|
||||
FinalLayer,
|
||||
PatchTokenEmbedder,
|
||||
PiTBlock,
|
||||
PixelTokenEmbedder,
|
||||
apply_adaln_,
|
||||
precompute_freqs_cis_2d,
|
||||
)
|
||||
|
||||
|
||||
class MMDiTJointAttention(nn.Module):
|
||||
"""Joint MMDiT attention with separate Q/K/V/proj for image and text streams.
|
||||
|
||||
RoPE is applied to each stream before concatenation so each stream uses its own
|
||||
2D/1D positional encoding. Concat order is [text, image] (text first).
|
||||
"""
|
||||
def __init__(self, dim, num_heads=8, qkv_bias=False, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
assert dim % num_heads == 0
|
||||
self.num_heads = num_heads
|
||||
self.head_dim = dim // num_heads
|
||||
|
||||
self.qkv_x = operations.Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype, device=device)
|
||||
self.qkv_y = operations.Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype, device=device)
|
||||
|
||||
self.q_norm_x = operations.RMSNorm(self.head_dim, eps=1e-6, dtype=dtype, device=device)
|
||||
self.k_norm_x = operations.RMSNorm(self.head_dim, eps=1e-6, dtype=dtype, device=device)
|
||||
self.q_norm_y = operations.RMSNorm(self.head_dim, eps=1e-6, dtype=dtype, device=device)
|
||||
self.k_norm_y = operations.RMSNorm(self.head_dim, eps=1e-6, dtype=dtype, device=device)
|
||||
|
||||
self.proj_x = operations.Linear(dim, dim, dtype=dtype, device=device)
|
||||
self.proj_y = operations.Linear(dim, dim, dtype=dtype, device=device)
|
||||
|
||||
def forward(self, x, y, pos_img, pos_txt=None, attn_mask=None, transformer_options={}):
|
||||
B, Nx, _ = x.shape
|
||||
_, Ny, _ = y.shape
|
||||
H = self.num_heads
|
||||
D = self.head_dim
|
||||
|
||||
qkv_x = self.qkv_x(x).reshape(B, Nx, 3, H, D).permute(2, 0, 3, 1, 4)
|
||||
qx, kx, vx = qkv_x.unbind(0)
|
||||
qx = self.q_norm_x(qx)
|
||||
kx = self.k_norm_x(kx)
|
||||
|
||||
qkv_y = self.qkv_y(y).reshape(B, Ny, 3, H, D).permute(2, 0, 3, 1, 4)
|
||||
qy, ky, vy = qkv_y.unbind(0)
|
||||
qy = self.q_norm_y(qy)
|
||||
ky = self.k_norm_y(ky)
|
||||
|
||||
qx, kx = apply_rope(qx, kx, pos_img[None, None])
|
||||
if pos_txt is not None:
|
||||
qy, ky = apply_rope(qy, ky, pos_txt[None, None])
|
||||
|
||||
q_joint = torch.cat([qy, qx], dim=2)
|
||||
k_joint = torch.cat([ky, kx], dim=2)
|
||||
v_joint = torch.cat([vy, vx], dim=2)
|
||||
|
||||
out_joint = optimized_attention(
|
||||
q_joint, k_joint, v_joint, H,
|
||||
mask=attn_mask, skip_reshape=True, skip_output_reshape=True,
|
||||
transformer_options=transformer_options,
|
||||
)
|
||||
|
||||
out_y = out_joint[:, :, :Ny, :].transpose(1, 2).reshape(B, Ny, H * D)
|
||||
out_x = out_joint[:, :, Ny:, :].transpose(1, 2).reshape(B, Nx, H * D)
|
||||
|
||||
return self.proj_x(out_x), self.proj_y(out_y)
|
||||
|
||||
|
||||
class MMDiTBlockT2I(nn.Module):
|
||||
def __init__(self, hidden_size, groups, mlp_ratio=4.0, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
self.norm_x1 = operations.RMSNorm(hidden_size, eps=1e-6, dtype=dtype, device=device)
|
||||
self.norm_y1 = operations.RMSNorm(hidden_size, eps=1e-6, dtype=dtype, device=device)
|
||||
self.attn = MMDiTJointAttention(hidden_size, num_heads=groups, qkv_bias=False, dtype=dtype, device=device, operations=operations)
|
||||
self.norm_x2 = operations.RMSNorm(hidden_size, eps=1e-6, dtype=dtype, device=device)
|
||||
self.norm_y2 = operations.RMSNorm(hidden_size, eps=1e-6, dtype=dtype, device=device)
|
||||
mlp_hidden_dim = int(hidden_size * mlp_ratio)
|
||||
self.mlp_x = FeedForwardSwiGLU(hidden_size, mlp_hidden_dim, multiple_of=1, dtype=dtype, device=device, operations=operations)
|
||||
self.mlp_y = FeedForwardSwiGLU(hidden_size, mlp_hidden_dim, multiple_of=1, dtype=dtype, device=device, operations=operations)
|
||||
self.adaLN_modulation_img = nn.Sequential(operations.Linear(hidden_size, 6 * hidden_size, bias=True, dtype=dtype, device=device))
|
||||
self.adaLN_modulation_txt = nn.Sequential(operations.Linear(hidden_size, 6 * hidden_size, bias=True, dtype=dtype, device=device))
|
||||
|
||||
def forward(self, x, y, c, pos_img, pos_txt=None, attn_mask=None, transformer_options={}):
|
||||
shift_msa_x, scale_msa_x, gate_msa_x, shift_mlp_x, scale_mlp_x, gate_mlp_x = self.adaLN_modulation_img(c).chunk(6, dim=-1)
|
||||
shift_msa_y, scale_msa_y, gate_msa_y, shift_mlp_y, scale_mlp_y, gate_mlp_y = self.adaLN_modulation_txt(c).chunk(6, dim=-1)
|
||||
|
||||
x_norm = apply_adaln_(self.norm_x1(x), shift_msa_x, scale_msa_x)
|
||||
y_norm = apply_adaln_(self.norm_y1(y), shift_msa_y, scale_msa_y)
|
||||
attn_x, attn_y = self.attn(x_norm, y_norm, pos_img, pos_txt, attn_mask, transformer_options=transformer_options)
|
||||
x = torch.addcmul(x, gate_msa_x, attn_x)
|
||||
y = torch.addcmul(y, gate_msa_y, attn_y)
|
||||
|
||||
x = torch.addcmul(x, gate_mlp_x, self.mlp_x(apply_adaln_(self.norm_x2(x), shift_mlp_x, scale_mlp_x)))
|
||||
y = torch.addcmul(y, gate_mlp_y, self.mlp_y(apply_adaln_(self.norm_y2(y), shift_mlp_y, scale_mlp_y)))
|
||||
return x, y
|
||||
|
||||
|
||||
class PixDiT_T2I(nn.Module):
|
||||
"""PixelDiT T2I model. Hardcoded for the released 1024px Stage-3 checkpoint
|
||||
(also runs at 512px when fed the appropriate latent size and flow_shift).
|
||||
|
||||
Forward:
|
||||
x: [B, 3, H, W] pixel-space input (no VAE)
|
||||
timesteps:[B] in [0, 1000] (ComfyUI flow sampling convention)
|
||||
context: [B, Ltxt, 2304] Gemma-2-2b-it hidden states (chi_prompt prepended)
|
||||
Returns flow-matching velocity [B, 3, H, W].
|
||||
"""
|
||||
def __init__(
|
||||
self,
|
||||
in_channels=3,
|
||||
num_groups=24,
|
||||
hidden_size=1536,
|
||||
pixel_hidden_size=16,
|
||||
pixel_attn_hidden_size=1152,
|
||||
pixel_num_groups=16,
|
||||
patch_depth=14,
|
||||
pixel_depth=2,
|
||||
patch_size=16,
|
||||
txt_embed_dim=2304,
|
||||
txt_max_length=300,
|
||||
use_text_rope=True,
|
||||
text_rope_theta=10000.0,
|
||||
image_model=None,
|
||||
dtype=None,
|
||||
device=None,
|
||||
operations=None,
|
||||
pixel_mlp_chunks=2,
|
||||
):
|
||||
super().__init__()
|
||||
self.dtype = dtype
|
||||
self.in_channels = in_channels
|
||||
self.out_channels = in_channels
|
||||
self.hidden_size = hidden_size
|
||||
self.num_groups = num_groups
|
||||
self.patch_depth = patch_depth
|
||||
self.pixel_depth = pixel_depth
|
||||
self.patch_size = patch_size
|
||||
self.pixel_hidden_size = pixel_hidden_size
|
||||
self.pixel_attn_hidden_size = pixel_attn_hidden_size
|
||||
self.pixel_num_groups = pixel_num_groups
|
||||
self.txt_embed_dim = txt_embed_dim
|
||||
self.txt_max_length = txt_max_length
|
||||
self.use_text_rope = use_text_rope
|
||||
self.text_rope_theta = text_rope_theta
|
||||
|
||||
self.pixel_embedder = PixelTokenEmbedder(self.in_channels, self.pixel_hidden_size, dtype=dtype, device=device, operations=operations)
|
||||
self.s_embedder = PatchTokenEmbedder(self.in_channels * self.patch_size ** 2, self.hidden_size, bias=True, dtype=dtype, device=device, operations=operations)
|
||||
self.t_embedder = TimestepEmbedder(self.hidden_size, dtype=dtype, device=device, operations=operations, max_period=10)
|
||||
self.y_embedder = PatchTokenEmbedder(self.txt_embed_dim, self.hidden_size, bias=True, use_norm=True, dtype=dtype, device=device, operations=operations)
|
||||
self.y_pos_embedding = nn.Parameter(torch.empty(1, self.txt_max_length, self.hidden_size, dtype=dtype, device=device))
|
||||
|
||||
self.patch_blocks = nn.ModuleList([
|
||||
MMDiTBlockT2I(self.hidden_size, self.num_groups,
|
||||
dtype=dtype, device=device, operations=operations)
|
||||
for _ in range(self.patch_depth)
|
||||
])
|
||||
self.pixel_blocks = nn.ModuleList([
|
||||
PiTBlock(
|
||||
self.pixel_hidden_size,
|
||||
self.hidden_size,
|
||||
patch_size=self.patch_size,
|
||||
num_heads=self.num_groups,
|
||||
attn_hidden_size=self.pixel_attn_hidden_size,
|
||||
attn_num_heads=self.pixel_num_groups,
|
||||
dtype=dtype, device=device, operations=operations,
|
||||
mlp_chunks=pixel_mlp_chunks,
|
||||
)
|
||||
for _ in range(self.pixel_depth)
|
||||
])
|
||||
|
||||
self.final_layer = FinalLayer(self.pixel_hidden_size, self.out_channels, dtype=dtype, device=device, operations=operations)
|
||||
|
||||
def _fetch_patch_pos(self, height, width, device, dtype, **rope_opts):
|
||||
return precompute_freqs_cis_2d(self.hidden_size // self.num_groups, height, width, device=device, dtype=dtype, **rope_opts)
|
||||
|
||||
def _fetch_text_pos(self, length, device, dtype):
|
||||
return rope(torch.arange(length, dtype=torch.float32, device=device).reshape(1, -1), self.hidden_size // self.num_groups, self.text_rope_theta).squeeze(0).to(dtype=dtype)
|
||||
|
||||
def forward(self, x, timesteps, context=None, attention_mask=None, transformer_options={}, **kwargs):
|
||||
return comfy.patcher_extension.WrapperExecutor.new_class_executor(
|
||||
self._forward, self, comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options),
|
||||
).execute(x, timesteps, context, attention_mask, transformer_options, **kwargs)
|
||||
|
||||
def _pre_patch_block(self, s, i, **kwargs):
|
||||
"""Hook for subclasses to inject per-block state into the patch stream (e.g. PiD's LQ gate)."""
|
||||
return s
|
||||
|
||||
def _forward(self, x, timesteps, context=None, attention_mask=None, transformer_options={}, **kwargs):
|
||||
H_orig, W_orig = x.shape[2], x.shape[3]
|
||||
x = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size))
|
||||
B, _, H, W = x.shape
|
||||
Hs = H // self.patch_size
|
||||
Ws = W // self.patch_size
|
||||
L = Hs * Ws
|
||||
|
||||
pos_img = self._fetch_patch_pos(Hs, Ws, x.device, x.dtype, **(transformer_options.get("rope_options") or {}))
|
||||
x_patches = F.unfold(x, kernel_size=self.patch_size, stride=self.patch_size).transpose(1, 2)
|
||||
|
||||
t_emb = self.t_embedder(timesteps.view(-1), x.dtype).view(B, -1, self.hidden_size)
|
||||
|
||||
if context is None or context.dim() != 3:
|
||||
raise ValueError("PixDiT_T2I requires context (text embeddings) of shape [B, L, D]")
|
||||
Ltxt = min(context.shape[1], self.txt_max_length)
|
||||
y = context[:, :Ltxt, :]
|
||||
y_emb = self.y_embedder(y).view(B, Ltxt, self.hidden_size)
|
||||
y_emb = y_emb + self.y_pos_embedding[:, :Ltxt, :].to(y_emb) # y_pos_embedding is a raw nn.Parameter
|
||||
|
||||
condition = F.silu(t_emb)
|
||||
pos_txt = self._fetch_text_pos(Ltxt, x.device, x.dtype) if self.use_text_rope else None
|
||||
|
||||
s = self.s_embedder(x_patches)
|
||||
for i, blk in enumerate(self.patch_blocks):
|
||||
s = self._pre_patch_block(s, i, **kwargs)
|
||||
s, y_emb = blk(s, y_emb, condition, pos_img, pos_txt, None, transformer_options=transformer_options)
|
||||
s = F.silu(t_emb + s)
|
||||
|
||||
s_cond = s.view(B * L, self.hidden_size)
|
||||
x_pixels = self.pixel_embedder(x, patch_size=self.patch_size)
|
||||
for blk in self.pixel_blocks:
|
||||
x_pixels = blk(x_pixels, s_cond, H, W, self.patch_size, mask=None, transformer_options=transformer_options)
|
||||
|
||||
x_pixels = self.final_layer(x_pixels)
|
||||
C_out = self.out_channels
|
||||
P2 = self.patch_size * self.patch_size
|
||||
x_pixels = x_pixels.view(B, L, P2, C_out).permute(0, 3, 2, 1).reshape(B, C_out * P2, L)
|
||||
out = F.fold(x_pixels, (H, W), kernel_size=self.patch_size, stride=self.patch_size)
|
||||
return out[:, :, :H_orig, :W_orig]
|
||||
187
comfy/ldm/pixeldit/modules.py
Normal file
187
comfy/ldm/pixeldit/modules.py
Normal file
@ -0,0 +1,187 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from comfy.ldm.flux.math import apply_rope, rope
|
||||
from comfy.ldm.modules.attention import optimized_attention
|
||||
from comfy.ldm.modules.diffusionmodules.mmdit import Mlp, get_1d_sincos_pos_embed_from_grid_torch
|
||||
|
||||
|
||||
def apply_adaln_(x, shift, scale):
|
||||
return x.addcmul_(x, scale).add_(shift)
|
||||
|
||||
|
||||
def precompute_freqs_cis_2d(dim, height, width, theta=10000.0, scale=16.0,
|
||||
ref_grid_h=None, ref_grid_w=None,
|
||||
scale_x=1.0, scale_y=1.0, shift_x=0.0, shift_y=0.0,
|
||||
device=None, dtype=torch.float32, **kwargs):
|
||||
"""2D RoPE with x/y axis frequencies interleaved at stride 2 across head dim.
|
||||
|
||||
rope_options:
|
||||
scale_x / scale_y multiply the position range (RoPE extrapolation).
|
||||
shift_x / shift_y offset the position origin (tiled / regional inference).
|
||||
With ref_grid_h/w set, also applies NTK-aware per-axis theta scaling
|
||||
(rope_mode='ntk_aware'): theta_axis = theta * (current/ref)^(dim_axis/(dim_axis-2)).
|
||||
Returns Flux-format rotation matrices of shape [H*W, dim/2, 2, 2].
|
||||
Layout of head-dim pairs: [x_0, y_0, x_1, y_1, ..., x_{dim/4-1}, y_{dim/4-1}].
|
||||
"""
|
||||
dim_axis = dim // 2
|
||||
if ref_grid_h is not None and dim_axis > 2:
|
||||
h_ntk = (height / ref_grid_h) ** (dim_axis / (dim_axis - 2))
|
||||
w_ntk = (width / ref_grid_w) ** (dim_axis / (dim_axis - 2))
|
||||
else:
|
||||
h_ntk = w_ntk = 1.0
|
||||
|
||||
x_lin = torch.linspace(shift_x, scale * scale_x + shift_x, width, device=device)
|
||||
y_lin = torch.linspace(shift_y, scale * scale_y + shift_y, height, device=device)
|
||||
y_grid, x_grid = torch.meshgrid(y_lin, x_lin, indexing="ij")
|
||||
x_rope = rope(x_grid.reshape(1, -1), dim_axis, theta * w_ntk).squeeze(0)
|
||||
y_rope = rope(y_grid.reshape(1, -1), dim_axis, theta * h_ntk).squeeze(0)
|
||||
out = torch.stack([x_rope, y_rope], dim=2).reshape(height * width, dim // 2, 2, 2)
|
||||
return out.to(dtype=dtype)
|
||||
|
||||
|
||||
def get_2d_sincos_pos_embed(embed_dim, height, width, device=None, dtype=torch.float32):
|
||||
"""Standard 2D sin/cos absolute positional embedding (ViT-style).
|
||||
|
||||
first half encodes W-coordinates, second half H.
|
||||
"""
|
||||
assert embed_dim % 4 == 0
|
||||
grid_h = torch.arange(height, dtype=torch.float32, device=device)
|
||||
grid_w = torch.arange(width, dtype=torch.float32, device=device)
|
||||
grid_y, grid_x = torch.meshgrid(grid_h, grid_w, indexing="ij")
|
||||
emb_w = get_1d_sincos_pos_embed_from_grid_torch(embed_dim // 2, grid_x.reshape(-1), device=device)
|
||||
emb_h = get_1d_sincos_pos_embed_from_grid_torch(embed_dim // 2, grid_y.reshape(-1), device=device)
|
||||
return torch.cat([emb_w, emb_h], dim=1).to(dtype=dtype)
|
||||
|
||||
|
||||
class RotaryAttention(nn.Module):
|
||||
"""Single-stream self-attention with rotary positional encoding (used inside PiTBlock)."""
|
||||
def __init__(self, dim, num_heads=8, qkv_bias=False, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
assert dim % num_heads == 0
|
||||
self.num_heads = num_heads
|
||||
self.head_dim = dim // num_heads
|
||||
self.qkv = operations.Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype, device=device)
|
||||
self.q_norm = operations.RMSNorm(self.head_dim, eps=1e-6, dtype=dtype, device=device)
|
||||
self.k_norm = operations.RMSNorm(self.head_dim, eps=1e-6, dtype=dtype, device=device)
|
||||
self.proj = operations.Linear(dim, dim, dtype=dtype, device=device)
|
||||
|
||||
def forward(self, x, pos, mask=None, transformer_options={}):
|
||||
B, N, C = x.shape
|
||||
H = self.num_heads
|
||||
D = self.head_dim
|
||||
qkv = self.qkv(x).reshape(B, N, 3, H, D).permute(2, 0, 3, 1, 4)
|
||||
q, k, v = qkv.unbind(0)
|
||||
q, k = apply_rope(self.q_norm(q), self.k_norm(k), pos[None, None])
|
||||
x = optimized_attention(q, k, v, H, mask=mask, skip_reshape=True, transformer_options=transformer_options)
|
||||
return self.proj(x)
|
||||
|
||||
|
||||
class FinalLayer(nn.Module):
|
||||
def __init__(self, hidden_size, out_channels, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
self.norm = operations.RMSNorm(hidden_size, eps=1e-6, dtype=dtype, device=device)
|
||||
self.linear = operations.Linear(hidden_size, out_channels, bias=True, dtype=dtype, device=device)
|
||||
|
||||
def forward(self, x):
|
||||
return self.linear(self.norm(x))
|
||||
|
||||
|
||||
class PatchTokenEmbedder(nn.Module):
|
||||
"""Linear projection used both for patchified-image tokens and text-feature tokens."""
|
||||
def __init__(self, in_chans, embed_dim, use_norm=False, bias=True, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
self.proj = operations.Linear(in_chans, embed_dim, bias=bias, dtype=dtype, device=device)
|
||||
self.norm = operations.RMSNorm(embed_dim, eps=1e-6, dtype=dtype, device=device) if use_norm else nn.Identity()
|
||||
|
||||
def forward(self, x):
|
||||
return self.norm(self.proj(x))
|
||||
|
||||
|
||||
class PixelTokenEmbedder(nn.Module):
|
||||
"""Pixel-level embedder: lifts each RGB pixel to hidden_size and packs into per-patch sequences."""
|
||||
def __init__(self, in_channels, hidden_size_output, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
self.in_channels = in_channels
|
||||
self.hidden_size_output = hidden_size_output
|
||||
self.proj = operations.Linear(self.in_channels, self.hidden_size_output, bias=True, dtype=dtype, device=device)
|
||||
|
||||
def forward(self, inputs, patch_size):
|
||||
B, _, H, W = inputs.shape
|
||||
Hs, Ws = H // patch_size, W // patch_size
|
||||
P2 = patch_size * patch_size
|
||||
x = inputs.permute(0, 2, 3, 1).contiguous()
|
||||
x = self.proj(x)
|
||||
pos_full = get_2d_sincos_pos_embed(self.hidden_size_output, H, W, device=x.device, dtype=x.dtype).view(H, W, self.hidden_size_output)
|
||||
x = x + pos_full.unsqueeze(0)
|
||||
x = x.view(B, Hs, patch_size, Ws, patch_size, self.hidden_size_output)
|
||||
return x.permute(0, 1, 3, 2, 4, 5).reshape(B * Hs * Ws, P2, self.hidden_size_output)
|
||||
|
||||
|
||||
class PiTBlock(nn.Module):
|
||||
"""Pixel-level transformer block.
|
||||
|
||||
Compresses each patch's P^2 pixel tokens → 1 attention token via a linear,
|
||||
runs global self-attention across patches with 2D RoPE, then expands back to P^2 tokens.
|
||||
Conditioning is per-pixel adaLN from the patch-level features.
|
||||
"""
|
||||
def __init__(self, pixel_hidden_size, patch_hidden_size, patch_size, num_heads, mlp_ratio=4.0,
|
||||
attn_hidden_size=None, attn_num_heads=None, dtype=None, device=None, operations=None, mlp_chunks=1):
|
||||
super().__init__()
|
||||
self.pixel_dim = pixel_hidden_size
|
||||
self.context_dim = patch_hidden_size
|
||||
self.attn_dim = attn_hidden_size if attn_hidden_size is not None else patch_hidden_size
|
||||
self.num_heads = attn_num_heads if attn_num_heads is not None else num_heads
|
||||
assert self.attn_dim % self.num_heads == 0
|
||||
|
||||
p2 = patch_size * patch_size
|
||||
self.compress_to_attn = operations.Linear(p2 * self.pixel_dim, self.attn_dim, bias=True, dtype=dtype, device=device)
|
||||
self.expand_from_attn = operations.Linear(self.attn_dim, p2 * self.pixel_dim, bias=True, dtype=dtype, device=device)
|
||||
|
||||
self.norm1 = operations.RMSNorm(self.pixel_dim, eps=1e-6, dtype=dtype, device=device)
|
||||
self.attn = RotaryAttention(self.attn_dim, num_heads=self.num_heads, qkv_bias=False, dtype=dtype, device=device, operations=operations)
|
||||
self.norm2 = operations.RMSNorm(self.pixel_dim, eps=1e-6, dtype=dtype, device=device)
|
||||
self.mlp = Mlp(self.pixel_dim, hidden_features=int(self.pixel_dim * mlp_ratio), dtype=dtype, device=device, operations=operations)
|
||||
|
||||
self.adaLN_modulation_msa = operations.Linear(self.context_dim, 3 * self.pixel_dim * p2, bias=True, dtype=dtype, device=device)
|
||||
self.adaLN_modulation_mlp = operations.Linear(self.context_dim, 3 * self.pixel_dim * p2, bias=True, dtype=dtype, device=device)
|
||||
|
||||
self._rope_fn = precompute_freqs_cis_2d
|
||||
self.mlp_chunks = max(1, int(mlp_chunks))
|
||||
|
||||
def _fetch_pos(self, height, width, device, dtype, **rope_opts):
|
||||
return self._rope_fn(self.attn_dim // self.num_heads, height, width, device=device, dtype=dtype, **rope_opts)
|
||||
|
||||
def forward(self, x, s_cond, image_height, image_width, patch_size, mask=None, transformer_options={}):
|
||||
BL, P2, _ = x.shape
|
||||
Hs, Ws = image_height // patch_size, image_width // patch_size
|
||||
L = Hs * Ws
|
||||
B = BL // L
|
||||
|
||||
# Attention path uses only msa params; compute, use, free before mlp params allocate.
|
||||
msa_params = self.adaLN_modulation_msa(s_cond).view(BL, P2, 3 * self.pixel_dim)
|
||||
shift_msa, scale_msa, gate_msa = msa_params.chunk(3, dim=-1)
|
||||
|
||||
x_norm = apply_adaln_(self.norm1(x), shift_msa, scale_msa)
|
||||
x_flat = x_norm.view(BL, P2 * self.pixel_dim)
|
||||
|
||||
x_comp = self.compress_to_attn(x_flat).view(B, L, self.attn_dim)
|
||||
pos_comp = self._fetch_pos(Hs, Ws, x.device, x.dtype, **(transformer_options.get("rope_options") or {}))
|
||||
attn_out = self.attn(x_comp, pos_comp, mask=mask, transformer_options=transformer_options)
|
||||
attn_flat = self.expand_from_attn(attn_out.view(B * L, self.attn_dim))
|
||||
attn_exp = attn_flat.view(BL, P2, self.pixel_dim)
|
||||
x = torch.addcmul(x, gate_msa, attn_exp)
|
||||
del msa_params, shift_msa, scale_msa, gate_msa
|
||||
|
||||
mlp_params = self.adaLN_modulation_mlp(s_cond).view(BL, P2, 3 * self.pixel_dim)
|
||||
shift_mlp, scale_mlp, gate_mlp = mlp_params.chunk(3, dim=-1)
|
||||
gate_mlp = gate_mlp.contiguous() # detach from mlp_params so the del below frees shift+scale storage before the MLP
|
||||
mlp_input = apply_adaln_(self.norm2(x), shift_mlp, scale_mlp)
|
||||
del mlp_params, shift_mlp, scale_mlp
|
||||
|
||||
# MLP in chunks since the peak memory usage is huge here
|
||||
chunk_size = (BL + self.mlp_chunks - 1) // self.mlp_chunks
|
||||
for s in range(0, BL, chunk_size):
|
||||
e = min(s + chunk_size, BL)
|
||||
x[s:e].addcmul_(gate_mlp[s:e], self.mlp(mlp_input[s:e]))
|
||||
return x
|
||||
227
comfy/ldm/pixeldit/pid.py
Normal file
227
comfy/ldm/pixeldit/pid.py
Normal file
@ -0,0 +1,227 @@
|
||||
"""PiD — Pixel Diffusion Decoder. Decodes a Flux/SD3/Flux2/Z-Image latent
|
||||
directly to a 4x-upscaled image in 4 distilled flow-matching steps. PixDiT_T2I
|
||||
body + LQ projection branch injected before each MMDiT patch block.
|
||||
"""
|
||||
|
||||
from typing import List
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from .model import PixDiT_T2I
|
||||
from .modules import precompute_freqs_cis_2d
|
||||
|
||||
|
||||
class SigmaAwareGatePerTokenPerDim(nn.Module):
|
||||
"""gate = sigmoid(content_proj(cat[x, lq]) - exp(log_alpha) * sigma); out = x + gate * lq.
|
||||
|
||||
Trained init gives ~0.88 gate at sigma=0, ~0.05 at sigma=1.
|
||||
"""
|
||||
|
||||
def __init__(self, dim: int, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
self.content_proj = operations.Linear(dim * 2, dim, dtype=dtype, device=device)
|
||||
self.log_alpha = nn.Parameter(torch.empty((), dtype=dtype, device=device))
|
||||
|
||||
def forward(self, x: torch.Tensor, lq: torch.Tensor, sigma: torch.Tensor) -> torch.Tensor:
|
||||
content_logit = self.content_proj(torch.cat([x, lq], dim=-1))
|
||||
# log_alpha is a raw nn.Parameter -> doesn't auto-cast under dynamic VRAM.
|
||||
log_alpha = self.log_alpha.to(device=x.device, dtype=torch.float32)
|
||||
sigma_offset = -log_alpha.exp() * sigma.float().view(-1, 1, 1)
|
||||
gate = torch.sigmoid(content_logit + sigma_offset)
|
||||
return x + (gate * lq).to(x.dtype)
|
||||
|
||||
|
||||
class ResBlock(nn.Module):
|
||||
"""Pre-activation ResNet block: GN -> SiLU -> Conv -> GN -> SiLU -> Conv + skip."""
|
||||
|
||||
def __init__(self, channels: int, num_groups: int = 4, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
self.block = nn.Sequential(
|
||||
operations.GroupNorm(num_groups, channels, dtype=dtype, device=device),
|
||||
nn.SiLU(),
|
||||
operations.Conv2d(channels, channels, kernel_size=3, padding=1, dtype=dtype, device=device),
|
||||
operations.GroupNorm(num_groups, channels, dtype=dtype, device=device),
|
||||
nn.SiLU(),
|
||||
operations.Conv2d(channels, channels, kernel_size=3, padding=1, dtype=dtype, device=device),
|
||||
)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
return x + self.block(x)
|
||||
|
||||
|
||||
class LQProjection2D(nn.Module):
|
||||
"""LQ latent -> per-block patch-aligned features for controlnet-style injection."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
latent_channels: int,
|
||||
hidden_dim: int = 512,
|
||||
out_dim: int = 1536,
|
||||
patch_size: int = 16,
|
||||
sr_scale: int = 4,
|
||||
latent_spatial_down_factor: int = 8,
|
||||
num_res_blocks: int = 4,
|
||||
num_outputs: int = 7,
|
||||
interval: int = 2,
|
||||
dtype=None, device=None, operations=None,
|
||||
):
|
||||
super().__init__()
|
||||
self.latent_channels = latent_channels
|
||||
self.hidden_dim = hidden_dim
|
||||
self.out_dim = out_dim
|
||||
self.patch_size = patch_size
|
||||
self.sr_scale = sr_scale
|
||||
self.latent_spatial_down_factor = latent_spatial_down_factor
|
||||
self.num_outputs = num_outputs
|
||||
self.interval = interval
|
||||
|
||||
z_to_patch_ratio = (sr_scale * latent_spatial_down_factor) / patch_size
|
||||
self.z_to_patch_ratio = z_to_patch_ratio
|
||||
if z_to_patch_ratio >= 1:
|
||||
self.latent_fold_factor = 0
|
||||
latent_proj_in_ch = latent_channels
|
||||
else:
|
||||
fold_factor = int(1 / z_to_patch_ratio)
|
||||
assert fold_factor * z_to_patch_ratio == 1.0
|
||||
self.latent_fold_factor = fold_factor
|
||||
latent_proj_in_ch = latent_channels * fold_factor * fold_factor
|
||||
|
||||
layers = [
|
||||
operations.Conv2d(latent_proj_in_ch, hidden_dim, kernel_size=3, padding=1, dtype=dtype, device=device),
|
||||
nn.SiLU(),
|
||||
operations.Conv2d(hidden_dim, hidden_dim, kernel_size=3, padding=1, dtype=dtype, device=device),
|
||||
]
|
||||
for _ in range(num_res_blocks):
|
||||
layers.append(ResBlock(hidden_dim, dtype=dtype, device=device, operations=operations))
|
||||
self.latent_proj = nn.Sequential(*layers)
|
||||
|
||||
self.output_heads = nn.ModuleList(
|
||||
[operations.Linear(hidden_dim, out_dim, dtype=dtype, device=device) for _ in range(num_outputs)]
|
||||
)
|
||||
self.gate_modules = nn.ModuleList(
|
||||
[SigmaAwareGatePerTokenPerDim(out_dim, dtype=dtype, device=device, operations=operations)
|
||||
for _ in range(num_outputs)]
|
||||
)
|
||||
|
||||
def is_gate_active(self, block_idx: int) -> bool:
|
||||
return block_idx % self.interval == 0
|
||||
|
||||
def output_index(self, block_idx: int) -> int:
|
||||
return block_idx // self.interval
|
||||
|
||||
def gate(self, x: torch.Tensor, lq_feature: torch.Tensor, sigma: torch.Tensor, out_idx: int) -> torch.Tensor:
|
||||
return self.gate_modules[out_idx](x, lq_feature, sigma)
|
||||
|
||||
def _align_latent_to_patch_grid(self, lq_latent: torch.Tensor, pH: int, pW: int) -> torch.Tensor:
|
||||
B, z_dim = lq_latent.shape[:2]
|
||||
if self.z_to_patch_ratio >= 1:
|
||||
if lq_latent.shape[2] != pH or lq_latent.shape[3] != pW:
|
||||
z_aligned = F.interpolate(lq_latent, size=(pH, pW), mode="nearest")
|
||||
else:
|
||||
z_aligned = lq_latent
|
||||
else:
|
||||
f = self.latent_fold_factor
|
||||
zH_expected, zW_expected = pH * f, pW * f
|
||||
if lq_latent.shape[2] != zH_expected or lq_latent.shape[3] != zW_expected:
|
||||
lq_latent = F.interpolate(lq_latent, size=(zH_expected, zW_expected), mode="nearest")
|
||||
z_aligned = lq_latent.reshape(B, z_dim, pH, f, pW, f).permute(0, 1, 3, 5, 2, 4)
|
||||
z_aligned = z_aligned.reshape(B, z_dim * f * f, pH, pW)
|
||||
return self.latent_proj(z_aligned)
|
||||
|
||||
def forward(self, lq_latent: torch.Tensor, target_pH: int, target_pW: int) -> List[torch.Tensor]:
|
||||
feat = self._align_latent_to_patch_grid(lq_latent, target_pH, target_pW)
|
||||
B, C, H, W = feat.shape
|
||||
tokens = feat.permute(0, 2, 3, 1).contiguous().view(B, H * W, C)
|
||||
return [head(tokens) for head in self.output_heads]
|
||||
|
||||
|
||||
class PidNet(PixDiT_T2I):
|
||||
"""PixDiT_T2I + LQ injection (one sigma-gated feature inserted before each patch block)."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
lq_latent_channels: int = 16,
|
||||
lq_hidden_dim: int = 512,
|
||||
lq_num_res_blocks: int = 4,
|
||||
lq_interval: int = 2,
|
||||
sr_scale: int = 4,
|
||||
latent_spatial_down_factor: int = 8,
|
||||
rope_ref_h: int = 1024, # NTK ref resolution in PIXEL units: 1024px / patch=16 -> grid_ref=64.
|
||||
rope_ref_w: int = 1024,
|
||||
image_model=None,
|
||||
dtype=None, device=None, operations=None,
|
||||
**pixdit_kwargs,
|
||||
):
|
||||
super().__init__(dtype=dtype, device=device, operations=operations, **pixdit_kwargs)
|
||||
|
||||
self.rope_ref_grid_h = rope_ref_h // self.patch_size
|
||||
self.rope_ref_grid_w = rope_ref_w // self.patch_size
|
||||
|
||||
# Parent's PiTBlocks were built with plain RoPE — swap in NTK-aware.
|
||||
def _pit_rope_fn(head_dim, h, w, device=None, dtype=torch.float32, **rope_opts):
|
||||
return precompute_freqs_cis_2d(head_dim, h, w, ref_grid_h=self.rope_ref_grid_h, ref_grid_w=self.rope_ref_grid_w, device=device, dtype=dtype, **rope_opts)
|
||||
for blk in self.pixel_blocks:
|
||||
blk._rope_fn = _pit_rope_fn
|
||||
|
||||
num_lq_outputs = (self.patch_depth + lq_interval - 1) // lq_interval
|
||||
self.lq_proj = LQProjection2D(
|
||||
latent_channels=lq_latent_channels,
|
||||
hidden_dim=lq_hidden_dim,
|
||||
out_dim=self.hidden_size,
|
||||
patch_size=self.patch_size,
|
||||
sr_scale=sr_scale,
|
||||
latent_spatial_down_factor=latent_spatial_down_factor,
|
||||
num_res_blocks=lq_num_res_blocks,
|
||||
num_outputs=num_lq_outputs,
|
||||
interval=lq_interval,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
operations=operations,
|
||||
)
|
||||
|
||||
def _fetch_patch_pos(self, height, width, device, dtype, **rope_opts):
|
||||
return precompute_freqs_cis_2d(
|
||||
self.hidden_size // self.num_groups,
|
||||
height, width,
|
||||
ref_grid_h=self.rope_ref_grid_h, ref_grid_w=self.rope_ref_grid_w,
|
||||
device=device, dtype=dtype, **rope_opts,
|
||||
)
|
||||
|
||||
def _pre_patch_block(self, s, i, pid_lq_features, pid_degrade_sigma, **kwargs):
|
||||
if not self.lq_proj.is_gate_active(i):
|
||||
return s
|
||||
out_idx = self.lq_proj.output_index(i)
|
||||
if out_idx >= len(pid_lq_features):
|
||||
return s
|
||||
return self.lq_proj.gate(s, pid_lq_features[out_idx], pid_degrade_sigma, out_idx)
|
||||
|
||||
def _forward(self, x, timesteps, context=None, attention_mask=None, transformer_options={}, lq_latent=None, degrade_sigma=None, **kwargs):
|
||||
if lq_latent is None:
|
||||
raise ValueError("PidNet requires lq_latent — attach via PiDConditioning")
|
||||
expected_c = self.lq_proj.latent_channels
|
||||
if lq_latent.shape[1] != expected_c:
|
||||
raise ValueError(
|
||||
f"Input latent has {lq_latent.shape[1]} channels, this model variant expects {expected_c}. "
|
||||
f"Flux1/SD3 = 16 channels, Flux2 = 128 channels."
|
||||
)
|
||||
B = x.shape[0]
|
||||
# Match the backbone's pad_to_patch_size (round up) so the LQ grid lines up with the patch stream.
|
||||
Hs = -(-x.shape[2] // self.patch_size)
|
||||
Ws = -(-x.shape[3] // self.patch_size)
|
||||
|
||||
degrade_sigma = degrade_sigma.to(device=x.device, dtype=torch.float32).reshape(-1)
|
||||
if degrade_sigma.numel() == 1 and B > 1:
|
||||
degrade_sigma = degrade_sigma.expand(B).contiguous()
|
||||
|
||||
lq_features = self.lq_proj(lq_latent=lq_latent.to(x), target_pH=Hs, target_pW=Ws)
|
||||
|
||||
return super()._forward(
|
||||
x, timesteps,
|
||||
context=context, attention_mask=attention_mask,
|
||||
transformer_options=transformer_options,
|
||||
pid_lq_features=lq_features,
|
||||
pid_degrade_sigma=degrade_sigma,
|
||||
**kwargs,
|
||||
)
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user