Merge remote-tracking branch 'origin/master' into pixal3d

# Conflicts: # comfy/clip_vision.py # comfy/image_encoders/dino3.py # comfy/supported_models.py # comfy_extras/nodes_save_3d.py
2026-07-03 21:20:49 +08:00 · 2026-06-10 10:37:19 +03:00 · 2026-06-10 10:37:19 +03:00 · 3af63b8961
commit 3af63b8961
parent ad94d3bc93 039ed38ed1
298 changed files with 49511 additions and 13331 deletions
--- a/.ci/windows_amd_base_files/README_VERY_IMPORTANT.txt
+++ b/.ci/windows_amd_base_files/README_VERY_IMPORTANT.txt
@ -1,5 +1,4 @@
-As of the time of writing this you need this driver for best results:
-https://www.amd.com/en/resources/support-articles/release-notes/RN-AMDGPU-WINDOWS-PYTORCH-7-1-1.html
+As of the time of writing this you need a recent driver. Updating to the latest driver is recommended.

 HOW TO RUN:

@ -7,9 +6,9 @@ If you have a AMD gpu:

 run_amd_gpu.bat

-If you have memory issues you can try disabling the smart memory management by running comfyui with:
+If you have memory issues you can try enabling the new dynamic memory management by running comfyui with:

-run_amd_gpu_disable_smart_memory.bat
+run_amd_gpu_enable_dynamic_vram.bat

 IF YOU GET A RED ERROR IN THE UI MAKE SURE YOU HAVE A MODEL/CHECKPOINT IN: ComfyUI\models\checkpoints

--- a/.github/workflows/backport_release.yaml
+++ b/.github/workflows/backport_release.yaml
@ -0,0 +1,519 @@
+name: Backport Release
+
+on:
+  workflow_dispatch:
+    inputs:
+      commit:
+        description: 'Full 40-char SHA of the tip commit of the backport source branch (the PR head commit that passed tests). The branch is resolved from this SHA and must be unique.'
+        required: true
+        type: string
+
+permissions:
+  contents: read
+  pull-requests: read
+  checks: read
+
+jobs:
+  backport-release:
+    name: Create backport release
+    runs-on: ubuntu-latest
+    environment: backport release
+
+    steps:
+      - name: Generate GitHub App token
+        id: app-token
+        uses: actions/create-github-app-token@bcd2ba49218906704ab6c1aa796996da409d3eb1
+        with:
+          app-id: ${{ secrets.FEN_RELEASE_APP_ID }}
+          private-key: ${{ secrets.FEN_RELEASE_PRIVATE_KEY }}
+
+      - name: Checkout repository
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        with:
+          token: ${{ steps.app-token.outputs.token }}
+          fetch-depth: 0
+          fetch-tags: true
+
+      - name: Configure git
+        run: |
+          git config user.name  "fen-release[bot]"
+          git config user.email "fen-release[bot]@users.noreply.github.com"
+
+      - name: Resolve source branch from commit SHA
+        id: resolve
+        env:
+          SOURCE_COMMIT:  ${{ inputs.commit }}
+          DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
+        run: |
+          set -euo pipefail
+
+          # Require a full 40-char lowercase-hex SHA. Short SHAs are ambiguous
+          # and we will be comparing this value against API responses (PR head
+          # SHA, ref tips) that always return the full form.
+          if [[ ! "${SOURCE_COMMIT}" =~ ^[0-9a-f]{40}$ ]]; then
+            echo "::error::Input commit '${SOURCE_COMMIT}' is not a full 40-char lowercase hex SHA."
+            exit 1
+          fi
+
+          # Fetch all remote branches so we can search for which one(s) point
+          # at this SHA. `actions/checkout` with fetch-depth: 0 fetches full
+          # history of the checked-out ref but does not necessarily populate
+          # every refs/remotes/origin/*, so do it explicitly.
+          git fetch --prune origin '+refs/heads/*:refs/remotes/origin/*'
+
+          # Verify the commit actually exists in this repo's object DB.
+          if ! git cat-file -e "${SOURCE_COMMIT}^{commit}" 2>/dev/null; then
+            echo "::error::Commit ${SOURCE_COMMIT} was not found in the repository."
+            exit 1
+          fi
+
+          # Find every remote branch whose tip == SOURCE_COMMIT. Exactly one
+          # branch must point at it. If zero, the commit isn't anyone's tip
+          # (likely stale, force-pushed past, or never the PR head). If more
+          # than one, the (branch -> SHA) mapping is ambiguous and we refuse
+          # to guess — the operator must give us a unique branch to release.
+          mapfile -t matching_branches < <(
+            git for-each-ref \
+              --format='%(refname:strip=3)' \
+              --points-at="${SOURCE_COMMIT}" \
+              refs/remotes/origin/ \
+              | grep -vx 'HEAD' || true
+          )
+
+          if [[ "${#matching_branches[@]}" -eq 0 ]]; then
+            echo "::error::No branch on origin has ${SOURCE_COMMIT} as its tip."
+            echo "::error::Either the branch was updated after you copied this SHA, or this commit was never the head of a branch."
+            exit 1
+          fi
+
+          if [[ "${#matching_branches[@]}" -gt 1 ]]; then
+            echo "::error::More than one branch on origin has ${SOURCE_COMMIT} as its tip; cannot pick one:"
+            for b in "${matching_branches[@]}"; do
+              echo "::error::  - ${b}"
+            done
+            echo "::error::Refusing to proceed with an ambiguous source branch."
+            exit 1
+          fi
+
+          source_branch="${matching_branches[0]}"
+
+          if [[ "${source_branch}" == "${DEFAULT_BRANCH}" ]]; then
+            echo "::error::Source branch must not be the default branch ('${DEFAULT_BRANCH}')."
+            exit 1
+          fi
+
+          echo "Resolved commit ${SOURCE_COMMIT} to branch '${source_branch}'."
+          echo "source_branch=${source_branch}" >> "$GITHUB_OUTPUT"
+
+      - name: Determine latest stable release
+        id: latest
+        env:
+          GH_TOKEN: ${{ steps.app-token.outputs.token }}
+        run: |
+          set -euo pipefail
+
+          # List all tags matching vMAJOR.MINOR.PATCH and pick the highest by numeric
+          # comparison of each component. We DO NOT use `sort -V` because it treats
+          # v0.19.99 as higher than v0.20.1.
+          latest_tag="$(
+            git tag --list 'v[0-9]*.[0-9]*.[0-9]*' \
+              | grep -E '^v[0-9]+\.[0-9]+\.[0-9]+$' \
+              | awk -F'[v.]' '{ printf "%010d %010d %010d %s\n", $2, $3, $4, $0 }' \
+              | sort -k1,1n -k2,2n -k3,3n \
+              | tail -n1 \
+              | awk '{print $4}'
+          )"
+
+          if [[ -z "${latest_tag}" ]]; then
+            echo "::error::No stable release tags (vMAJOR.MINOR.PATCH) were found."
+            exit 1
+          fi
+
+          # Parse components
+          ver="${latest_tag#v}"
+          major="${ver%%.*}"
+          rest="${ver#*.}"
+          minor="${rest%%.*}"
+          patch="${rest#*.}"
+          # 10# forces decimal: a zero-padded patch such as "08" would otherwise be parsed as octal.
+          new_patch=$((10#${patch} + 1))
+          new_version="v${major}.${minor}.${new_patch}"
+          release_branch="release/v${major}.${minor}"
+
+          latest_sha="$(git rev-list -n 1 "refs/tags/${latest_tag}")"
+
+          echo "latest_tag=${latest_tag}"             >> "$GITHUB_OUTPUT"
+          echo "latest_sha=${latest_sha}"             >> "$GITHUB_OUTPUT"
+          echo "major=${major}"                       >> "$GITHUB_OUTPUT"
+          echo "minor=${minor}"                       >> "$GITHUB_OUTPUT"
+          echo "patch=${patch}"                       >> "$GITHUB_OUTPUT"
+          echo "new_version=${new_version}"           >> "$GITHUB_OUTPUT"
+          echo "new_version_no_v=${major}.${minor}.${new_patch}" >> "$GITHUB_OUTPUT"
+          echo "release_branch=${release_branch}"     >> "$GITHUB_OUTPUT"
+
+          echo "Latest stable release: ${latest_tag} (${latest_sha})"
+          echo "New version will be:   ${new_version}"
+          echo "Release branch:        ${release_branch}"
+
+      - name: Validate source branch is cut directly from the latest stable release
+        env:
+          SOURCE_BRANCH:   ${{ steps.resolve.outputs.source_branch }}
+          SOURCE_COMMIT:   ${{ inputs.commit }}
+          LATEST_TAG_SHA:  ${{ steps.latest.outputs.latest_sha }}
+          LATEST_TAG:      ${{ steps.latest.outputs.latest_tag }}
+        run: |
+          set -euo pipefail
+
+          # Use the user-provided SHA directly rather than re-resolving the branch
+          # tip — the resolve step already proved the branch tip equals SOURCE_COMMIT,
+          # and pinning to the SHA here makes the rest of the job TOCTOU-safe against
+          # someone pushing to the branch mid-run.
+          source_sha="${SOURCE_COMMIT}"
+
+          # Walking first-parent from the source tip must reach LATEST_TAG_SHA.
+          # We capture rev-list into a variable and grep against a here-string
+          # rather than piping `rev-list | grep -q`: under `set -o pipefail`,
+          # `grep -q` would exit on first match and SIGPIPE the still-streaming
+          # `rev-list`, propagating exit 141 as a spurious "not found".
+          first_parent_chain="$(git rev-list --first-parent "${source_sha}")"
+          if ! grep -Fxq "${LATEST_TAG_SHA}" <<< "${first_parent_chain}"; then
+            echo "::error::Source branch '${SOURCE_BRANCH}' is not cut from '${LATEST_TAG}'."
+            echo "::error::Its first-parent history does not include ${LATEST_TAG_SHA}."
+            exit 1
+          fi
+
+          # Additionally, every commit added on top of the tag (the set we are
+          # about to publish) must itself lie on the first-parent chain from
+          # the source tip down to the tag — i.e. no sibling commits from
+          # master sneak in via a non-first-parent (merged-in) path. Enforce
+          # this by requiring that no commit is reachable from source without
+          # also being on that first-parent chain.
+          # We do this by computing two sorted commit lists:
+          #   A = commits reachable from source but not from tag (full DAG)
+          #   B = commits on the first-parent chain from source down to tag
+          # and requiring A == B.
+          all_added="$(git rev-list "${LATEST_TAG_SHA}..${source_sha}" | sort)"
+          first_parent_added="$(
+            git rev-list --first-parent "${LATEST_TAG_SHA}..${source_sha}" | sort
+          )"
+
+          if [[ "${all_added}" != "${first_parent_added}" ]]; then
+            echo "::error::Source branch '${SOURCE_BRANCH}' contains commits not on its first-parent chain from '${LATEST_TAG}'."
+            echo "::error::This usually means the branch was cut from master (not from the tag) or contains a merge from master."
+            echo "Commits reachable but not on first-parent chain:"
+            comm -23 <(printf '%s\n' "${all_added}") <(printf '%s\n' "${first_parent_added}") \
+              | while read -r sha; do
+                  echo "  $(git log -1 --format='%h %s' "${sha}")"
+                done
+            exit 1
+          fi
+
+          added_count="$(printf '%s\n' "${all_added}" | grep -c . || true)"
+          echo "Source branch is cut directly from ${LATEST_TAG} with ${added_count} commit(s) on top."
+
+      - name: Validate PR exists, is open, named correctly, has latest commit, and checks pass
+        env:
+          GH_TOKEN:      ${{ steps.app-token.outputs.token }}
+          SOURCE_BRANCH: ${{ steps.resolve.outputs.source_branch }}
+          SOURCE_COMMIT: ${{ inputs.commit }}
+          NEW_VERSION:   ${{ steps.latest.outputs.new_version }}
+          REPO:          ${{ github.repository }}
+        run: |
+          set -euo pipefail
+
+          expected_title="ComfyUI backport release ${NEW_VERSION}"
+
+          # Find open PRs from this branch into master. The --state open filter
+          # is load-bearing: a closed/merged PR with passing checks must not be
+          # accepted as authorization for a new release.
+          pr_json="$(
+            gh pr list \
+              --repo "${REPO}" \
+              --state open \
+              --head "${SOURCE_BRANCH}" \
+              --base master \
+              --json number,title,headRefOid,state \
+              --limit 10
+          )"
+
+          pr_count="$(echo "${pr_json}" | jq 'length')"
+          if [[ "${pr_count}" -eq 0 ]]; then
+            echo "::error::No open PR found from '${SOURCE_BRANCH}' into 'master'. The PR must exist and be open."
+            exit 1
+          fi
+
+          # Pick the PR matching the expected title
+          pr_number="$(echo "${pr_json}" | jq -r --arg t "${expected_title}" '
+            map(select(.title == $t)) | .[0].number // empty
+          ')"
+          pr_head_sha="$(echo "${pr_json}" | jq -r --arg t "${expected_title}" '
+            map(select(.title == $t)) | .[0].headRefOid // empty
+          ')"
+
+          if [[ -z "${pr_number}" ]]; then
+            echo "::error::No open PR from '${SOURCE_BRANCH}' into 'master' is titled '${expected_title}'."
+            echo "Found PRs:"
+            echo "${pr_json}" | jq -r '.[] | "  #\(.number): \(.title)"'
+            exit 1
+          fi
+
+          # The PR's current head commit must equal the SHA the operator gave us.
+          # This is what closes the door on releasing stale code: if anyone has
+          # pushed to the branch since the operator validated tests passed, the
+          # PR head will have advanced past SOURCE_COMMIT and we abort. (The
+          # resolve step already proved the branch tip == SOURCE_COMMIT; this
+          # ties that same SHA to the PR that authorizes the release.)
+          if [[ "${pr_head_sha}" != "${SOURCE_COMMIT}" ]]; then
+            echo "::error::PR #${pr_number} head commit is ${pr_head_sha}, but the operator-provided commit is ${SOURCE_COMMIT}."
+            echo "::error::The PR has new commits since this release was authorized. Re-run with the new head SHA after verifying its checks."
+            exit 1
+          fi
+
+          echo "Found open PR #${pr_number} titled '${expected_title}' at head ${pr_head_sha} (matches operator-provided commit)."
+
+          # Verify all check runs on the head commit have completed successfully.
+          # A check is considered passing if conclusion is success, neutral, or skipped.
+          checks_json="$(
+            gh api \
+              --paginate \
+              "repos/${REPO}/commits/${pr_head_sha}/check-runs" \
+              --jq '.check_runs[] | {name: .name, status: .status, conclusion: .conclusion}'
+          )"
+
+          if [[ -z "${checks_json}" ]]; then
+            echo "::error::No check runs found on PR head commit ${pr_head_sha}."
+            exit 1
+          fi
+
+          echo "Check runs on ${pr_head_sha}:"
+          echo "${checks_json}" | jq -s '.'
+
+          failing="$(echo "${checks_json}" | jq -s '
+            map(select(
+              .status != "completed"
+              or (.conclusion as $c
+                  | ["success","neutral","skipped"]
+                  | index($c) | not)
+            ))
+          ')"
+
+          failing_count="$(echo "${failing}" | jq 'length')"
+          if [[ "${failing_count}" -gt 0 ]]; then
+            echo "::error::One or more checks have not passed on PR head commit ${pr_head_sha}:"
+            echo "${failing}" | jq -r '.[] | "  - \(.name): status=\(.status) conclusion=\(.conclusion)"'
+            exit 1
+          fi
+
+          echo "All checks have passed on ${pr_head_sha}."
+
+      - name: Prepare release branch
+        id: prepare
+        env:
+          GH_TOKEN:        ${{ steps.app-token.outputs.token }}
+          REPO:            ${{ github.repository }}
+          RELEASE_BRANCH:  ${{ steps.latest.outputs.release_branch }}
+          LATEST_TAG:      ${{ steps.latest.outputs.latest_tag }}
+          LATEST_TAG_SHA:  ${{ steps.latest.outputs.latest_sha }}
+          PATCH:           ${{ steps.latest.outputs.patch }}
+        run: |
+          set -euo pipefail
+
+          # Check whether the release branch exists on origin. If it does, its tip
+          # must equal the latest stable tag commit (the release we are patching).
+          # If it does not, that is only consistent when patch == 0, in which case
+          # we create the branch locally from the latest stable tag.
+          if git ls-remote --exit-code --heads origin "${RELEASE_BRANCH}" >/dev/null 2>&1; then
+            echo "Release branch '${RELEASE_BRANCH}' already exists on origin."
+            git fetch origin "refs/heads/${RELEASE_BRANCH}:refs/remotes/origin/${RELEASE_BRANCH}"
+            git checkout -B "${RELEASE_BRANCH}" "refs/remotes/origin/${RELEASE_BRANCH}"
+
+            current_tip="$(git rev-parse HEAD)"
+            if [[ "${current_tip}" != "${LATEST_TAG_SHA}" ]]; then
+              echo "::error::Release branch '${RELEASE_BRANCH}' tip (${current_tip}) is not at the latest stable release '${LATEST_TAG}' (${LATEST_TAG_SHA})."
+              echo "::error::Refusing to release on top of a divergent branch."
+              exit 1
+            fi
+            echo "branch_existed=true" >> "$GITHUB_OUTPUT"
+          else
+            if [[ "${PATCH}" != "0" ]]; then
+              echo "::error::Release branch '${RELEASE_BRANCH}' does not exist on origin, but the latest stable release '${LATEST_TAG}' has patch=${PATCH} (>0). This is inconsistent."
+              exit 1
+            fi
+            echo "Release branch '${RELEASE_BRANCH}' does not exist. Creating from ${LATEST_TAG}."
+            git checkout -B "${RELEASE_BRANCH}" "refs/tags/${LATEST_TAG}"
+            echo "branch_existed=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Fast-forward merge source branch into release branch
+        env:
+          SOURCE_BRANCH:  ${{ steps.resolve.outputs.source_branch }}
+          SOURCE_COMMIT:  ${{ inputs.commit }}
+          RELEASE_BRANCH: ${{ steps.latest.outputs.release_branch }}
+        run: |
+          set -euo pipefail
+
+          # --ff-only guarantees no merge commit is created. If a fast-forward is
+          # not possible (i.e. the release branch has commits the source branch
+          # doesn't), the merge will fail and we abort. Because we already validated
+          # that the source branch is rooted on the latest stable tag, and the
+          # release branch tip equals that same tag, this fast-forward should
+          # always succeed for a well-formed backport branch.
+          #
+          # We merge the operator-provided SHA, not the branch ref, so a push to
+          # the branch in the window between resolve and now cannot smuggle new
+          # commits into the release.
+          if ! git merge --ff-only "${SOURCE_COMMIT}"; then
+            echo "::error::Cannot fast-forward '${RELEASE_BRANCH}' to ${SOURCE_COMMIT} (tip of '${SOURCE_BRANCH}'). A merge commit would be required. Aborting."
+            exit 1
+          fi
+
+          echo "Fast-forwarded '${RELEASE_BRANCH}' to ${SOURCE_COMMIT} (tip of '${SOURCE_BRANCH}')."
+
+      - name: Bump version files
+        env:
+          NEW_VERSION_NO_V: ${{ steps.latest.outputs.new_version_no_v }}
+        run: |
+          set -euo pipefail
+
+          if [[ ! -f comfyui_version.py ]]; then
+            echo "::error::comfyui_version.py not found in repo root."
+            exit 1
+          fi
+          if [[ ! -f pyproject.toml ]]; then
+            echo "::error::pyproject.toml not found in repo root."
+            exit 1
+          fi
+
+          # Replace the version string in comfyui_version.py.
+          # Expected format:  __version__ = "X.Y.Z"
+          python3 - "$NEW_VERSION_NO_V" <<'PY'
+          import re, sys, pathlib
+          new = sys.argv[1]
+
+          p = pathlib.Path("comfyui_version.py")
+          src = p.read_text()
+          new_src, n = re.subn(
+              r'(__version__\s*=\s*[\'"])[^\'"]+([\'"])',
+              lambda m: f'{m.group(1)}{new}{m.group(2)}',
+              src,
+              count=1,
+          )
+          if n != 1:
+              sys.exit("Could not find __version__ assignment in comfyui_version.py")
+          p.write_text(new_src)
+
+          p = pathlib.Path("pyproject.toml")
+          src = p.read_text()
+          # Replace the first `version = "..."` inside [project] or [tool.poetry].
+          new_src, n = re.subn(
+              r'(?m)^(version\s*=\s*")[^"]+(")',
+              lambda m: f'{m.group(1)}{new}{m.group(2)}',
+              src,
+              count=1,
+          )
+          if n != 1:
+              sys.exit("Could not find version assignment in pyproject.toml")
+          p.write_text(new_src)
+          PY
+
+          echo "Updated version to ${NEW_VERSION_NO_V} in comfyui_version.py and pyproject.toml."
+          git --no-pager diff -- comfyui_version.py pyproject.toml
+
+      - name: Commit version bump and tag release
+        env:
+          NEW_VERSION: ${{ steps.latest.outputs.new_version }}
+        run: |
+          set -euo pipefail
+
+          git add comfyui_version.py pyproject.toml
+          git commit -m "ComfyUI ${NEW_VERSION}"
+
+          if git rev-parse -q --verify "refs/tags/${NEW_VERSION}" >/dev/null; then
+            echo "::error::Tag ${NEW_VERSION} already exists locally."
+            exit 1
+          fi
+          git tag "${NEW_VERSION}"
+
+      - name: Verify tag does not already exist on origin
+        env:
+          NEW_VERSION: ${{ steps.latest.outputs.new_version }}
+        run: |
+          set -euo pipefail
+          if git ls-remote --exit-code --tags origin "refs/tags/${NEW_VERSION}" >/dev/null 2>&1; then
+            echo "::error::Tag ${NEW_VERSION} already exists on origin. Aborting."
+            exit 1
+          fi
+
+      - name: Push release branch and tag
+        env:
+          RELEASE_BRANCH: ${{ steps.latest.outputs.release_branch }}
+          NEW_VERSION:    ${{ steps.latest.outputs.new_version }}
+        run: |
+          set -euo pipefail
+
+          # Push the branch first, then the tag. Atomic-ish: if the branch push
+          # fails we never publish the tag.
+          git push origin "refs/heads/${RELEASE_BRANCH}:refs/heads/${RELEASE_BRANCH}"
+          git push origin "refs/tags/${NEW_VERSION}"
+
+          echo "Released ${NEW_VERSION} on ${RELEASE_BRANCH}."
+
+      - name: Delete remote source branch
+        env:
+          GH_TOKEN:        ${{ steps.app-token.outputs.token }}
+          REPO:            ${{ github.repository }}
+          SOURCE_BRANCH:   ${{ steps.resolve.outputs.source_branch }}
+          SOURCE_COMMIT:   ${{ inputs.commit }}
+          RELEASE_BRANCH:  ${{ steps.latest.outputs.release_branch }}
+          DEFAULT_BRANCH:  ${{ github.event.repository.default_branch }}
+        run: |
+          set -euo pipefail
+
+          # Belt-and-braces: the resolve step already refuses the default branch,
+          # but never delete the default or the release branch under any
+          # circumstances.
+          if [[ "${SOURCE_BRANCH}" == "${DEFAULT_BRANCH}" || "${SOURCE_BRANCH}" == "${RELEASE_BRANCH}" ]]; then
+            echo "::error::Refusing to delete '${SOURCE_BRANCH}' (matches default or release branch)."
+            exit 1
+          fi
+
+          # Delete the source branch on origin, but only if its tip is still the
+          # SHA we released from. If someone pushed new commits to it after we
+          # resolved it, leave it alone — those commits would be silently lost.
+          current_tip="$(git ls-remote origin "refs/heads/${SOURCE_BRANCH}" | awk '{print $1}')"
+          if [[ -z "${current_tip}" ]]; then
+            echo "Source branch '${SOURCE_BRANCH}' no longer exists on origin; nothing to delete."
+            exit 0
+          fi
+          if [[ "${current_tip}" != "${SOURCE_COMMIT}" ]]; then
+            echo "::warning::Source branch '${SOURCE_BRANCH}' tip (${current_tip}) no longer matches released commit (${SOURCE_COMMIT}). Leaving it in place."
+            exit 0
+          fi
+
+          git push origin --delete "refs/heads/${SOURCE_BRANCH}"
+          echo "Deleted remote branch '${SOURCE_BRANCH}'."
+
+      - name: Summary
+        if: always()
+        env:
+          NEW_VERSION:    ${{ steps.latest.outputs.new_version }}
+          RELEASE_BRANCH: ${{ steps.latest.outputs.release_branch }}
+          LATEST_TAG:     ${{ steps.latest.outputs.latest_tag }}
+          SOURCE_BRANCH:  ${{ steps.resolve.outputs.source_branch }}
+          SOURCE_COMMIT:  ${{ inputs.commit }}
+        run: |
+          # SOURCE_BRANCH is empty if the resolve step never produced an output
+          # (e.g. the workflow failed in or before that step). Show a placeholder
+          # in that case so the summary table still renders cleanly.
+          source_branch_display="${SOURCE_BRANCH:-(unresolved)}"
+          {
+            echo "## Backport release"
+            echo ""
+            echo "| Field | Value |"
+            echo "|---|---|"
+            echo "| Source commit | \`${SOURCE_COMMIT}\` |"
+            echo "| Source branch | \`${source_branch_display}\` |"
+            echo "| Previous stable | \`${LATEST_TAG}\` |"
+            echo "| New version | \`${NEW_VERSION}\` |"
+            echo "| Release branch | \`${RELEASE_BRANCH}\` |"
+          } >> "$GITHUB_STEP_SUMMARY"
--- a/.github/workflows/check-line-endings.yml
+++ b/.github/workflows/check-line-endings.yml
@ -17,7 +17,7 @@ jobs:
      - name: Check for Windows line endings (CRLF)
        run: |
          # Get the list of changed files in the PR
-          CHANGED_FILES=$(git diff --name-only ${{ github.event.pull_request.base.sha }}..${{ github.event.pull_request.head.sha }})
+          CHANGED_FILES=$(git diff --name-only ${{ github.event.pull_request.base.sha }}..${{ github.event.pull_request.head.sha }} -- ':!.ci')

          # Flag to track if CRLF is found
          CRLF_FOUND=false
--- a/.github/workflows/detect-unreviewed-merge.yml
+++ b/.github/workflows/detect-unreviewed-merge.yml
@ -0,0 +1,24 @@
+name: Detect Unreviewed Merge
+
+# SOC 2 compliance — reusable workflow lives in Comfy-Org/github-workflows,
+# tracking issues are filed in Comfy-Org/unreviewed-merges.
+
+on:
+  push:
+    branches: [master]
+
+concurrency:
+  group: detect-unreviewed-merge-${{ github.sha }}
+  cancel-in-progress: false
+
+permissions:
+  contents: read
+  pull-requests: read
+
+jobs:
+  detect:
+    uses: Comfy-Org/github-workflows/.github/workflows/detect-unreviewed-merge.yml@4d9cb6b87f953bb7cd69954280e1465fb9bd2040 # v1
+    with:
+      approval-mode: latest-per-reviewer
+    secrets:
+      UNREVIEWED_MERGES_TOKEN: ${{ secrets.UNREVIEWED_MERGES_TOKEN }}
--- a/README.md
+++ b/README.md
@ -20,7 +20,7 @@
 [website-url]: https://www.comfy.org/
 <!-- Workaround to display total user from https://github.com/badges/shields/issues/4500#issuecomment-2060079995 -->
 [discord-shield]: https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fdiscord.com%2Fapi%2Finvites%2Fcomfyorg%3Fwith_counts%3Dtrue&query=%24.approximate_member_count&logo=discord&logoColor=white&label=Discord&color=green&suffix=%20total
-[discord-url]: https://www.comfy.org/discord
+[discord-url]: https://discord.com/invite/comfyorg
 [twitter-shield]: https://img.shields.io/twitter/follow/ComfyUI
 [twitter-url]: https://x.com/ComfyUI

@ -433,7 +433,7 @@ See also: [https://www.comfy.org/](https://www.comfy.org/)

 ## Frontend Development

-As of August 15, 2024, we have transitioned to a new frontend, which is now hosted in a separate repository: [ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend). This repository now hosts the compiled JS (from TS/Vue) under the `web/` directory.
+As of August 15, 2024, we have transitioned to a new frontend, which is now hosted in a separate repository: [ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend). The compiled JS files (from TS/Vue) are published to [PyPI](https://pypi.org/project/comfyui-frontend-package) and installed as a dependency in ComfyUI.

 ### Reporting Issues and Requesting Features

--- a/alembic_db/versions/0004_drop_tag_type.py
+++ b/alembic_db/versions/0004_drop_tag_type.py
@ -0,0 +1,39 @@
+"""
+Drop the vestigial tags.tag_type column.
+
+tag_type was always "user" in practice — no code path ever set it to anything
+else (no system/seeded classification was ever wired up) and nothing queried it.
+The column, its index (ix_tags_tag_type), and the corresponding API field were
+dead weight, so they are removed.
+
+Revision ID: 0004_drop_tag_type
+Revises: 0003_add_metadata_job_id
+Create Date: 2026-06-03
+"""
+
+from alembic import op
+import sqlalchemy as sa
+
+revision = "0004_drop_tag_type"
+down_revision = "0003_add_metadata_job_id"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    with op.batch_alter_table("tags") as batch_op:
+        batch_op.drop_index("ix_tags_tag_type")
+        batch_op.drop_column("tag_type")
+
+
+def downgrade() -> None:
+    with op.batch_alter_table("tags") as batch_op:
+        batch_op.add_column(
+            sa.Column(
+                "tag_type",
+                sa.String(length=32),
+                nullable=False,
+                server_default="user",
+            )
+        )
+        batch_op.create_index("ix_tags_tag_type", ["tag_type"])
--- a/app/assets/api/routes.py
+++ b/app/assets/api/routes.py
@ -39,6 +39,7 @@ from app.assets.services import (
    update_asset_metadata,
    upload_from_temp_path,
 )
+from app.assets.services.cursor import InvalidCursorError
 from app.assets.services.tagging import list_tag_histogram

 ROUTES = web.RouteTableDef()
@ -160,10 +161,12 @@ def _build_asset_response(result: schemas.AssetDetailResult | schemas.UploadResu
            preview_url = None
    else:
        preview_url = _build_preview_url_from_view(result.tags, result.ref.user_metadata)
+    asset_content_hash = result.asset.hash if result.asset else None
    return schemas_out.Asset(
        id=result.ref.id,
        name=result.ref.name,
-        asset_hash=result.asset.hash if result.asset else None,
+        hash=asset_content_hash,
+        asset_hash=asset_content_hash,
        size=int(result.asset.size_bytes) if result.asset else None,
        mime_type=result.asset.mime_type if result.asset else None,
        tags=result.tags,
@ -172,7 +175,7 @@ def _build_asset_response(result: schemas.AssetDetailResult | schemas.UploadResu
        user_metadata=result.ref.user_metadata or {},
        metadata=result.ref.system_metadata,
        job_id=result.ref.job_id,
-        prompt_id=result.ref.job_id,  # deprecated: mirrors job_id for cloud compat
+        prompt_id=result.ref.job_id,  # deprecated alias of job_id, kept for compatibility
        created_at=result.ref.created_at,
        updated_at=result.ref.updated_at,
        last_access_time=result.ref.last_access_time,
@ -209,24 +212,37 @@ async def list_assets_route(request: web.Request) -> web.Response:
    order_candidate = (q.order or "desc").lower()
    order = order_candidate if order_candidate in {"asc", "desc"} else "desc"

-    result = list_assets_page(
-        owner_id=USER_MANAGER.get_request_user_id(request),
-        include_tags=q.include_tags,
-        exclude_tags=q.exclude_tags,
-        name_contains=q.name_contains,
-        metadata_filter=q.metadata_filter,
-        limit=q.limit,
-        offset=q.offset,
-        sort=sort,
-        order=order,
-    )
+    try:
+        result = list_assets_page(
+            owner_id=USER_MANAGER.get_request_user_id(request),
+            include_tags=q.include_tags,
+            exclude_tags=q.exclude_tags,
+            name_contains=q.name_contains,
+            metadata_filter=q.metadata_filter,
+            limit=q.limit,
+            offset=q.offset,
+            sort=sort,
+            order=order,
+            after=q.after,
+        )
+    except InvalidCursorError as e:
+        return _build_error_response(400, "INVALID_CURSOR", str(e))

    summaries = [_build_asset_response(item) for item in result.items]

+    # has_more semantics differ by mode:
+    #   - cursor mode (`after` supplied): more results iff next_cursor is not None.
+    #   - offset mode: more results iff offset + returned page size < total.
+    if q.after is not None:
+        has_more = result.next_cursor is not None
+    else:
+        has_more = (q.offset + len(summaries)) < result.total
+
    payload = schemas_out.AssetsList(
        assets=summaries,
        total=result.total,
-        has_more=(q.offset + len(summaries)) < result.total,
+        has_more=has_more,
+        next_cursor=result.next_cursor,
    )
    return web.json_response(payload.model_dump(mode="json", exclude_none=True))

@ -517,18 +533,14 @@ async def update_asset_route(request: web.Request) -> web.Response:
@_require_assets_feature_enabled
 async def delete_asset_route(request: web.Request) -> web.Response:
    reference_id = str(uuid.UUID(request.match_info["id"]))
-    delete_content_param = request.query.get("delete_content")
-    delete_content = (
-        False
-        if delete_content_param is None
-        else delete_content_param.lower() not in {"0", "false", "no"}
-    )

    try:
+        # Deleting an asset is a soft delete of the reference; the underlying
+        # content is preserved (it may be shared with other references).
        deleted = delete_asset_reference(
            reference_id=reference_id,
            owner_id=USER_MANAGER.get_request_user_id(request),
-            delete_content_if_orphan=delete_content,
+            delete_content_if_orphan=False,
        )
    except Exception:
        logging.exception(
@ -573,8 +585,8 @@ async def get_tags(request: web.Request) -> web.Response:
    )

    tags = [
-        schemas_out.TagUsage(name=name, count=count, type=tag_type)
-        for (name, tag_type, count) in rows
+        schemas_out.TagUsage(name=name, count=count)
+        for (name, count) in rows
    ]
    payload = schemas_out.TagsList(
        tags=tags, total=total, has_more=(query.offset + len(tags)) < total
--- a/app/assets/api/schemas_in.py
+++ b/app/assets/api/schemas_in.py
@ -59,6 +59,11 @@ class ListAssetsQuery(BaseModel):

    limit: conint(ge=1, le=500) = 20
    offset: conint(ge=0) = 0
+    # Opaque keyset cursor. When supplied, `offset` is ignored. Cursor pagination
+    # is supported for sort values `created_at`, `updated_at`, `name`, `size`.
+    # Supplying `after` together with `sort=last_access_time` returns
+    # 400 INVALID_CURSOR; that sort only supports offset/limit.
+    after: str | None = None

    sort: Literal["name", "created_at", "updated_at", "size", "last_access_time"] = (
        "created_at"
--- a/app/assets/api/schemas_out.py
+++ b/app/assets/api/schemas_out.py
@ -10,6 +10,7 @@ class Asset(BaseModel):

    id: str
    name: str
+    hash: str | None = None
    asset_hash: str | None = None
    size: int | None = None
    mime_type: str | None = None
@ -40,12 +41,13 @@ class AssetsList(BaseModel):
    assets: list[Asset]
    total: int
    has_more: bool
+    # Opaque cursor for the next page. Omitted when there are no more results.
+    next_cursor: str | None = None


 class TagUsage(BaseModel):
    name: str
    count: int
-    type: str


 class TagsList(BaseModel):
--- a/app/assets/database/models.py
+++ b/app/assets/database/models.py
@ -227,7 +227,6 @@ class Tag(Base):
    __tablename__ = "tags"

    name: Mapped[str] = mapped_column(String(512), primary_key=True)
-    tag_type: Mapped[str] = mapped_column(String(32), nullable=False, default="user")

    asset_reference_links: Mapped[list[AssetReferenceTag]] = relationship(
        back_populates="tag",
@ -240,7 +239,5 @@ class Tag(Base):
        overlaps="asset_reference_links,tag_links,tags,asset_reference",
    )

-    __table_args__ = (Index("ix_tags_tag_type", "tag_type"),)
-
    def __repr__(self) -> str:
        return f"<Tag {self.name}>"
--- a/app/assets/database/queries/asset_reference.py
+++ b/app/assets/database/queries/asset_reference.py
@ -266,9 +266,18 @@ def list_references_page(
    metadata_filter: dict | None = None,
    sort: str | None = None,
    order: str | None = None,
+    after_cursor_value: object | None = None,
+    after_cursor_id: str | None = None,
 ) -> tuple[list[AssetReference], dict[str, list[str]], int]:
    """List references with pagination, filtering, and sorting.

+    When ``after_cursor_value``/``after_cursor_id`` are supplied the query uses
+    keyset pagination — ``offset`` is ignored and a WHERE clause selects rows
+    strictly after the given ``(sort_col, id)`` position in the active sort
+    direction. The cursor value must already be typed for the column
+    (datetime for time sorts, int for size, str for name); the caller decodes
+    the opaque cursor string and resolves to the typed value.
+
    Returns (references, tag_map, total_count).
    """
    base = (
@ -297,9 +306,31 @@ def list_references_page(
        "size": Asset.size_bytes,
    }
    sort_col = sort_map.get(sort, AssetReference.created_at)
-    sort_exp = sort_col.desc() if order == "desc" else sort_col.asc()
+    descending = order == "desc"

-    base = base.order_by(sort_exp).limit(limit).offset(offset)
+    # Keyset WHERE: (sort_col, id) strictly less-than / greater-than the cursor.
+    # Equivalent to: sort_col <op> v  OR  (sort_col = v AND id <op> cursor_id).
+    if after_cursor_value is not None and after_cursor_id is not None:
+        if descending:
+            keyset = sa.or_(
+                sort_col < after_cursor_value,
+                sa.and_(sort_col == after_cursor_value, AssetReference.id < after_cursor_id),
+            )
+        else:
+            keyset = sa.or_(
+                sort_col > after_cursor_value,
+                sa.and_(sort_col == after_cursor_value, AssetReference.id > after_cursor_id),
+            )
+        base = base.where(keyset)
+
+    # Secondary ORDER BY id (matching the primary direction) gives the keyset
+    # comparison a deterministic tiebreaker on duplicate sort_col values.
+    id_exp = AssetReference.id.desc() if descending else AssetReference.id.asc()
+    sort_exp = sort_col.desc() if descending else sort_col.asc()
+
+    base = base.order_by(sort_exp, id_exp).limit(limit)
+    if after_cursor_id is None:
+        base = base.offset(offset)

    count_stmt = (
        select(sa.func.count())
--- a/app/assets/database/queries/tags.py
+++ b/app/assets/database/queries/tags.py
@ -55,13 +55,11 @@ def validate_tags_exist(session: Session, tags: list[str]) -> None:
        raise ValueError(f"Unknown tags: {missing}")


-def ensure_tags_exist(
-    session: Session, names: Iterable[str], tag_type: str = "user"
-) -> None:
+def ensure_tags_exist(session: Session, names: Iterable[str]) -> None:
    wanted = normalize_tags(list(names))
    if not wanted:
        return
-    rows = [{"name": n, "tag_type": tag_type} for n in list(dict.fromkeys(wanted))]
+    rows = [{"name": n} for n in list(dict.fromkeys(wanted))]
    ins = (
        sqlite.insert(Tag)
        .values(rows)
@ -97,7 +95,7 @@ def set_reference_tags(
    to_remove = [t for t in current if t not in desired]

    if to_add:
-        ensure_tags_exist(session, to_add, tag_type="user")
+        ensure_tags_exist(session, to_add)
        session.add_all(
            [
                AssetReferenceTag(
@ -142,7 +140,7 @@ def add_tags_to_reference(
        return AddTagsResult(added=[], already_present=[], total_tags=total)

    if create_if_missing:
-        ensure_tags_exist(session, norm, tag_type="user")
+        ensure_tags_exist(session, norm)

    current = set(get_reference_tags(session, reference_id))

@ -289,7 +287,6 @@ def list_tags_with_usage(
    q = (
        select(
            Tag.name,
-            Tag.tag_type,
            func.coalesce(counts_sq.c.cnt, 0).label("count"),
        )
        .select_from(Tag)
@ -331,7 +328,7 @@ def list_tags_with_usage(
    rows = (session.execute(q.limit(limit).offset(offset))).all()
    total = (session.execute(total_q)).scalar_one()

-    rows_norm = [(name, ttype, int(count or 0)) for (name, ttype, count) in rows]
+    rows_norm = [(name, int(count or 0)) for (name, count) in rows]
    return rows_norm, int(total or 0)


--- a/app/assets/scanner.py
+++ b/app/assets/scanner.py
@ -33,6 +33,7 @@ from app.assets.services.file_utils import (
    verify_file_unchanged,
 )
 from app.assets.services.hashing import HashCheckpoint, compute_blake3_hash
+from app.assets.services.image_dimensions import extract_image_dimensions
 from app.assets.services.metadata_extract import extract_file_metadata
 from app.assets.services.path_utils import (
    compute_relative_filename,
@ -354,7 +355,7 @@ def insert_asset_specs(specs: list[SeedAssetSpec], tag_pool: set[str]) -> int:
        return 0
    with create_session() as sess:
        if tag_pool:
-            ensure_tags_exist(sess, tag_pool, tag_type="user")
+            ensure_tags_exist(sess, tag_pool)
        result = batch_insert_seed_assets(sess, specs=specs, owner_id="")
        sess.commit()
        return result.inserted_refs
@ -506,6 +507,10 @@ def enrich_asset(

    if extract_metadata and metadata:
        system_metadata = metadata.to_user_metadata()
+        if mime_type and mime_type.startswith("image/"):
+            dims = extract_image_dimensions(file_path, mime_type=mime_type)
+            if dims:
+                system_metadata.update(dims)
        set_reference_system_metadata(session, reference_id, system_metadata)

    if full_hash:
--- a/app/assets/services/asset_management.py
+++ b/app/assets/services/asset_management.py
@ -1,8 +1,19 @@
 import contextlib
 import mimetypes
 import os
+from datetime import timezone
 from typing import Sequence

+from app.assets.services.cursor import (
+    CursorPayload,
+    InvalidCursorError,
+    decode_cursor,
+    decode_cursor_int,
+    decode_cursor_time,
+    encode_cursor,
+    encode_cursor_from_time,
+)
+

 from app.assets.database.models import Asset
 from app.assets.database.queries import (
@ -149,6 +160,16 @@ def delete_asset_reference(
    owner_id: str,
    delete_content_if_orphan: bool = True,
 ) -> bool:
+    """Delete an asset reference.
+
+    With ``delete_content_if_orphan=False`` (a soft delete), the reference is
+    hidden and the underlying content is preserved. With ``True``, the content
+    is also removed once it becomes orphaned.
+
+    Note: the public DELETE /api/assets/{id} endpoint always soft-deletes
+    (passes ``False``); the orphan-reclamation path is intentionally
+    internal-only, retained for a future GC/admin caller.
+    """
    with create_session() as session:
        if not delete_content_if_orphan:
            # Soft delete: mark the reference as deleted but keep everything
@ -242,6 +263,11 @@ def get_asset_by_hash(asset_hash: str) -> AssetData | None:
        return extract_asset_data(asset)


+# Sort fields that support cursor pagination. `last_access_time` is not
+# in this list — it falls back to offset/limit.
+_CURSOR_SORT_FIELDS = ("created_at", "updated_at", "name", "size")
+
+
 def list_assets_page(
    owner_id: str = "",
    include_tags: Sequence[str] | None = None,
@ -252,7 +278,39 @@ def list_assets_page(
    offset: int = 0,
    sort: str = "created_at",
    order: str = "desc",
+    after: str | None = None,
 ) -> ListAssetsResult:
+    """List assets with optional cursor pagination.
+
+    When ``after`` is supplied it overrides ``offset``. The cursor's sort field
+    must match ``sort`` and be in the cursor-supported allowlist; mismatches
+    raise InvalidCursorError so the handler can map to 400 INVALID_CURSOR.
+    """
+    cursor_value: object | None = None
+    cursor_id: str | None = None
+    # Mint next_cursor on every page where the sort is cursor-supported, not
+    # only when the request itself arrived with a cursor. Otherwise a first
+    # request (no `after`) returns next_cursor=None and the client can never
+    # enter cursor mode.
+    mint_cursor = sort in _CURSOR_SORT_FIELDS
+
+    if after is not None:
+        if sort not in _CURSOR_SORT_FIELDS:
+            raise InvalidCursorError(
+                f"cursor pagination is not supported for sort={sort!r}"
+            )
+        payload = decode_cursor(after, _CURSOR_SORT_FIELDS, expected_order=order)
+        if payload.sort_field != sort:
+            raise InvalidCursorError(
+                f"cursor sort field {payload.sort_field!r} does not match request sort {sort!r}"
+            )
+        cursor_value, cursor_id = _resolve_cursor_value(payload), payload.id
+
+    # Over-fetch by one row so we can distinguish "exactly `limit` rows total
+    # remaining" from "more rows past this page" without a second query. Drop
+    # the sentinel before returning.
+    fetch_limit = limit + 1 if mint_cursor else limit
+
    with create_session() as session:
        refs, tag_map, total = list_references_page(
            session,
@ -261,12 +319,22 @@ def list_assets_page(
            exclude_tags=exclude_tags,
            name_contains=name_contains,
            metadata_filter=metadata_filter,
-            limit=limit,
+            limit=fetch_limit,
            offset=offset,
            sort=sort,
            order=order,
+            after_cursor_value=cursor_value,
+            after_cursor_id=cursor_id,
        )

+        next_cursor: str | None = None
+        if mint_cursor and len(refs) > limit:
+            # There's at least one more row past this page — mint a cursor from
+            # the last row of the page (i.e. index `limit - 1`, since we
+            # over-fetched), and drop the sentinel.
+            next_cursor = _encode_next_cursor(refs[limit - 1], sort, order)
+            refs = refs[:limit]
+
        items: list[AssetSummaryData] = []
        for ref in refs:
            items.append(
@ -277,7 +345,39 @@ def list_assets_page(
                )
            )

-        return ListAssetsResult(items=items, total=total)
+        return ListAssetsResult(items=items, total=total, next_cursor=next_cursor)
+
+
+def _resolve_cursor_value(payload: CursorPayload) -> object:
+    """Convert a cursor's string value into the type its sort column compares against.
+
+    ``created_at``/``updated_at`` cursors become naive datetimes, ``size``
+    cursors become ints, and every other field (``name``) is passed through
+    as the raw string.
+    """
+    field = payload.sort_field
+    if field == "size":
+        return decode_cursor_int(payload)
+    if field == "created_at" or field == "updated_at":
+        # The decoded timestamp is UTC-aware; the DB stores naive UTC, so the
+        # tzinfo is dropped to bind against a `TIMESTAMP WITHOUT TIME ZONE`
+        # column without an offset shift.
+        aware = decode_cursor_time(payload)
+        return aware.replace(tzinfo=None)
+    # Remaining case is the name sort, which is already str-typed.
+    return payload.value
+
+
+def _encode_next_cursor(ref, sort: str, order: str) -> str | None:
+    """Build the ``next_cursor`` token positioned at *ref* for the active sort.
+
+    The result is None when the boundary row has a NULL sort value (for
+    example an asset whose size_bytes hasn't been backfilled). Keyset ordering
+    across a NULL boundary is undefined, so pagination is truncated cleanly
+    instead of minting a cursor that would mis-position the next page.
+    """
+    if sort == "size":
+        asset = ref.asset
+        size_bytes = None if asset is None else asset.size_bytes
+        if size_bytes is None:
+            return None
+        return encode_cursor("size", str(size_bytes), ref.id, order=order)
+
+    if sort == "name":
+        return encode_cursor("name", ref.name, ref.id, order=order)
+
+    # Time sorts: anything that is not created_at reads updated_at. DB
+    # datetimes are naive UTC, so the UTC tzinfo is attached before encoding.
+    stamp = ref.updated_at if sort != "created_at" else ref.created_at
+    if stamp is None:
+        return None
+    aware_stamp = stamp.replace(tzinfo=timezone.utc)
+    return encode_cursor_from_time(sort, aware_stamp, ref.id, order=order)


 def resolve_hash_to_path(
--- a/app/assets/services/cursor.py
+++ b/app/assets/services/cursor.py
@ -0,0 +1,213 @@
+"""Opaque keyset-pagination cursor for /api/assets.
+
+Payload JSON uses short keys to keep the encoded length small:
+
+    {"s": <sort_field>, "v": <value>, "id": <id>, "o": <order>}
+
+The `o` key binds the cursor to the sort direction it was minted under,
+so replaying a `desc` cursor against an `asc` request fails with
+``INVALID_CURSOR`` rather than silently walking the wrong direction.
+`o` is mandatory on every payload — a cursor without it is rejected as
+malformed.
+
+Encoding is base64url with no padding. Cursors are opaque tokens: the
+payload format is internal to this server, and clients must treat a
+cursor as a black box handed back via `next_cursor`. No byte-level
+compatibility with any other implementation is required.
+
+Time values are serialized as Unix microseconds (UTC) — microsecond
+precision is sufficient to round-trip the timestamps stored by the
+database without rounding rows in the same millisecond bucket.
+"""
+from __future__ import annotations
+
+import base64
+import json
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from typing import Iterable, Optional
+
+
+class InvalidCursorError(ValueError):
+    """Signal that a pagination cursor cannot be used.
+
+    Raised when a cursor is malformed, too long, or names a sort field or
+    order the request does not accept. Handlers should translate it into a
+    400 response carrying the code ``INVALID_CURSOR``.
+    """
+
+
+# Wire-format length caps. Cursors are user-controlled, so caps protect the
+# decode path from oversized allocations and downstream SQL predicates from
+# unbounded strings.
+#
+# MAX_CURSOR_VALUE_LENGTH is 512 to fit the `AssetReference.name` column max
+# (`String(512)`) — otherwise a long-named asset would mint a cursor the same
+# server then refuses on the next request.
+#
+# MAX_ENCODED_CURSOR_LENGTH is the decode-path guard, sized comfortably above
+# the largest cursor the per-field caps can produce. Worst case is value + id
+# at their caps with every character JSON-escaping to the six-byte `\uXXXX`
+# form (control characters), which is ~5.2 KB once base64url-encoded. At 8192
+# the encoder can never mint a cursor that exceeds it, so a freshly minted
+# cursor always decodes on the next request and there is no user-visible
+# "cursor too long" failure.
+MAX_ENCODED_CURSOR_LENGTH = 8192
+MAX_CURSOR_VALUE_LENGTH = 512
+MAX_CURSOR_ID_LENGTH = 128
+
+
+@dataclass(frozen=True)
+class CursorPayload:
+    """Immutable, validated contents of a decoded cursor.
+
+    Every field is kept as the string carried in the cursor JSON; typed
+    interpretation of ``value`` is left to ``decode_cursor_time`` and
+    ``decode_cursor_int``.
+    """
+
+    # Sort column the cursor was minted for (JSON key "s").
+    sort_field: str
+    # Sort-column value of the boundary row, serialized as text (JSON key "v").
+    value: str
+    # Identifier of the boundary row, used as the tiebreaker (JSON key "id").
+    id: str
+    # Sort direction the cursor was minted under: "asc" or "desc" (JSON key "o").
+    order: str
+
+
+_VALID_ORDERS = ("asc", "desc")
+
+
+def encode_cursor(sort_field: str, value: str, id: str, order: str = "desc") -> str:
+    """Serialize a cursor payload into an unpadded base64url token.
+
+    The sort direction is embedded in the payload (``order``) so that a later
+    request with a flipped ``order`` query parameter is rejected with
+    ``INVALID_CURSOR`` instead of silently walking the wrong direction.
+
+    Raises:
+        InvalidCursorError: if ``order`` is not "asc"/"desc", ``id`` is empty
+            or too long, or ``value`` is too long.
+    """
+    if order not in _VALID_ORDERS:
+        raise InvalidCursorError(f"order must be one of {_VALID_ORDERS}, got {order!r}")
+    # The same limits are enforced when decoding; checking them here keeps the
+    # server from minting a cursor it would answer with a 400 on the next
+    # request.
+    if not id:
+        raise InvalidCursorError("id must be non-empty")
+    if MAX_CURSOR_ID_LENGTH < len(id):
+        raise InvalidCursorError("id exceeds maximum length")
+    if MAX_CURSOR_VALUE_LENGTH < len(value):
+        raise InvalidCursorError("value exceeds maximum length")
+
+    # Key order and compact separators are part of the wire format.
+    body = json.dumps(
+        {"s": sort_field, "v": value, "id": id, "o": order},
+        separators=(",", ":"),
+        ensure_ascii=False,
+    ).encode("utf-8")
+    # No overall length check at mint time: the per-field caps already keep
+    # the token well under MAX_ENCODED_CURSOR_LENGTH.
+    token = base64.urlsafe_b64encode(body)
+    return token.rstrip(b"=").decode("ascii")
+
+
+def encode_cursor_from_time(sort_field: str, t: datetime, id: str, order: str = "desc") -> str:
+    """Mint a cursor whose value is *t* expressed as Unix microseconds.
+
+    *t* must be timezone-aware; it is converted to UTC before serialization.
+    A naive datetime raises ``ValueError`` so that a caller cannot
+    accidentally encode the local wall-clock reading of a UTC-stored
+    timestamp.
+    """
+    if t.tzinfo is None:
+        raise ValueError("encode_cursor_from_time requires an aware datetime")
+    utc_moment = t.astimezone(timezone.utc)
+    value = str(_datetime_to_unix_micros(utc_moment))
+    return encode_cursor(sort_field, value, id, order=order)
+
+
+def decode_cursor(
+    cursor: str,
+    allowed_sort_fields: Iterable[str],
+    expected_order: str | None = None,
+) -> CursorPayload:
+    """Parse and validate an opaque cursor.
+
+    Args:
+        cursor: The base64url (unpadded) token supplied by the client.
+        allowed_sort_fields: The endpoint's accepted sort-field list. A cursor
+            carrying a field outside this set is rejected so a cursor minted
+            for one column can't be replayed against another (e.g. a
+            ``created_at`` timestamp string compared against a ``name``
+            column). Passing no allowed fields rejects every cursor.
+        expected_order: ``"asc"``/``"desc"``; when supplied it must match the
+            payload's ``o`` field. ``o`` is required on every payload, and a
+            cursor missing it is rejected as malformed.
+
+    Returns:
+        The validated payload, with every field still string-typed.
+
+    Raises:
+        InvalidCursorError: for any oversized, undecodable, malformed, or
+            mismatching cursor. No other exception type is allowed to escape
+            for bad input, because the cursor is user-controlled and the
+            handler maps only this error to a 400 response.
+    """
+    if len(cursor) > MAX_ENCODED_CURSOR_LENGTH:
+        raise InvalidCursorError("cursor exceeds maximum length")
+
+    try:
+        # urlsafe_b64decode requires correct padding; we strip on encode, so
+        # restore the trailing '=' pad here.
+        padding = "=" * (-len(cursor) % 4)
+        raw = base64.urlsafe_b64decode(cursor + padding)
+    except ValueError as e:
+        # binascii.Error (bad padding / alphabet) is a ValueError subclass,
+        # and non-ASCII input raises a plain ValueError.
+        raise InvalidCursorError(f"encoding: {e}") from e
+
+    try:
+        decoded = json.loads(raw)
+    except (ValueError, RecursionError) as e:
+        # ValueError covers JSONDecodeError and UnicodeDecodeError, plus the
+        # plain ValueError raised for an integer literal longer than the
+        # interpreter's int-string conversion limit. RecursionError is raised
+        # for deeply nested arrays/objects. All are reachable from a crafted
+        # cursor and must not surface as a server error.
+        raise InvalidCursorError(f"payload: {e}") from e
+
+    if not isinstance(decoded, dict):
+        raise InvalidCursorError("payload: expected object")
+
+    sort_field = decoded.get("s")
+    value = decoded.get("v")
+    ref_id = decoded.get("id")
+    order = decoded.get("o")
+
+    if (
+        not isinstance(sort_field, str)
+        or not isinstance(value, str)
+        or not isinstance(ref_id, str)
+    ):
+        raise InvalidCursorError("payload: missing or non-string s/v/id")
+
+    if ref_id == "":
+        raise InvalidCursorError("missing id")
+    if len(ref_id) > MAX_CURSOR_ID_LENGTH:
+        raise InvalidCursorError("id exceeds maximum length")
+    if len(value) > MAX_CURSOR_VALUE_LENGTH:
+        raise InvalidCursorError("value exceeds maximum length")
+
+    if sort_field not in allowed_sort_fields:
+        raise InvalidCursorError(f"unsupported sort field {sort_field!r}")
+
+    if not isinstance(order, str):
+        raise InvalidCursorError("missing or non-string o")
+    if order not in _VALID_ORDERS:
+        raise InvalidCursorError(f"unsupported order {order!r}")
+    if expected_order is not None and order != expected_order:
+        raise InvalidCursorError(
+            f"cursor order {order!r} does not match request order {expected_order!r}"
+        )
+
+    return CursorPayload(sort_field=sort_field, value=value, id=ref_id, order=order)
+
+
+def decode_cursor_time(payload: Optional[CursorPayload]) -> datetime:
+    """Interpret the cursor value as Unix microseconds and return a UTC datetime.
+
+    Raises:
+        InvalidCursorError: if *payload* is None, the value is not an integer,
+            or the instant cannot be represented as a datetime.
+    """
+    if payload is None:
+        raise InvalidCursorError("nil cursor payload")
+
+    try:
+        micros = int(payload.value)
+    except ValueError as exc:
+        raise InvalidCursorError(f"value is not a valid timestamp: {exc}") from exc
+
+    try:
+        moment = _unix_micros_to_datetime(micros)
+    except (OSError, OverflowError, ValueError) as exc:
+        # A crafted, out-of-range microsecond count (e.g. beyond
+        # datetime.MAX_YEAR) fails during datetime construction; report it as
+        # a bad cursor (400) rather than letting it become a 500.
+        raise InvalidCursorError(f"value is out of representable range: {exc}") from exc
+    return moment
+
+
+def decode_cursor_int(payload: Optional[CursorPayload]) -> int:
+    """Parse a cursor value as a base-10 integer within the signed 64-bit range.
+
+    The value is user-controlled and ends up bound as an SQL integer
+    parameter. Integers outside the signed 64-bit range cannot be stored in a
+    SQLite INTEGER and make the driver raise ``OverflowError`` at execution
+    time, so they are rejected here as an invalid cursor instead.
+
+    Raises:
+        InvalidCursorError: if *payload* is None, the value is not an integer,
+            or the integer does not fit in a signed 64-bit value.
+    """
+    if payload is None:
+        raise InvalidCursorError("nil cursor payload")
+    try:
+        number = int(payload.value)
+    except ValueError as e:
+        raise InvalidCursorError(f"value is not a valid integer: {e}") from e
+    if not -(2**63) <= number <= 2**63 - 1:
+        raise InvalidCursorError("value is out of representable range")
+    return number
+
+
+_EPOCH = datetime(1970, 1, 1, tzinfo=timezone.utc)
+
+
+def _datetime_to_unix_micros(t: datetime) -> int:
+    """Return the whole microseconds elapsed between the Unix epoch and *t*.
+
+    *t* must be timezone-aware. The result is assembled from the timedelta's
+    integer components, so no floating-point rounding is involved.
+    """
+    elapsed = t - _EPOCH
+    whole_seconds = elapsed.days * 86_400 + elapsed.seconds
+    return whole_seconds * 1_000_000 + elapsed.microseconds
+
+
+def _unix_micros_to_datetime(micros: int) -> datetime:
+    """Build a UTC-aware datetime from Unix microseconds without losing precision.
+
+    The whole-second part goes through ``datetime.fromtimestamp`` and the
+    sub-second remainder is applied afterwards as the microsecond field.
+    """
+    whole_seconds = micros // 1_000_000
+    leftover_micros = micros % 1_000_000
+    at_second = datetime.fromtimestamp(whole_seconds, tz=timezone.utc)
+    return at_second.replace(microsecond=leftover_micros)
--- a/app/assets/services/image_dimensions.py
+++ b/app/assets/services/image_dimensions.py
@ -0,0 +1,63 @@
+"""Image dimension extraction for asset ingest.
+
+Reads only the image header via Pillow to capture width/height cheaply,
+without a full pixel decode. Returns a metadata dict suitable for merging
+into ``AssetReference.system_metadata``.
+"""
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+def extract_image_dimensions(
+    file_path: str, mime_type: str | None = None
+) -> dict[str, Any] | None:
+    """Extract image dimensions for the file at ``file_path``.
+
+    Extraction is best-effort: every failure mode returns ``None`` instead of
+    raising, so a problematic file never aborts the surrounding ingest.
+
+    Args:
+        file_path: Absolute path to a file on disk.
+        mime_type: Optional MIME type hint. When provided and not prefixed
+            with ``image/``, extraction is skipped without touching the file.
+
+    Returns:
+        ``{"kind": "image", "width": W, "height": H}`` when the file is a
+        recognizable image with positive dimensions, otherwise ``None``
+        (Pillow missing, unreadable/unidentified file, image rejected by
+        Pillow's decompression-bomb guard, or non-positive dimensions).
+
+    The dict shape is intended to be merged into ``system_metadata`` so the
+    asset response surfaces ``metadata.kind`` plus dimension fields for image
+    assets. Forward-compatible: future media kinds (e.g. ``"video"`` with
+    duration/fps) can extend this shape without schema changes.
+    """
+    if mime_type is not None and not mime_type.startswith("image/"):
+        return None
+
+    try:
+        from PIL import Image, UnidentifiedImageError
+    except ImportError:
+        logger.debug(
+            "Pillow not available; skipping image dimension extraction for %s",
+            file_path,
+        )
+        return None
+
+    try:
+        with Image.open(file_path) as img:
+            width, height = img.size
+    except (
+        OSError,
+        UnidentifiedImageError,
+        ValueError,
+        # Raised by Image.open for images whose pixel count exceeds Pillow's
+        # safety limit. It derives directly from Exception, so it has to be
+        # listed explicitly to keep this function best-effort.
+        Image.DecompressionBombError,
+    ) as exc:
+        logger.debug(
+            "Failed to read image dimensions from %s: %s", file_path, exc
+        )
+        return None
+
+    if (
+        not isinstance(width, int)
+        or not isinstance(height, int)
+        or width <= 0
+        or height <= 0
+    ):
+        return None
+
+    return {"kind": "image", "width": width, "height": height}
--- a/app/assets/services/ingest.py
+++ b/app/assets/services/ingest.py
@ -17,9 +17,11 @@ from app.assets.database.queries import (
    get_reference_by_file_path,
    get_reference_tags,
    get_or_create_reference,
+    list_references_by_asset_id,
    reference_exists,
    remove_missing_tag_for_asset_id,
    set_reference_metadata,
+    set_reference_system_metadata,
    set_reference_tags,
    update_asset_hash_and_mime,
    upsert_asset,
@ -29,6 +31,7 @@ from app.assets.database.queries import (
 from app.assets.helpers import get_utc_now, normalize_tags
 from app.assets.services.bulk_ingest import batch_insert_seed_assets
 from app.assets.services.file_utils import get_size_and_mtime_ns
+from app.assets.services.image_dimensions import extract_image_dimensions
 from app.assets.services.path_utils import (
    compute_relative_filename,
    get_name_and_tags_from_asset_path,
@ -118,6 +121,14 @@ def _ingest_file_from_path(
                user_metadata=user_metadata,
            )

+            _maybe_store_image_dimensions(
+                session,
+                reference_id=reference_id,
+                file_path=locator,
+                mime_type=mime_type,
+                current_system_metadata=ref.system_metadata,
+            )
+
        try:
            remove_missing_tag_for_asset_id(session, asset_id=asset.id)
        except Exception:
@ -288,6 +299,13 @@ def _register_existing_asset(
                user_metadata=new_meta,
            )

+        _backfill_image_dimensions_from_siblings(
+            session,
+            asset_id=asset.id,
+            new_reference_id=ref.id,
+            current_system_metadata=ref.system_metadata,
+        )
+
        if tags is not None:
            set_reference_tags(
                session,
@ -334,6 +352,87 @@ def _update_metadata_with_filename(
        )


+_IMAGE_DIMENSION_KEYS = ("kind", "width", "height")
+
+
+def _maybe_store_image_dimensions(
+    session: Session,
+    reference_id: str,
+    file_path: str,
+    mime_type: str | None,
+    current_system_metadata: dict | None,
+) -> None:
+    """Write ``kind``/``width``/``height`` into an image reference's system_metadata.
+
+    Does nothing for a missing or non-image MIME type, or when no dimensions
+    could be extracted. Existing keys (e.g. enricher-written safetensors
+    metadata, download provenance) survive because the dimensions are merged
+    on top of the current metadata; the write is skipped when the merge
+    changes nothing.
+    """
+    if not (mime_type and mime_type.startswith("image/")):
+        return
+
+    dims = extract_image_dimensions(file_path, mime_type=mime_type)
+    if not dims:
+        return
+
+    existing = current_system_metadata or {}
+    combined = {**existing, **dims}
+    if combined == existing:
+        return
+
+    set_reference_system_metadata(
+        session,
+        reference_id=reference_id,
+        system_metadata=combined,
+    )
+
+
+def _backfill_image_dimensions_from_siblings(
+    session: Session,
+    asset_id: str,
+    new_reference_id: str,
+    current_system_metadata: dict | None,
+) -> None:
+    """Copy image dimension keys onto a new reference from a sibling of the same asset.
+
+    The from-hash registration path never reads the file bytes, so it cannot
+    extract dimensions itself. If another reference of the same asset already
+    carries valid image dimensions, they are copied so consumers see
+    consistent metadata regardless of how the asset was registered.
+
+    Best-effort: with no siblings, only non-image siblings, or no usable
+    dimension values, the target reference is left unchanged. Only the first
+    sibling with valid dimensions is used.
+    """
+    existing = current_system_metadata or {}
+    already_has_dims = (
+        existing.get("kind") == "image"
+        and "width" in existing
+        and "height" in existing
+    )
+    if already_has_dims:
+        return
+
+    for sibling in list_references_by_asset_id(session, asset_id):
+        if sibling.id == new_reference_id:
+            continue
+        sibling_meta = sibling.system_metadata or {}
+        if sibling_meta.get("kind") != "image":
+            continue
+        width, height = sibling_meta.get("width"), sibling_meta.get("height")
+        # Exact-type comparison: bool (an int subclass), floats and None are
+        # all skipped.
+        if type(width) is not int or type(height) is not int:
+            continue
+        if width <= 0 or height <= 0:
+            continue
+
+        updated = {**existing, "kind": "image", "width": width, "height": height}
+        if updated != existing:
+            set_reference_system_metadata(
+                session,
+                reference_id=new_reference_id,
+                system_metadata=updated,
+            )
+        return
+
+
 def _sanitize_filename(name: str | None, fallback: str) -> str:
    n = os.path.basename((name or "").strip() or fallback)
    return n if n else fallback
--- a/app/assets/services/metadata_extract.py
+++ b/app/assets/services/metadata_extract.py
@ -4,7 +4,6 @@ Tier 1: Filesystem metadata (zero parsing)
 Tier 2: Safetensors header metadata (fast JSON read only)
 """

-from __future__ import annotations

 import json
 import logging
--- a/app/assets/services/schemas.py
+++ b/app/assets/services/schemas.py
@ -56,7 +56,6 @@ class IngestResult:

 class TagUsage(NamedTuple):
    name: str
-    tag_type: str
    count: int


@ -71,6 +70,7 @@ class AssetSummaryData:
 class ListAssetsResult:
    items: list[AssetSummaryData]
    total: int
+    next_cursor: str | None = None


@dataclass(frozen=True)
--- a/app/assets/services/tagging.py
+++ b/app/assets/services/tagging.py
@ -75,7 +75,7 @@ def list_tags(
            owner_id=owner_id,
        )

-    return [TagUsage(name, tag_type, count) for name, tag_type, count in rows], total
+    return [TagUsage(name, count) for name, count in rows], total


 def list_tag_histogram(
--- a/app/custom_node_manager.py
+++ b/app/custom_node_manager.py
@ -1,5 +1,3 @@
-from __future__ import annotations
-
 import os
 import folder_paths
 import glob
--- a/app/frontend_management.py
+++ b/app/frontend_management.py
@ -1,4 +1,3 @@
-from __future__ import annotations
 import argparse
 import logging
 import os
@ -62,6 +61,8 @@ def get_comfy_package_versions():
 def check_comfy_packages_versions():
    """Warn for every comfy* package whose installed version is below requirements.txt."""
    from packaging.version import InvalidVersion, parse as parse_pep440
+    outdated_packages = []
+
    for pkg in get_comfy_package_versions():
        installed_str = pkg["installed"]
        required_str = pkg["required"]
@ -73,19 +74,26 @@ def check_comfy_packages_versions():
            logging.error(f"Failed to check {pkg['name']} version: {e}")
            continue
        if outdated:
-            app.logger.log_startup_warning(
-                f"""
+            outdated_packages.append((pkg["name"], installed_str, required_str))
+        else:
+            logging.info("{} version: {}".format(pkg["name"], installed_str))
+
+    if outdated_packages:
+        package_warnings = "\n".join(
+            f"Installed {name} version {installed} is lower than the recommended version {required}."
+            for name, installed, required in outdated_packages
+        )
+        app.logger.log_startup_warning(
+            f"""
 ________________________________________________________________________
 WARNING WARNING WARNING WARNING WARNING

-Installed {pkg["name"]} version {installed_str} is lower than the recommended version {required_str}.
+{package_warnings}

 {get_missing_requirements_message()}
 ________________________________________________________________________
 """.strip()
-            )
-        else:
-            logging.info("{} version: {}".format(pkg["name"], installed_str))
+        )


 REQUEST_TIMEOUT = 10  # seconds
--- a/app/logger.py
+++ b/app/logger.py
@ -5,6 +5,40 @@ import logging
 import sys
 import threading

+ANSI_NAMED_COLORS = {
+    'black':   '\033[30m',
+    'red':     '\033[31m',
+    'green':   '\033[32m',
+    'yellow':  '\033[33m',
+    'blue':    '\033[34m',
+    'magenta': '\033[35m',
+    'cyan':    '\033[36m',
+    'white':   '\033[37m',
+}
+
+ANSI_LEVEL_COLORS = {
+    'DEBUG':    ANSI_NAMED_COLORS['cyan'],
+    'INFO':     ANSI_NAMED_COLORS['green'],
+    'WARNING':  ANSI_NAMED_COLORS['yellow'],
+    'ERROR':    ANSI_NAMED_COLORS['red'],
+    'CRITICAL': ANSI_NAMED_COLORS['magenta'],
+}
+
+ANSI_RESET = '\033[0m'
+ANSI_BOLD  = '\033[1m'
+
+
+class ColoredFormatter(logging.Formatter):
+    def format(self, record):
+        color = ANSI_LEVEL_COLORS.get(record.levelname, '')
+        bold  = ANSI_BOLD if record.levelno >= logging.WARNING else ''
+        level_tag = f"{bold}{color}[{record.levelname}]{ANSI_RESET} "
+        message = super().format(record)
+        line_color = ANSI_NAMED_COLORS.get(getattr(record, 'color', ''), '')
+        if line_color:
+            return f"{level_tag}{line_color}{message}{ANSI_RESET}"
+        return level_tag + message
+
 logs = None
 stdout_interceptor = None
 stderr_interceptor = None
@ -68,8 +102,10 @@ def setup_logger(log_level: str = 'INFO', capacity: int = 300, use_stdout: bool
    logger = logging.getLogger()
    logger.setLevel(log_level)

+    formatter = ColoredFormatter("%(message)s")
+
    stream_handler = logging.StreamHandler()
-    stream_handler.setFormatter(logging.Formatter("%(message)s"))
+    stream_handler.setFormatter(formatter)

    if use_stdout:
        # Only errors and critical to stderr
@ -77,7 +113,7 @@ def setup_logger(log_level: str = 'INFO', capacity: int = 300, use_stdout: bool

        # Lesser to stdout
        stdout_handler = logging.StreamHandler(sys.stdout)
-        stdout_handler.setFormatter(logging.Formatter("%(message)s"))
+        stdout_handler.setFormatter(formatter)
        stdout_handler.addFilter(lambda record: record.levelno < logging.ERROR)
        logger.addHandler(stdout_handler)

--- a/app/model_manager.py
+++ b/app/model_manager.py
@ -1,5 +1,3 @@
-from __future__ import annotations
-
 import os
 import base64
 import json
--- a/app/user_manager.py
+++ b/app/user_manager.py
@ -1,4 +1,3 @@
-from __future__ import annotations
 import json
 import os
 import re
--- a/blueprints/Audio
+++ b/blueprints/Audio
--- a/blueprints/Audio
+++ b/blueprints/Audio
--- a/(Z-Image-Turbo).json
+++ b/(Z-Image-Turbo).json
@ -1553,7 +1553,7 @@
          "VHS_MetadataImage": true,
          "VHS_KeepIntermediate": true
        },
-        "category": "Image generation and editing/Canny to image",
+        "category": "Image generation and editing/Conditioned",
        "description": "Generates an image from a Canny edge map using Z-Image-Turbo, with text conditioning."
      }
    ]
--- a/blueprints/Canny
+++ b/blueprints/Canny
@ -3600,7 +3600,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Video generation and editing/Canny to video",
+        "category": "Video generation and editing/Conditioned",
        "description": "Generates video from Canny edge maps using LTX-2, with optional synchronized audio."
      }
    ]
--- a/blueprints/ControlNet
+++ b/blueprints/ControlNet
@ -1401,7 +1401,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Image generation and editing/ControlNet",
+        "category": "Image generation and editing/Conditioned",
        "description": "Generates images from a text prompt and ControlNet conditioning (e.g. depth, canny) using Z-Image-Turbo."
      }
    ]
--- a/(Z-Image-Turbo).json
+++ b/(Z-Image-Turbo).json
@ -1579,7 +1579,7 @@
          "VHS_MetadataImage": true,
          "VHS_KeepIntermediate": true
        },
-        "category": "Image generation and editing/Depth to image",
+        "category": "Image generation and editing/Conditioned",
        "description": "Generates an image from a depth map using Z-Image-Turbo with text conditioning."
      },
      {
--- a/blueprints/Depth
+++ b/blueprints/Depth
@ -4233,7 +4233,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Video generation and editing/Depth to video",
+        "category": "Video generation and editing/Conditioned",
        "description": "Generates depth-controlled video with LTX-2: motion and structure follow a depth-reference video alongside text prompting, optional first-frame image conditioning, with optional synchronized audio."
      },
      {
--- a/blueprints/First-Last-Frame
+++ b/blueprints/First-Last-Frame
@ -3350,7 +3350,7 @@
          }
        ],
        "extra": {},
-        "category": "Video generation and editing/First-Last-Frame to Video",
+        "category": "Video generation and editing/Conditioned",
        "description": "Generates a video interpolating between first and last keyframes using LTX-2.3."
      }
    ]
--- a/blueprints/First-Last-Frame
+++ b/blueprints/First-Last-Frame
@ -3350,7 +3350,7 @@
          }
        ],
        "extra": {},
-        "category": "Video generation and editing/First-Last-Frame to Video",
+        "category": "Video generation and editing/FLF2V",
        "description": "Generates a video that interpolates between the first and last keyframes using LTX-2.3, including optional audio."
      }
    ]
--- a/blueprints/Geometry
+++ b/blueprints/Geometry
--- a/blueprints/Image
+++ b/blueprints/Image
@ -310,9 +310,9 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Text generation/Image Captioning",
+        "category": "Image Tools",
        "description": "Generates descriptive captions for images using Google's Gemini multimodal LLM."
      }
    ]
  }
-}
+}
--- a/blueprints/Image
+++ b/blueprints/Image
@ -1,19 +1,18 @@
 {
-  "id": "6af0a6c1-0161-4528-8685-65776e838d44",
  "revision": 0,
-  "last_node_id": 75,
-  "last_link_id": 245,
+  "last_node_id": 76,
+  "last_link_id": 0,
  "nodes": [
    {
-      "id": 75,
-      "type": "488652fd-6edf-4d06-8f9f-4d84d3a34eaf",
+      "id": 76,
+      "type": "96338968-1242-4f02-b6a1-d496af4bcffe",
      "pos": [
-        600,
-        830
+        670,
+        1280
      ],
      "size": [
        400,
-        110
+        201.3125
      ],
      "flags": {},
      "order": 0,
@ -59,47 +58,44 @@
          "links": []
        }
      ],
+      "title": "Image Depth Estimation (Lotus Depth)",
      "properties": {
        "proxyWidgets": [
          [
-            "-1",
+            "28",
            "sigma"
          ],
          [
-            "-1",
+            "10",
            "unet_name"
          ],
          [
-            "-1",
+            "14",
            "vae_name"
          ]
        ],
        "cnr_id": "comfy-core",
        "ver": "0.14.1"
      },
-      "widgets_values": [
-        999.0000000000002,
-        "lotus-depth-d-v1-1.safetensors",
-        "vae-ft-mse-840000-ema-pruned.safetensors"
-      ]
+      "widgets_values": []
    }
  ],
  "links": [],
-  "groups": [],
+  "version": 0.4,
  "definitions": {
    "subgraphs": [
      {
-        "id": "488652fd-6edf-4d06-8f9f-4d84d3a34eaf",
+        "id": "96338968-1242-4f02-b6a1-d496af4bcffe",
        "version": 1,
        "state": {
          "lastGroupId": 1,
-          "lastNodeId": 75,
+          "lastNodeId": 76,
          "lastLinkId": 245,
          "lastRerouteId": 0
        },
        "revision": 0,
        "config": {},
-        "name": "Image to Depth Map (Lotus)",
+        "name": "Image Depth Estimation (Lotus Depth)",
        "inputNode": {
          "id": -10,
          "bounding": [
@ -191,12 +187,12 @@
            "id": 10,
            "type": "UNETLoader",
            "pos": [
-              108.05555555555557,
-              -253.05555555555557
+              110,
+              -250
            ],
            "size": [
-              254.93706597222226,
-              82
+              260,
+              90
            ],
            "flags": {},
            "order": 4,
@ -234,9 +230,9 @@
              }
            ],
            "properties": {
+              "Node name for S&R": "UNETLoader",
              "cnr_id": "comfy-core",
              "ver": "0.3.34",
-              "Node name for S&R": "UNETLoader",
              "models": [
                {
                  "name": "lotus-depth-d-v1-1.safetensors",
@ -255,12 +251,12 @@
            "id": 18,
            "type": "DisableNoise",
            "pos": [
-              607.0641494069639,
-              -268.33337840371513
+              610,
+              -270
            ],
            "size": [
-              175,
-              33.333333333333336
+              180,
+              40
            ],
            "flags": {},
            "order": 0,
@ -278,26 +274,25 @@
              }
            ],
            "properties": {
+              "Node name for S&R": "DisableNoise",
              "cnr_id": "comfy-core",
              "ver": "0.3.34",
-              "Node name for S&R": "DisableNoise",
              "widget_ue_connectable": {}
-            },
-            "widgets_values": []
+            }
          },
          {
-            "id": 23,
+            "id": 74,
            "type": "VAEEncode",
            "pos": [
              620,
              160
            ],
            "size": [
-              175,
+              180,
              50
            ],
            "flags": {},
-            "order": 10,
+            "order": 11,
            "mode": 0,
            "inputs": [
              {
@ -325,12 +320,11 @@
              }
            ],
            "properties": {
+              "Node name for S&R": "VAEEncode",
              "cnr_id": "comfy-core",
              "ver": "0.3.34",
-              "Node name for S&R": "VAEEncode",
              "widget_ue_connectable": {}
-            },
-            "widgets_values": []
+            }
          },
          {
            "id": 21,
@ -341,7 +335,7 @@
            ],
            "size": [
              210,
-              58
+              60
            ],
            "flags": {},
            "order": 1,
@ -369,9 +363,9 @@
              }
            ],
            "properties": {
+              "Node name for S&R": "KSamplerSelect",
              "cnr_id": "comfy-core",
              "ver": "0.3.34",
-              "Node name for S&R": "KSamplerSelect",
              "widget_ue_connectable": {}
            },
            "widgets_values": [
@ -386,7 +380,7 @@
              -170
            ],
            "size": [
-              175,
+              180,
              50
            ],
            "flags": {},
@ -418,12 +412,11 @@
              }
            ],
            "properties": {
+              "Node name for S&R": "BasicGuider",
              "cnr_id": "comfy-core",
              "ver": "0.3.34",
-              "Node name for S&R": "BasicGuider",
              "widget_ue_connectable": {}
-            },
-            "widgets_values": []
+            }
          },
          {
            "id": 16,
@ -433,8 +426,8 @@
              -130
            ],
            "size": [
-              295.99609375,
-              271.65798611111114
+              300,
+              280
            ],
            "flags": {},
            "order": 6,
@ -490,12 +483,11 @@
              }
            ],
            "properties": {
+              "Node name for S&R": "SamplerCustomAdvanced",
              "cnr_id": "comfy-core",
              "ver": "0.3.34",
-              "Node name for S&R": "SamplerCustomAdvanced",
              "widget_ue_connectable": {}
-            },
-            "widgets_values": []
+            }
          },
          {
            "id": 28,
@ -506,10 +498,10 @@
            ],
            "size": [
              210,
-              58
+              60
            ],
            "flags": {},
-            "order": 11,
+            "order": 10,
            "mode": 0,
            "inputs": [
              {
@ -540,9 +532,9 @@
              }
            ],
            "properties": {
+              "Node name for S&R": "SetFirstSigma",
              "cnr_id": "comfy-core",
              "ver": "0.3.34",
-              "Node name for S&R": "SetFirstSigma",
              "widget_ue_connectable": {}
            },
            "widgets_values": [
@ -557,7 +549,7 @@
              -120
            ],
            "size": [
-              175,
+              180,
              50
            ],
            "flags": {},
@ -589,12 +581,11 @@
              }
            ],
            "properties": {
+              "Node name for S&R": "VAEDecode",
              "cnr_id": "comfy-core",
              "ver": "0.3.34",
-              "Node name for S&R": "VAEDecode",
              "widget_ue_connectable": {}
-            },
-            "widgets_values": []
+            }
          },
          {
            "id": 22,
@ -604,8 +595,8 @@
              -220
            ],
            "size": [
-              175,
-              33.333333333333336
+              180,
+              40
            ],
            "flags": {},
            "order": 9,
@ -630,12 +621,11 @@
              }
            ],
            "properties": {
+              "Node name for S&R": "ImageInvert",
              "cnr_id": "comfy-core",
              "ver": "0.3.34",
-              "Node name for S&R": "ImageInvert",
              "widget_ue_connectable": {}
-            },
-            "widgets_values": []
+            }
          },
          {
            "id": 14,
@ -645,8 +635,8 @@
              -90
            ],
            "size": [
-              254.93706597222226,
-              58
+              260,
+              60
            ],
            "flags": {},
            "order": 5,
@ -675,9 +665,9 @@
              }
            ],
            "properties": {
+              "Node name for S&R": "VAELoader",
              "cnr_id": "comfy-core",
              "ver": "0.3.34",
-              "Node name for S&R": "VAELoader",
              "models": [
                {
                  "name": "vae-ft-mse-840000-ema-pruned.safetensors",
@ -692,15 +682,15 @@
            ]
          },
          {
-            "id": 68,
+            "id": 75,
            "type": "LotusConditioning",
            "pos": [
              400,
              -150
            ],
            "size": [
-              175,
-              33.333333333333336
+              180,
+              40
            ],
            "flags": {},
            "order": 2,
@ -718,12 +708,11 @@
              }
            ],
            "properties": {
+              "Node name for S&R": "LotusConditioning",
              "cnr_id": "comfy-core",
              "ver": "0.3.34",
-              "Node name for S&R": "LotusConditioning",
              "widget_ue_connectable": {}
-            },
-            "widgets_values": []
+            }
          },
          {
            "id": 20,
@ -734,7 +723,7 @@
            ],
            "size": [
              210,
-              106
+              110
            ],
            "flags": {},
            "order": 8,
@ -786,9 +775,9 @@
              }
            ],
            "properties": {
+              "Node name for S&R": "BasicScheduler",
              "cnr_id": "comfy-core",
              "ver": "0.3.34",
-              "Node name for S&R": "BasicScheduler",
              "widget_ue_connectable": {}
            },
            "widgets_values": [
@ -850,7 +839,7 @@
          },
          {
            "id": 201,
-            "origin_id": 23,
+            "origin_id": 74,
            "origin_slot": 0,
            "target_id": 16,
            "target_slot": 4,
@ -866,7 +855,7 @@
          },
          {
            "id": 238,
-            "origin_id": 68,
+            "origin_id": 75,
            "origin_slot": 0,
            "target_id": 19,
            "target_slot": 1,
@ -892,7 +881,7 @@
            "id": 38,
            "origin_id": 14,
            "origin_slot": 0,
-            "target_id": 23,
+            "target_id": 74,
            "target_slot": 1,
            "type": "VAE"
          },
@ -908,7 +897,7 @@
            "id": 37,
            "origin_id": -10,
            "origin_slot": 0,
-            "target_id": 23,
+            "target_id": 74,
            "target_slot": 0,
            "type": "IMAGE"
          },
@ -948,12 +937,11 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Image generation and editing/Depth to image",
+        "category": "Conditioning & Preprocessors/Depth",
        "description": "Estimates a monocular depth map from an input image using the Lotus depth estimation model."
      }
    ]
  },
-  "config": {},
  "extra": {
    "ds": {
      "scale": 1.3589709866044692,
@ -961,8 +949,6 @@
        -138.53613935617864,
        -786.0629126022195
      ]
-    },
-    "workflowRendererVersion": "LG"
-  },
-  "version": 0.4
+    }
+  }
 }
--- a/blueprints/Image
+++ b/blueprints/Image
--- a/(Mediapipe).json
+++ b/(Mediapipe).json
@ -0,0 +1,779 @@
+{
+  "revision": 0,
+  "last_node_id": 33,
+  "last_link_id": 0,
+  "nodes": [
+    {
+      "id": 33,
+      "type": "6062babb-b649-4a71-be9e-20ebce567744",
+      "pos": [
+        -450,
+        4240
+      ],
+      "size": [
+        420,
+        400
+      ],
+      "flags": {},
+      "order": 0,
+      "mode": 0,
+      "inputs": [
+        {
+          "localized_name": "image",
+          "name": "image",
+          "type": "IMAGE",
+          "link": null
+        },
+        {
+          "name": "face_landmarker",
+          "type": "FACE_LANDMARKER",
+          "link": null
+        },
+        {
+          "name": "detector_variant",
+          "type": "COMBO",
+          "widget": {
+            "name": "detector_variant"
+          },
+          "link": null
+        },
+        {
+          "name": "num_faces",
+          "type": "INT",
+          "widget": {
+            "name": "num_faces"
+          },
+          "link": null
+        },
+        {
+          "label": "custom_face_oval",
+          "name": "regions.face_oval",
+          "type": "BOOLEAN",
+          "widget": {
+            "name": "regions.face_oval"
+          },
+          "link": null
+        },
+        {
+          "label": "custom_lips",
+          "name": "regions.lips",
+          "type": "BOOLEAN",
+          "widget": {
+            "name": "regions.lips"
+          },
+          "link": null
+        },
+        {
+          "label": "custom_left_eye",
+          "name": "regions.left_eye",
+          "type": "BOOLEAN",
+          "widget": {
+            "name": "regions.left_eye"
+          },
+          "link": null
+        },
+        {
+          "label": "custom_right_eye",
+          "name": "regions.right_eye",
+          "type": "BOOLEAN",
+          "widget": {
+            "name": "regions.right_eye"
+          },
+          "link": null
+        },
+        {
+          "label": "custom_irises",
+          "name": "regions.irises",
+          "type": "BOOLEAN",
+          "widget": {
+            "name": "regions.irises"
+          },
+          "link": null
+        },
+        {
+          "name": "model_name",
+          "type": "COMBO",
+          "widget": {
+            "name": "model_name"
+          },
+          "link": null
+        }
+      ],
+      "outputs": [
+        {
+          "localized_name": "face_landmarks",
+          "name": "face_landmarks",
+          "type": "FACE_LANDMARKS",
+          "links": []
+        },
+        {
+          "localized_name": "bboxes",
+          "name": "bboxes",
+          "type": "BOUNDING_BOX",
+          "links": []
+        },
+        {
+          "label": "mask",
+          "name": "MASK_1",
+          "type": "MASK",
+          "links": []
+        }
+      ],
+      "title": "Image Face Detection (Mediapipe)",
+      "properties": {
+        "proxyWidgets": [
+          [
+            "11",
+            "detector_variant"
+          ],
+          [
+            "11",
+            "num_faces"
+          ],
+          [
+            "20",
+            "regions.face_oval"
+          ],
+          [
+            "20",
+            "regions.lips"
+          ],
+          [
+            "20",
+            "regions.left_eye"
+          ],
+          [
+            "20",
+            "regions.right_eye"
+          ],
+          [
+            "20",
+            "regions.irises"
+          ],
+          [
+            "2",
+            "model_name"
+          ]
+        ],
+        "cnr_id": "comfy-core",
+        "ver": "0.22.0",
+        "enableTabs": false,
+        "tabWidth": 65,
+        "tabXOffset": 10,
+        "hasSecondTab": false,
+        "secondTabText": "Send Back",
+        "secondTabOffset": 80,
+        "secondTabWidth": 65
+      },
+      "widgets_values": []
+    }
+  ],
+  "links": [],
+  "version": 0.4,
+  "definitions": {
+    "subgraphs": [
+      {
+        "id": "6062babb-b649-4a71-be9e-20ebce567744",
+        "version": 1,
+        "state": {
+          "lastGroupId": 2,
+          "lastNodeId": 158,
+          "lastLinkId": 140,
+          "lastRerouteId": 0
+        },
+        "revision": 0,
+        "config": {},
+        "name": "Image Face Detection (Mediapipe)",
+        "description": "Detects facial landmarks from an image using MediaPipe, outputting landmark data, face bounding boxes, and an optional face-region mask.",
+        "inputNode": {
+          "id": -10,
+          "bounding": [
+            -710,
+            4300,
+            148.880859375,
+            248
+          ]
+        },
+        "outputNode": {
+          "id": -20,
+          "bounding": [
+            140,
+            4480,
+            137.677734375,
+            108
+          ]
+        },
+        "inputs": [
+          {
+            "id": "705dc1ae-6dc9-4155-92df-52f816ad451e",
+            "name": "image",
+            "type": "IMAGE",
+            "linkIds": [
+              60
+            ],
+            "localized_name": "image",
+            "pos": [
+              -585.119140625,
+              4324
+            ]
+          },
+          {
+            "id": "d6277190-732c-4604-b7cd-d3a9588bf761",
+            "name": "face_landmarker",
+            "type": "FACE_LANDMARKER",
+            "linkIds": [
+              74
+            ],
+            "pos": [
+              -585.119140625,
+              4344
+            ]
+          },
+          {
+            "id": "ac473a08-6a86-42a7-b460-e70c6c5e1e2b",
+            "name": "detector_variant",
+            "type": "COMBO",
+            "linkIds": [
+              75
+            ],
+            "pos": [
+              -585.119140625,
+              4364
+            ]
+          },
+          {
+            "id": "1bec2252-ca2d-496e-8a33-33a61d21f897",
+            "name": "num_faces",
+            "type": "INT",
+            "linkIds": [
+              76
+            ],
+            "pos": [
+              -585.119140625,
+              4384
+            ]
+          },
+          {
+            "id": "17994fa2-0ea0-4c9b-a70a-19789c459c80",
+            "name": "regions.face_oval",
+            "type": "BOOLEAN",
+            "linkIds": [
+              77
+            ],
+            "label": "custom_face_oval",
+            "pos": [
+              -585.119140625,
+              4404
+            ]
+          },
+          {
+            "id": "1c6c5893-2aee-4c37-b702-15ef2e20d863",
+            "name": "regions.lips",
+            "type": "BOOLEAN",
+            "linkIds": [
+              78
+            ],
+            "label": "custom_lips",
+            "pos": [
+              -585.119140625,
+              4424
+            ]
+          },
+          {
+            "id": "f353fcea-4b6f-42a1-8fdd-32b3aa1e1f09",
+            "name": "regions.left_eye",
+            "type": "BOOLEAN",
+            "linkIds": [
+              79
+            ],
+            "label": "custom_left_eye",
+            "pos": [
+              -585.119140625,
+              4444
+            ]
+          },
+          {
+            "id": "1387e121-c1fb-4522-8f0d-43459e11dd86",
+            "name": "regions.right_eye",
+            "type": "BOOLEAN",
+            "linkIds": [
+              80
+            ],
+            "label": "custom_right_eye",
+            "pos": [
+              -585.119140625,
+              4464
+            ]
+          },
+          {
+            "id": "14acb0a0-d1f4-48f3-ba31-811b26236ef9",
+            "name": "regions.irises",
+            "type": "BOOLEAN",
+            "linkIds": [
+              81
+            ],
+            "label": "custom_irises",
+            "pos": [
+              -585.119140625,
+              4484
+            ]
+          },
+          {
+            "id": "25a82859-87de-42c8-8431-09948665546e",
+            "name": "model_name",
+            "type": "COMBO",
+            "linkIds": [
+              86
+            ],
+            "pos": [
+              -585.119140625,
+              4504
+            ]
+          }
+        ],
+        "outputs": [
+          {
+            "id": "d2ba3f92-e8b1-49c3-9590-cfad56c54cf4",
+            "name": "face_landmarks",
+            "type": "FACE_LANDMARKS",
+            "linkIds": [
+              44
+            ],
+            "localized_name": "face_landmarks",
+            "pos": [
+              164,
+              4504
+            ]
+          },
+          {
+            "id": "4f356bb0-d4c4-4f93-b4cf-0845a65c4e6d",
+            "name": "bboxes",
+            "type": "BOUNDING_BOX",
+            "linkIds": [
+              25
+            ],
+            "localized_name": "bboxes",
+            "pos": [
+              164,
+              4524
+            ]
+          },
+          {
+            "id": "f6309e1d-6397-4363-b38f-778a122abc51",
+            "name": "MASK_1",
+            "type": "MASK",
+            "linkIds": [
+              83
+            ],
+            "label": "mask",
+            "pos": [
+              164,
+              4544
+            ]
+          }
+        ],
+        "widgets": [],
+        "nodes": [
+          {
+            "id": 11,
+            "type": "MediaPipeFaceLandmarker",
+            "pos": [
+              -280,
+              4280
+            ],
+            "size": [
+              350,
+              220
+            ],
+            "flags": {},
+            "order": 1,
+            "mode": 0,
+            "inputs": [
+              {
+                "localized_name": "face_detection_model",
+                "name": "face_detection_model",
+                "type": "FACE_DETECTION_MODEL",
+                "link": 66
+              },
+              {
+                "localized_name": "image",
+                "name": "image",
+                "type": "IMAGE",
+                "link": 60
+              },
+              {
+                "localized_name": "detector_variant",
+                "name": "detector_variant",
+                "type": "COMBO",
+                "widget": {
+                  "name": "detector_variant"
+                },
+                "link": 75
+              },
+              {
+                "localized_name": "num_faces",
+                "name": "num_faces",
+                "type": "INT",
+                "widget": {
+                  "name": "num_faces"
+                },
+                "link": 76
+              },
+              {
+                "localized_name": "min_confidence",
+                "name": "min_confidence",
+                "type": "FLOAT",
+                "widget": {
+                  "name": "min_confidence"
+                },
+                "link": null
+              },
+              {
+                "localized_name": "missing_frame_fallback",
+                "name": "missing_frame_fallback",
+                "type": "COMBO",
+                "widget": {
+                  "name": "missing_frame_fallback"
+                },
+                "link": null
+              },
+              {
+                "name": "face_landmarker",
+                "type": "FACE_LANDMARKER",
+                "link": 74
+              }
+            ],
+            "outputs": [
+              {
+                "localized_name": "face_landmarks",
+                "name": "face_landmarks",
+                "type": "FACE_LANDMARKS",
+                "links": [
+                  44,
+                  46
+                ]
+              },
+              {
+                "localized_name": "bboxes",
+                "name": "bboxes",
+                "type": "BOUNDING_BOX",
+                "links": [
+                  25
+                ]
+              }
+            ],
+            "properties": {
+              "Node name for S&R": "MediaPipeFaceLandmarker",
+              "cnr_id": "comfy-core",
+              "ver": "0.22.0",
+              "enableTabs": false,
+              "tabWidth": 65,
+              "tabXOffset": 10,
+              "hasSecondTab": false,
+              "secondTabText": "Send Back",
+              "secondTabOffset": 80,
+              "secondTabWidth": 65
+            },
+            "widgets_values": [
+              "full",
+              0,
+              0.5,
+              "empty"
+            ]
+          },
+          {
+            "id": 2,
+            "type": "LoadMediaPipeFaceLandmarker",
+            "pos": [
+              -290,
+              4060
+            ],
+            "size": [
+              350,
+              140
+            ],
+            "flags": {},
+            "order": 0,
+            "mode": 0,
+            "inputs": [
+              {
+                "localized_name": "model_name",
+                "name": "model_name",
+                "type": "COMBO",
+                "widget": {
+                  "name": "model_name"
+                },
+                "link": 86
+              }
+            ],
+            "outputs": [
+              {
+                "localized_name": "FACE_DETECTION_MODEL",
+                "name": "FACE_DETECTION_MODEL",
+                "type": "FACE_DETECTION_MODEL",
+                "links": [
+                  66
+                ]
+              }
+            ],
+            "properties": {
+              "Node name for S&R": "LoadMediaPipeFaceLandmarker",
+              "cnr_id": "comfy-core",
+              "ver": "0.22.0",
+              "models": [
+                {
+                  "name": "mediapipe_face_fp32.safetensors",
+                  "url": "https://huggingface.co/Comfy-Org/mediapipe/resolve/main/detection/mediapipe_face_fp32.safetensors",
+                  "directory": "detection"
+                }
+              ],
+              "enableTabs": false,
+              "tabWidth": 65,
+              "tabXOffset": 10,
+              "hasSecondTab": false,
+              "secondTabText": "Send Back",
+              "secondTabOffset": 80,
+              "secondTabWidth": 65
+            },
+            "widgets_values": [
+              "mediapipe_face_fp32.safetensors"
+            ]
+          },
+          {
+            "id": 20,
+            "type": "MediaPipeFaceMask",
+            "pos": [
+              -290,
+              4560
+            ],
+            "size": [
+              360,
+              180
+            ],
+            "flags": {},
+            "order": 2,
+            "mode": 0,
+            "inputs": [
+              {
+                "localized_name": "face_landmarks",
+                "name": "face_landmarks",
+                "type": "FACE_LANDMARKS",
+                "link": 46
+              },
+              {
+                "localized_name": "regions",
+                "name": "regions",
+                "type": "COMFY_DYNAMICCOMBO_V3",
+                "widget": {
+                  "name": "regions"
+                },
+                "link": null
+              },
+              {
+                "localized_name": "regions.face_oval",
+                "name": "regions.face_oval",
+                "type": "BOOLEAN",
+                "widget": {
+                  "name": "regions.face_oval"
+                },
+                "link": 77
+              },
+              {
+                "localized_name": "regions.lips",
+                "name": "regions.lips",
+                "type": "BOOLEAN",
+                "widget": {
+                  "name": "regions.lips"
+                },
+                "link": 78
+              },
+              {
+                "localized_name": "regions.left_eye",
+                "name": "regions.left_eye",
+                "type": "BOOLEAN",
+                "widget": {
+                  "name": "regions.left_eye"
+                },
+                "link": 79
+              },
+              {
+                "localized_name": "regions.right_eye",
+                "name": "regions.right_eye",
+                "type": "BOOLEAN",
+                "widget": {
+                  "name": "regions.right_eye"
+                },
+                "link": 80
+              },
+              {
+                "localized_name": "regions.irises",
+                "name": "regions.irises",
+                "type": "BOOLEAN",
+                "widget": {
+                  "name": "regions.irises"
+                },
+                "link": 81
+              }
+            ],
+            "outputs": [
+              {
+                "localized_name": "MASK",
+                "name": "MASK",
+                "type": "MASK",
+                "links": [
+                  83
+                ]
+              }
+            ],
+            "properties": {
+              "Node name for S&R": "MediaPipeFaceMask",
+              "cnr_id": "comfy-core",
+              "ver": "0.22.0",
+              "enableTabs": false,
+              "tabWidth": 65,
+              "tabXOffset": 10,
+              "hasSecondTab": false,
+              "secondTabText": "Send Back",
+              "secondTabOffset": 80,
+              "secondTabWidth": 65
+            },
+            "widgets_values": [
+              "custom",
+              true,
+              false,
+              false,
+              false,
+              false
+            ]
+          }
+        ],
+        "groups": [],
+        "links": [
+          {
+            "id": 66,
+            "origin_id": 2,
+            "origin_slot": 0,
+            "target_id": 11,
+            "target_slot": 0,
+            "type": "FACE_DETECTION_MODEL"
+          },
+          {
+            "id": 46,
+            "origin_id": 11,
+            "origin_slot": 0,
+            "target_id": 20,
+            "target_slot": 0,
+            "type": "FACE_LANDMARKS"
+          },
+          {
+            "id": 60,
+            "origin_id": -10,
+            "origin_slot": 0,
+            "target_id": 11,
+            "target_slot": 1,
+            "type": "IMAGE"
+          },
+          {
+            "id": 44,
+            "origin_id": 11,
+            "origin_slot": 0,
+            "target_id": -20,
+            "target_slot": 0,
+            "type": "FACE_LANDMARKS"
+          },
+          {
+            "id": 25,
+            "origin_id": 11,
+            "origin_slot": 1,
+            "target_id": -20,
+            "target_slot": 1,
+            "type": "BOUNDING_BOX"
+          },
+          {
+            "id": 74,
+            "origin_id": -10,
+            "origin_slot": 1,
+            "target_id": 11,
+            "target_slot": 6,
+            "type": "FACE_LANDMARKER"
+          },
+          {
+            "id": 75,
+            "origin_id": -10,
+            "origin_slot": 2,
+            "target_id": 11,
+            "target_slot": 2,
+            "type": "COMBO"
+          },
+          {
+            "id": 76,
+            "origin_id": -10,
+            "origin_slot": 3,
+            "target_id": 11,
+            "target_slot": 3,
+            "type": "INT"
+          },
+          {
+            "id": 77,
+            "origin_id": -10,
+            "origin_slot": 4,
+            "target_id": 20,
+            "target_slot": 2,
+            "type": "BOOLEAN"
+          },
+          {
+            "id": 78,
+            "origin_id": -10,
+            "origin_slot": 5,
+            "target_id": 20,
+            "target_slot": 3,
+            "type": "BOOLEAN"
+          },
+          {
+            "id": 79,
+            "origin_id": -10,
+            "origin_slot": 6,
+            "target_id": 20,
+            "target_slot": 4,
+            "type": "BOOLEAN"
+          },
+          {
+            "id": 80,
+            "origin_id": -10,
+            "origin_slot": 7,
+            "target_id": 20,
+            "target_slot": 5,
+            "type": "BOOLEAN"
+          },
+          {
+            "id": 81,
+            "origin_id": -10,
+            "origin_slot": 8,
+            "target_id": 20,
+            "target_slot": 6,
+            "type": "BOOLEAN"
+          },
+          {
+            "id": 83,
+            "origin_id": 20,
+            "origin_slot": 0,
+            "target_id": -20,
+            "target_slot": 2,
+            "type": "MASK"
+          },
+          {
+            "id": 86,
+            "origin_id": -10,
+            "origin_slot": 9,
+            "target_id": 2,
+            "target_slot": 0,
+            "type": "COMBO"
+          }
+        ],
+        "extra": {},
+        "category": "Conditioning & Preprocessors/Face Detection"
+      }
+    ]
+  },
+  "extra": {}
+}
--- a/blueprints/Image
+++ b/blueprints/Image
@ -703,7 +703,7 @@
          }
        ],
        "extra": {},
-        "category": "Image Tools/Image Segmentation",
+        "category": "Conditioning & Preprocessors/Segmentation & Mask",
        "description": "Segments images into masks using Meta SAM3 from text prompts, points, or boxes."
      }
    ]
--- a/Upscale(Z-image-Turbo).json
+++ b/Upscale(Z-image-Turbo).json
@ -1302,7 +1302,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Image generation and editing/Enhance",
+        "category": "Image generation and editing/Upscale",
        "description": "Upscales images to higher resolution using Z-Image-Turbo."
      }
    ]
@ -1312,4 +1312,4 @@
    "workflowRendererVersion": "LG"
  },
  "version": 0.4
-}
+}
--- a/Multi-Person).json
+++ b/Multi-Person).json
--- a/(SDPose-OOD).json
+++ b/(SDPose-OOD).json
@ -0,0 +1,888 @@
+{
+  "revision": 0,
+  "last_node_id": 675,
+  "last_link_id": 0,
+  "nodes": [
+    {
+      "id": 675,
+      "type": "01b6a731-fb78-4070-9a38-c87146da9604",
+      "pos": [
+        -2480,
+        3400
+      ],
+      "size": [
+        360,
+        433.3125
+      ],
+      "flags": {},
+      "order": 2,
+      "mode": 0,
+      "inputs": [
+        {
+          "localized_name": "input",
+          "name": "input",
+          "type": "IMAGE,MASK",
+          "link": null
+        },
+        {
+          "label": "resize_target_longer_size",
+          "name": "resize_type.longer_size",
+          "type": "INT",
+          "widget": {
+            "name": "resize_type.longer_size"
+          },
+          "link": null
+        },
+        {
+          "name": "scale_method",
+          "type": "COMBO",
+          "widget": {
+            "name": "scale_method"
+          },
+          "link": null
+        },
+        {
+          "name": "draw_body",
+          "type": "BOOLEAN",
+          "widget": {
+            "name": "draw_body"
+          },
+          "link": null
+        },
+        {
+          "name": "draw_hands",
+          "type": "BOOLEAN",
+          "widget": {
+            "name": "draw_hands"
+          },
+          "link": null
+        },
+        {
+          "name": "draw_face",
+          "type": "BOOLEAN",
+          "widget": {
+            "name": "draw_face"
+          },
+          "link": null
+        },
+        {
+          "name": "draw_feet",
+          "type": "BOOLEAN",
+          "widget": {
+            "name": "draw_feet"
+          },
+          "link": null
+        },
+        {
+          "name": "stick_width",
+          "type": "INT",
+          "widget": {
+            "name": "stick_width"
+          },
+          "link": null
+        },
+        {
+          "name": "face_point_size",
+          "type": "INT",
+          "widget": {
+            "name": "face_point_size"
+          },
+          "link": null
+        },
+        {
+          "name": "score_threshold",
+          "type": "FLOAT",
+          "widget": {
+            "name": "score_threshold"
+          },
+          "link": null
+        },
+        {
+          "name": "ckpt_name",
+          "type": "COMBO",
+          "widget": {
+            "name": "ckpt_name"
+          },
+          "link": null
+        },
+        {
+          "name": "bboxes",
+          "shape": 7,
+          "type": "BOUNDING_BOX",
+          "link": null
+        }
+      ],
+      "outputs": [
+        {
+          "localized_name": "IMAGE",
+          "name": "IMAGE",
+          "type": "IMAGE",
+          "links": []
+        },
+        {
+          "name": "keypoints",
+          "type": "POSE_KEYPOINT",
+          "links": null
+        }
+      ],
+      "properties": {
+        "proxyWidgets": [
+          [
+            "674",
+            "resize_type.longer_size"
+          ],
+          [
+            "674",
+            "scale_method"
+          ],
+          [
+            "672",
+            "draw_body"
+          ],
+          [
+            "672",
+            "draw_hands"
+          ],
+          [
+            "672",
+            "draw_face"
+          ],
+          [
+            "672",
+            "draw_feet"
+          ],
+          [
+            "672",
+            "stick_width"
+          ],
+          [
+            "672",
+            "face_point_size"
+          ],
+          [
+            "672",
+            "score_threshold"
+          ],
+          [
+            "673",
+            "ckpt_name"
+          ]
+        ],
+        "cnr_id": "comfy-core",
+        "ver": "0.15.1",
+        "ue_properties": {
+          "widget_ue_connectable": {},
+          "version": "7.7",
+          "input_ue_unconnectable": {}
+        }
+      },
+      "widgets_values": [],
+      "title": "Image to Pose Map (SDPose-OOD)"
+    }
+  ],
+  "links": [],
+  "version": 0.4,
+  "definitions": {
+    "subgraphs": [
+      {
+        "id": "01b6a731-fb78-4070-9a38-c87146da9604",
+        "version": 1,
+        "state": {
+          "lastGroupId": 0,
+          "lastNodeId": 676,
+          "lastLinkId": 1715,
+          "lastRerouteId": 0
+        },
+        "revision": 0,
+        "config": {},
+        "name": "Image to Pose Map (SDPose-OOD)",
+        "inputNode": {
+          "id": -10,
+          "bounding": [
+            -3290,
+            3590,
+            190.8984375,
+            288
+          ]
+        },
+        "outputNode": {
+          "id": -20,
+          "bounding": [
+            -1756.2451602089645,
+            3366,
+            128,
+            88
+          ]
+        },
+        "inputs": [
+          {
+            "id": "e24699c3-1356-4634-9eb4-19bb58e5c0b0",
+            "name": "input",
+            "type": "IMAGE,MASK",
+            "linkIds": [
+              1700
+            ],
+            "localized_name": "input",
+            "pos": [
+              -3123.1015625,
+              3614
+            ]
+          },
+          {
+            "id": "088eefc1-cd8a-4573-993f-9e4da008a12d",
+            "name": "resize_type.longer_size",
+            "type": "INT",
+            "linkIds": [
+              1704
+            ],
+            "label": "resize_target_longer_size",
+            "pos": [
+              -3123.1015625,
+              3634
+            ]
+          },
+          {
+            "id": "b6449bd3-73d4-41c8-b81f-cf8d33f76a2e",
+            "name": "scale_method",
+            "type": "COMBO",
+            "linkIds": [
+              1705
+            ],
+            "pos": [
+              -3123.1015625,
+              3654
+            ]
+          },
+          {
+            "id": "4cff52ad-ed07-4c97-8803-fcbd89554fd0",
+            "name": "draw_body",
+            "type": "BOOLEAN",
+            "linkIds": [
+              1706
+            ],
+            "pos": [
+              -3123.1015625,
+              3674
+            ]
+          },
+          {
+            "id": "7af63dce-f7df-4d7e-8215-d7c7f60bf81c",
+            "name": "draw_hands",
+            "type": "BOOLEAN",
+            "linkIds": [
+              1707
+            ],
+            "pos": [
+              -3123.1015625,
+              3694
+            ]
+          },
+          {
+            "id": "af3a9bce-61f9-4aca-b530-9f65e028b35e",
+            "name": "draw_face",
+            "type": "BOOLEAN",
+            "linkIds": [
+              1708
+            ],
+            "pos": [
+              -3123.1015625,
+              3714
+            ]
+          },
+          {
+            "id": "4620f6a3-2c85-4b79-ad8f-35d0326b568f",
+            "name": "draw_feet",
+            "type": "BOOLEAN",
+            "linkIds": [
+              1709
+            ],
+            "pos": [
+              -3123.1015625,
+              3734
+            ]
+          },
+          {
+            "id": "fee5d0c9-8d4b-4934-81d8-ba2206dc56cb",
+            "name": "stick_width",
+            "type": "INT",
+            "linkIds": [
+              1710
+            ],
+            "pos": [
+              -3123.1015625,
+              3754
+            ]
+          },
+          {
+            "id": "aafdd060-ba81-4324-a9cc-b656e1ebc133",
+            "name": "face_point_size",
+            "type": "INT",
+            "linkIds": [
+              1711
+            ],
+            "pos": [
+              -3123.1015625,
+              3774
+            ]
+          },
+          {
+            "id": "514c5503-f9e6-4d23-b1ae-1d3291acb2a3",
+            "name": "score_threshold",
+            "type": "FLOAT",
+            "linkIds": [
+              1712
+            ],
+            "pos": [
+              -3123.1015625,
+              3794
+            ]
+          },
+          {
+            "id": "ae46de61-2cc6-483e-8ee9-87e4144a2ffa",
+            "name": "ckpt_name",
+            "type": "COMBO",
+            "linkIds": [
+              1713
+            ],
+            "pos": [
+              -3123.1015625,
+              3814
+            ]
+          },
+          {
+            "id": "41bec0c6-dffa-4c78-9289-ee678715ae54",
+            "name": "bboxes",
+            "type": "BOUNDING_BOX",
+            "linkIds": [
+              1714
+            ],
+            "pos": [
+              -3123.1015625,
+              3834
+            ]
+          }
+        ],
+        "outputs": [
+          {
+            "id": "f05ed8cc-9403-4f14-8085-4364b06f8a48",
+            "name": "IMAGE",
+            "type": "IMAGE",
+            "linkIds": [
+              1701
+            ],
+            "localized_name": "IMAGE",
+            "pos": [
+              -1732.2451602089645,
+              3390
+            ]
+          },
+          {
+            "id": "29a6584e-4685-4986-8ffd-e6d8539953fd",
+            "name": "keypoints",
+            "type": "POSE_KEYPOINT",
+            "linkIds": [
+              1715
+            ],
+            "pos": [
+              -1732.2451602089645,
+              3410
+            ]
+          }
+        ],
+        "widgets": [],
+        "nodes": [
+          {
+            "id": 671,
+            "type": "SDPoseKeypointExtractor",
+            "pos": [
+              -2470,
+              3250
+            ],
+            "size": [
+              270,
+              180
+            ],
+            "flags": {},
+            "order": 0,
+            "mode": 0,
+            "inputs": [
+              {
+                "localized_name": "model",
+                "name": "model",
+                "type": "MODEL",
+                "link": 1696
+              },
+              {
+                "localized_name": "vae",
+                "name": "vae",
+                "type": "VAE",
+                "link": 1697
+              },
+              {
+                "localized_name": "image",
+                "name": "image",
+                "type": "IMAGE",
+                "link": 1698
+              },
+              {
+                "localized_name": "bboxes",
+                "name": "bboxes",
+                "shape": 7,
+                "type": "BOUNDING_BOX",
+                "link": 1714
+              },
+              {
+                "localized_name": "batch_size",
+                "name": "batch_size",
+                "type": "INT",
+                "widget": {
+                  "name": "batch_size"
+                },
+                "link": null
+              }
+            ],
+            "outputs": [
+              {
+                "localized_name": "keypoints",
+                "name": "keypoints",
+                "type": "POSE_KEYPOINT",
+                "links": [
+                  1699,
+                  1715
+                ]
+              }
+            ],
+            "properties": {
+              "Node name for S&R": "SDPoseKeypointExtractor",
+              "cnr_id": "comfy-core",
+              "ver": "0.15.0",
+              "ue_properties": {
+                "widget_ue_connectable": {},
+                "version": "7.7",
+                "input_ue_unconnectable": {}
+              }
+            },
+            "widgets_values": [
+              16
+            ]
+          },
+          {
+            "id": 674,
+            "type": "ResizeImageMaskNode",
+            "pos": [
+              -2960,
+              3490
+            ],
+            "size": [
+              270,
+              110
+            ],
+            "flags": {},
+            "order": 3,
+            "mode": 0,
+            "inputs": [
+              {
+                "localized_name": "input",
+                "name": "input",
+                "type": "IMAGE,MASK",
+                "link": 1700
+              },
+              {
+                "localized_name": "resize_type",
+                "name": "resize_type",
+                "type": "COMFY_DYNAMICCOMBO_V3",
+                "widget": {
+                  "name": "resize_type"
+                },
+                "link": null
+              },
+              {
+                "localized_name": "resize_type.longer_size",
+                "name": "resize_type.longer_size",
+                "type": "INT",
+                "widget": {
+                  "name": "resize_type.longer_size"
+                },
+                "link": 1704
+              },
+              {
+                "localized_name": "scale_method",
+                "name": "scale_method",
+                "type": "COMBO",
+                "widget": {
+                  "name": "scale_method"
+                },
+                "link": 1705
+              }
+            ],
+            "outputs": [
+              {
+                "localized_name": "resized",
+                "name": "resized",
+                "type": "*",
+                "links": [
+                  1698
+                ]
+              }
+            ],
+            "properties": {
+              "Node name for S&R": "ResizeImageMaskNode",
+              "cnr_id": "comfy-core",
+              "ver": "0.15.0",
+              "ue_properties": {
+                "widget_ue_connectable": {},
+                "version": "7.7",
+                "input_ue_unconnectable": {}
+              }
+            },
+            "widgets_values": [
+              "scale longer dimension",
+              1024,
+              "area"
+            ]
+          },
+          {
+            "id": 672,
+            "type": "SDPoseDrawKeypoints",
+            "pos": [
+              -2120,
+              3260
+            ],
+            "size": [
+              270,
+              280
+            ],
+            "flags": {},
+            "order": 1,
+            "mode": 0,
+            "inputs": [
+              {
+                "localized_name": "keypoints",
+                "name": "keypoints",
+                "type": "POSE_KEYPOINT",
+                "link": 1699
+              },
+              {
+                "localized_name": "draw_body",
+                "name": "draw_body",
+                "type": "BOOLEAN",
+                "widget": {
+                  "name": "draw_body"
+                },
+                "link": 1706
+              },
+              {
+                "localized_name": "draw_hands",
+                "name": "draw_hands",
+                "type": "BOOLEAN",
+                "widget": {
+                  "name": "draw_hands"
+                },
+                "link": 1707
+              },
+              {
+                "localized_name": "draw_face",
+                "name": "draw_face",
+                "type": "BOOLEAN",
+                "widget": {
+                  "name": "draw_face"
+                },
+                "link": 1708
+              },
+              {
+                "localized_name": "draw_feet",
+                "name": "draw_feet",
+                "type": "BOOLEAN",
+                "widget": {
+                  "name": "draw_feet"
+                },
+                "link": 1709
+              },
+              {
+                "localized_name": "stick_width",
+                "name": "stick_width",
+                "type": "INT",
+                "widget": {
+                  "name": "stick_width"
+                },
+                "link": 1710
+              },
+              {
+                "localized_name": "face_point_size",
+                "name": "face_point_size",
+                "type": "INT",
+                "widget": {
+                  "name": "face_point_size"
+                },
+                "link": 1711
+              },
+              {
+                "localized_name": "score_threshold",
+                "name": "score_threshold",
+                "type": "FLOAT",
+                "widget": {
+                  "name": "score_threshold"
+                },
+                "link": 1712
+              }
+            ],
+            "outputs": [
+              {
+                "localized_name": "IMAGE",
+                "name": "IMAGE",
+                "type": "IMAGE",
+                "links": [
+                  1701
+                ]
+              }
+            ],
+            "properties": {
+              "Node name for S&R": "SDPoseDrawKeypoints",
+              "cnr_id": "comfy-core",
+              "ver": "0.15.0",
+              "ue_properties": {
+                "widget_ue_connectable": {},
+                "version": "7.7",
+                "input_ue_unconnectable": {}
+              }
+            },
+            "widgets_values": [
+              true,
+              true,
+              true,
+              true,
+              4,
+              2,
+              0.5
+            ]
+          },
+          {
+            "id": 673,
+            "type": "CheckpointLoaderSimple",
+            "pos": [
+              -2960,
+              3250
+            ],
+            "size": [
+              390,
+              190
+            ],
+            "flags": {},
+            "order": 2,
+            "mode": 0,
+            "inputs": [
+              {
+                "localized_name": "ckpt_name",
+                "name": "ckpt_name",
+                "type": "COMBO",
+                "widget": {
+                  "name": "ckpt_name"
+                },
+                "link": 1713
+              }
+            ],
+            "outputs": [
+              {
+                "localized_name": "MODEL",
+                "name": "MODEL",
+                "type": "MODEL",
+                "links": [
+                  1696
+                ]
+              },
+              {
+                "localized_name": "CLIP",
+                "name": "CLIP",
+                "type": "CLIP",
+                "links": []
+              },
+              {
+                "localized_name": "VAE",
+                "name": "VAE",
+                "type": "VAE",
+                "links": [
+                  1697
+                ]
+              }
+            ],
+            "properties": {
+              "Node name for S&R": "CheckpointLoaderSimple",
+              "cnr_id": "comfy-core",
+              "ver": "0.15.0",
+              "models": [
+                {
+                  "name": "sdpose_wholebody_fp16.safetensors",
+                  "url": "https://huggingface.co/Comfy-Org/SDPose/resolve/main/checkpoints/sdpose_wholebody_fp16.safetensors",
+                  "directory": "checkpoints"
+                }
+              ],
+              "ue_properties": {
+                "widget_ue_connectable": {},
+                "version": "7.7",
+                "input_ue_unconnectable": {}
+              }
+            },
+            "widgets_values": [
+              "sdpose_wholebody_fp16.safetensors"
+            ]
+          }
+        ],
+        "groups": [],
+        "links": [
+          {
+            "id": 1696,
+            "origin_id": 673,
+            "origin_slot": 0,
+            "target_id": 671,
+            "target_slot": 0,
+            "type": "MODEL"
+          },
+          {
+            "id": 1697,
+            "origin_id": 673,
+            "origin_slot": 2,
+            "target_id": 671,
+            "target_slot": 1,
+            "type": "VAE"
+          },
+          {
+            "id": 1698,
+            "origin_id": 674,
+            "origin_slot": 0,
+            "target_id": 671,
+            "target_slot": 2,
+            "type": "IMAGE"
+          },
+          {
+            "id": 1699,
+            "origin_id": 671,
+            "origin_slot": 0,
+            "target_id": 672,
+            "target_slot": 0,
+            "type": "POSE_KEYPOINT"
+          },
+          {
+            "id": 1700,
+            "origin_id": -10,
+            "origin_slot": 0,
+            "target_id": 674,
+            "target_slot": 0,
+            "type": "IMAGE,MASK"
+          },
+          {
+            "id": 1701,
+            "origin_id": 672,
+            "origin_slot": 0,
+            "target_id": -20,
+            "target_slot": 0,
+            "type": "IMAGE"
+          },
+          {
+            "id": 1704,
+            "origin_id": -10,
+            "origin_slot": 1,
+            "target_id": 674,
+            "target_slot": 2,
+            "type": "INT"
+          },
+          {
+            "id": 1705,
+            "origin_id": -10,
+            "origin_slot": 2,
+            "target_id": 674,
+            "target_slot": 3,
+            "type": "COMBO"
+          },
+          {
+            "id": 1706,
+            "origin_id": -10,
+            "origin_slot": 3,
+            "target_id": 672,
+            "target_slot": 1,
+            "type": "BOOLEAN"
+          },
+          {
+            "id": 1707,
+            "origin_id": -10,
+            "origin_slot": 4,
+            "target_id": 672,
+            "target_slot": 2,
+            "type": "BOOLEAN"
+          },
+          {
+            "id": 1708,
+            "origin_id": -10,
+            "origin_slot": 5,
+            "target_id": 672,
+            "target_slot": 3,
+            "type": "BOOLEAN"
+          },
+          {
+            "id": 1709,
+            "origin_id": -10,
+            "origin_slot": 6,
+            "target_id": 672,
+            "target_slot": 4,
+            "type": "BOOLEAN"
+          },
+          {
+            "id": 1710,
+            "origin_id": -10,
+            "origin_slot": 7,
+            "target_id": 672,
+            "target_slot": 5,
+            "type": "INT"
+          },
+          {
+            "id": 1711,
+            "origin_id": -10,
+            "origin_slot": 8,
+            "target_id": 672,
+            "target_slot": 6,
+            "type": "INT"
+          },
+          {
+            "id": 1712,
+            "origin_id": -10,
+            "origin_slot": 9,
+            "target_id": 672,
+            "target_slot": 7,
+            "type": "FLOAT"
+          },
+          {
+            "id": 1713,
+            "origin_id": -10,
+            "origin_slot": 10,
+            "target_id": 673,
+            "target_slot": 0,
+            "type": "COMBO"
+          },
+          {
+            "id": 1714,
+            "origin_id": -10,
+            "origin_slot": 11,
+            "target_id": 671,
+            "target_slot": 3,
+            "type": "BOUNDING_BOX"
+          },
+          {
+            "id": 1715,
+            "origin_id": 671,
+            "origin_slot": 0,
+            "target_id": -20,
+            "target_slot": 1,
+            "type": "POSE_KEYPOINT"
+          }
+        ],
+        "extra": {
+          "workflowRendererVersion": "LG"
+        },
+        "category": "Conditioning & Preprocessors/Pose",
+        "description": "Extracts human pose keypoints and stick-figure visuals from an image using SDPose-OOD, with optional bounding-box input per subject."
+      }
+    ]
+  },
+  "extra": {
+    "ue_links": []
+  }
+}
--- a/blueprints/Merge
+++ b/blueprints/Merge
--- a/(Z-Image-Turbo).json
+++ b/(Z-Image-Turbo).json
@ -1298,7 +1298,7 @@
          "VHS_MetadataImage": true,
          "VHS_KeepIntermediate": true
        },
-        "category": "Image generation and editing/Pose to image",
+        "category": "Image generation and editing/Conditioned",
        "description": "Generates an image from pose keypoints using Z-Image-Turbo with text conditioning."
      }
    ]
--- a/blueprints/Pose
+++ b/blueprints/Pose
@ -3870,7 +3870,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Video generation and editing/Pose to video",
+        "category": "Video generation and editing/Conditioned",
        "description": "Generates video from pose reference frames using LTX-2, with optional synchronized audio."
      }
    ]
--- a/blueprints/Prompt
+++ b/blueprints/Prompt
@ -270,7 +270,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Text generation/Prompt enhance",
+        "category": "Text Tools",
        "description": "Expands short text prompts into detailed descriptions using a text generation model for better generation quality."
      }
    ]
--- a/blueprints/Remove
+++ b/blueprints/Remove
@ -389,7 +389,7 @@
          }
        ],
        "extra": {},
-        "category": "Image generation and editing/Background Removal"
+        "category": "Image Tools/Background Removal"
      }
    ]
  },
--- a/blueprints/Select
+++ b/blueprints/Select
@ -0,0 +1,485 @@
+{
+  "revision": 0,
+  "last_node_id": 10,
+  "last_link_id": 0,
+  "nodes": [
+    {
+      "id": 10,
+      "type": "3fb7557a-470d-4983-9d8c-6d5caa9788f0",
+      "pos": [
+        -250,
+        8590
+      ],
+      "size": [
+        280,
+        360
+      ],
+      "flags": {},
+      "order": 0,
+      "mode": 0,
+      "inputs": [
+        {
+          "localized_name": "text_per_line",
+          "name": "text_per_line",
+          "type": "STRING",
+          "widget": {
+            "name": "text_per_line"
+          },
+          "link": null
+        },
+        {
+          "localized_name": "index",
+          "name": "index",
+          "type": "INT",
+          "widget": {
+            "name": "index"
+          },
+          "link": null
+        }
+      ],
+      "outputs": [
+        {
+          "localized_name": "selected_line",
+          "name": "selected_line",
+          "type": "STRING",
+          "links": []
+        }
+      ],
+      "properties": {
+        "proxyWidgets": [
+          [
+            "2",
+            "string"
+          ],
+          [
+            "3",
+            "value"
+          ]
+        ],
+        "cnr_id": "comfy-core",
+        "ver": "0.19.0",
+        "ue_properties": {
+          "widget_ue_connectable": {},
+          "input_ue_unconnectable": {}
+        }
+      },
+      "widgets_values": [],
+      "title": "Select Per-Line Text by Index"
+    }
+  ],
+  "links": [],
+  "version": 0.4,
+  "definitions": {
+    "subgraphs": [
+      {
+        "id": "3fb7557a-470d-4983-9d8c-6d5caa9788f0",
+        "version": 1,
+        "state": {
+          "lastGroupId": 0,
+          "lastNodeId": 10,
+          "lastLinkId": 14,
+          "lastRerouteId": 0
+        },
+        "revision": 0,
+        "config": {},
+        "name": "Select Per-Line Text by Index",
+        "inputNode": {
+          "id": -10,
+          "bounding": [
+            -990,
+            8595,
+            128,
+            88
+          ]
+        },
+        "outputNode": {
+          "id": -20,
+          "bounding": [
+            710,
+            8585,
+            128,
+            68
+          ]
+        },
+        "inputs": [
+          {
+            "id": "75417d82-a934-4ac9-b667-d8dcd5a3bfb3",
+            "name": "text_per_line",
+            "type": "STRING",
+            "linkIds": [
+              13
+            ],
+            "localized_name": "text_per_line",
+            "pos": [
+              -886,
+              8619
+            ]
+          },
+          {
+            "id": "46e69a73-1804-4ca6-9175-31445bf0be96",
+            "name": "index",
+            "type": "INT",
+            "linkIds": [
+              14
+            ],
+            "localized_name": "index",
+            "pos": [
+              -886,
+              8639
+            ]
+          }
+        ],
+        "outputs": [
+          {
+            "id": "e34e8ad1-84d2-4bd2-a460-eb7de6067c10",
+            "name": "selected_line",
+            "type": "STRING",
+            "linkIds": [
+              10
+            ],
+            "localized_name": "selected_line",
+            "pos": [
+              734,
+              8609
+            ]
+          }
+        ],
+        "widgets": [],
+        "nodes": [
+          {
+            "id": 1,
+            "type": "PreviewAny",
+            "pos": [
+              -500,
+              8400
+            ],
+            "size": [
+              230,
+              180
+            ],
+            "flags": {},
+            "order": 0,
+            "mode": 0,
+            "inputs": [
+              {
+                "localized_name": "source",
+                "name": "source",
+                "type": "*",
+                "link": 1
+              }
+            ],
+            "outputs": [
+              {
+                "localized_name": "STRING",
+                "name": "STRING",
+                "type": "STRING",
+                "links": [
+                  6
+                ]
+              }
+            ],
+            "properties": {
+              "Node name for S&R": "PreviewAny",
+              "cnr_id": "comfy-core",
+              "ver": "0.19.0",
+              "ue_properties": {
+                "widget_ue_connectable": {},
+                "input_ue_unconnectable": {}
+              }
+            },
+            "widgets_values": [
+              null,
+              null,
+              null
+            ]
+          },
+          {
+            "id": 2,
+            "type": "RegexExtract",
+            "pos": [
+              -240,
+              8740
+            ],
+            "size": [
+              470,
+              460
+            ],
+            "flags": {},
+            "order": 1,
+            "mode": 0,
+            "showAdvanced": false,
+            "inputs": [
+              {
+                "localized_name": "string",
+                "name": "string",
+                "type": "STRING",
+                "widget": {
+                  "name": "string"
+                },
+                "link": 13
+              },
+              {
+                "localized_name": "regex_pattern",
+                "name": "regex_pattern",
+                "type": "STRING",
+                "widget": {
+                  "name": "regex_pattern"
+                },
+                "link": 9
+              },
+              {
+                "localized_name": "mode",
+                "name": "mode",
+                "type": "COMBO",
+                "widget": {
+                  "name": "mode"
+                },
+                "link": null
+              },
+              {
+                "localized_name": "case_insensitive",
+                "name": "case_insensitive",
+                "type": "BOOLEAN",
+                "widget": {
+                  "name": "case_insensitive"
+                },
+                "link": null
+              },
+              {
+                "localized_name": "multiline",
+                "name": "multiline",
+                "type": "BOOLEAN",
+                "widget": {
+                  "name": "multiline"
+                },
+                "link": null
+              },
+              {
+                "localized_name": "dotall",
+                "name": "dotall",
+                "type": "BOOLEAN",
+                "widget": {
+                  "name": "dotall"
+                },
+                "link": null
+              },
+              {
+                "localized_name": "group_index",
+                "name": "group_index",
+                "type": "INT",
+                "widget": {
+                  "name": "group_index"
+                },
+                "link": null
+              }
+            ],
+            "outputs": [
+              {
+                "localized_name": "STRING",
+                "name": "STRING",
+                "type": "STRING",
+                "links": [
+                  10
+                ]
+              }
+            ],
+            "properties": {
+              "Node name for S&R": "RegexExtract",
+              "cnr_id": "comfy-core",
+              "ver": "0.19.0",
+              "ue_properties": {
+                "widget_ue_connectable": {},
+                "input_ue_unconnectable": {}
+              }
+            },
+            "widgets_values": [
+              "",
+              "",
+              "First Group",
+              false,
+              false,
+              false,
+              1
+            ]
+          },
+          {
+            "id": 3,
+            "type": "PrimitiveInt",
+            "pos": [
+              -810,
+              8400
+            ],
+            "size": [
+              270,
+              110
+            ],
+            "flags": {},
+            "order": 2,
+            "mode": 0,
+            "inputs": [
+              {
+                "localized_name": "value",
+                "name": "value",
+                "type": "INT",
+                "widget": {
+                  "name": "value"
+                },
+                "link": 14
+              }
+            ],
+            "outputs": [
+              {
+                "localized_name": "INT",
+                "name": "INT",
+                "type": "INT",
+                "links": [
+                  1
+                ]
+              }
+            ],
+            "title": "Int (line index)",
+            "properties": {
+              "Node name for S&R": "Int (line index)",
+              "cnr_id": "comfy-core",
+              "ver": "0.19.0",
+              "ue_properties": {
+                "widget_ue_connectable": {},
+                "input_ue_unconnectable": {}
+              }
+            },
+            "widgets_values": [
+              0,
+              "fixed"
+            ]
+          },
+          {
+            "id": 8,
+            "type": "StringReplace",
+            "pos": [
+              -240,
+              8400
+            ],
+            "size": [
+              400,
+              280
+            ],
+            "flags": {},
+            "order": 3,
+            "mode": 0,
+            "inputs": [
+              {
+                "localized_name": "string",
+                "name": "string",
+                "type": "STRING",
+                "widget": {
+                  "name": "string"
+                },
+                "link": null
+              },
+              {
+                "localized_name": "find",
+                "name": "find",
+                "type": "STRING",
+                "widget": {
+                  "name": "find"
+                },
+                "link": null
+              },
+              {
+                "localized_name": "replace",
+                "name": "replace",
+                "type": "STRING",
+                "widget": {
+                  "name": "replace"
+                },
+                "link": 6
+              }
+            ],
+            "outputs": [
+              {
+                "localized_name": "STRING",
+                "name": "STRING",
+                "type": "STRING",
+                "links": [
+                  9
+                ]
+              }
+            ],
+            "properties": {
+              "Node name for S&R": "StringReplace",
+              "cnr_id": "comfy-core",
+              "ver": "0.19.0",
+              "ue_properties": {
+                "widget_ue_connectable": {},
+                "input_ue_unconnectable": {}
+              }
+            },
+            "widgets_values": [
+              "^(?:[^\\n]*\\n){index}([^\\n]*)(?:\\n|$)",
+              "index",
+              ""
+            ]
+          }
+        ],
+        "groups": [],
+        "links": [
+          {
+            "id": 1,
+            "origin_id": 3,
+            "origin_slot": 0,
+            "target_id": 1,
+            "target_slot": 0,
+            "type": "INT"
+          },
+          {
+            "id": 9,
+            "origin_id": 8,
+            "origin_slot": 0,
+            "target_id": 2,
+            "target_slot": 1,
+            "type": "STRING"
+          },
+          {
+            "id": 6,
+            "origin_id": 1,
+            "origin_slot": 0,
+            "target_id": 8,
+            "target_slot": 2,
+            "type": "STRING"
+          },
+          {
+            "id": 10,
+            "origin_id": 2,
+            "origin_slot": 0,
+            "target_id": -20,
+            "target_slot": 0,
+            "type": "STRING"
+          },
+          {
+            "id": 13,
+            "origin_id": -10,
+            "origin_slot": 0,
+            "target_id": 2,
+            "target_slot": 0,
+            "type": "STRING"
+          },
+          {
+            "id": 14,
+            "origin_id": -10,
+            "origin_slot": 1,
+            "target_id": 3,
+            "target_slot": 0,
+            "type": "INT"
+          }
+        ],
+        "extra": {},
+        "category": "Text Tools",
+        "description": "Selects one line from multiline text by zero-based index for batch or list-driven prompt workflows."
+      }
+    ]
+  },
+  "extra": {
+    "ue_links": [],
+    "links_added_by_ue": []
+  }
+}
--- a/blueprints/Split
+++ b/blueprints/Split
@ -0,0 +1,714 @@
+{
+  "revision": 0,
+  "last_node_id": 251,
+  "last_link_id": 0,
+  "nodes": [
+    {
+      "id": 251,
+      "type": "609e1fd1-b731-4b78-89ac-d19b1156b025",
+      "pos": [
+        -1490,
+        130
+      ],
+      "size": [
+        230,
+        164
+      ],
+      "flags": {},
+      "order": 1,
+      "mode": 0,
+      "inputs": [
+        {
+          "localized_name": "source_image",
+          "name": "source_image",
+          "type": "IMAGE",
+          "link": null
+        },
+        {
+          "localized_name": "columns",
+          "name": "columns",
+          "type": "INT",
+          "widget": {
+            "name": "columns"
+          },
+          "link": null
+        },
+        {
+          "localized_name": "rows",
+          "name": "rows",
+          "type": "INT",
+          "widget": {
+            "name": "rows"
+          },
+          "link": null
+        }
+      ],
+      "outputs": [
+        {
+          "localized_name": "tiles",
+          "name": "tiles",
+          "type": "IMAGE",
+          "links": []
+        }
+      ],
+      "properties": {
+        "proxyWidgets": [
+          [
+            "228",
+            "value"
+          ],
+          [
+            "252",
+            "value"
+          ]
+        ],
+        "cnr_id": "comfy-core",
+        "ver": "0.20.1",
+        "enableTabs": false,
+        "tabWidth": 65,
+        "tabXOffset": 10,
+        "hasSecondTab": false,
+        "secondTabText": "Send Back",
+        "secondTabOffset": 80,
+        "secondTabWidth": 65
+      },
+      "widgets_values": [],
+      "title": "Split Image Grid to Tiles"
+    }
+  ],
+  "links": [],
+  "version": 0.4,
+  "definitions": {
+    "subgraphs": [
+      {
+        "id": "609e1fd1-b731-4b78-89ac-d19b1156b025",
+        "version": 1,
+        "state": {
+          "lastGroupId": 9,
+          "lastNodeId": 252,
+          "lastLinkId": 429,
+          "lastRerouteId": 0
+        },
+        "revision": 0,
+        "config": {},
+        "name": "Split Image Grid to Tiles",
+        "inputNode": {
+          "id": -10,
+          "bounding": [
+            -1690,
+            260,
+            128,
+            108
+          ]
+        },
+        "outputNode": {
+          "id": -20,
+          "bounding": [
+            -510,
+            590,
+            128,
+            68
+          ]
+        },
+        "inputs": [
+          {
+            "id": "866ac798-cfbc-450a-b755-e704f86404d9",
+            "name": "source_image",
+            "type": "IMAGE",
+            "linkIds": [
+              386,
+              389
+            ],
+            "localized_name": "source_image",
+            "pos": [
+              -1586,
+              284
+            ]
+          },
+          {
+            "id": "bc37b1f8-8ab2-4f19-bd00-75d4fbc4feb3",
+            "name": "columns",
+            "type": "INT",
+            "linkIds": [
+              427
+            ],
+            "localized_name": "columns",
+            "pos": [
+              -1586,
+              304
+            ]
+          },
+          {
+            "id": "d45915da-e848-43dd-9ccc-e3161e9c99d9",
+            "name": "rows",
+            "type": "INT",
+            "linkIds": [
+              428
+            ],
+            "localized_name": "rows",
+            "pos": [
+              -1586,
+              324
+            ]
+          }
+        ],
+        "outputs": [
+          {
+            "id": "18bc780f-064b-4038-87c6-67dba71deb08",
+            "name": "tiles",
+            "type": "IMAGE",
+            "linkIds": [
+              394
+            ],
+            "localized_name": "tiles",
+            "shape": 6,
+            "pos": [
+              -486,
+              614
+            ]
+          }
+        ],
+        "widgets": [],
+        "nodes": [
+          {
+            "id": 225,
+            "type": "SplitImageToTileList",
+            "pos": [
+              -1010,
+              620
+            ],
+            "size": [
+              290,
+              170
+            ],
+            "flags": {},
+            "order": 0,
+            "mode": 0,
+            "inputs": [
+              {
+                "localized_name": "image",
+                "name": "image",
+                "type": "IMAGE",
+                "link": 386
+              },
+              {
+                "localized_name": "tile_width",
+                "name": "tile_width",
+                "type": "INT",
+                "widget": {
+                  "name": "tile_width"
+                },
+                "link": 403
+              },
+              {
+                "localized_name": "tile_height",
+                "name": "tile_height",
+                "type": "INT",
+                "widget": {
+                  "name": "tile_height"
+                },
+                "link": 404
+              },
+              {
+                "localized_name": "overlap",
+                "name": "overlap",
+                "type": "INT",
+                "widget": {
+                  "name": "overlap"
+                },
+                "link": null
+              }
+            ],
+            "outputs": [
+              {
+                "localized_name": "IMAGE",
+                "name": "IMAGE",
+                "shape": 6,
+                "type": "IMAGE",
+                "links": [
+                  394
+                ]
+              }
+            ],
+            "properties": {
+              "Node name for S&R": "SplitImageToTileList",
+              "cnr_id": "comfy-core",
+              "ver": "0.20.1",
+              "enableTabs": false,
+              "tabWidth": 65,
+              "tabXOffset": 10,
+              "hasSecondTab": false,
+              "secondTabText": "Send Back",
+              "secondTabOffset": 80,
+              "secondTabWidth": 65
+            },
+            "widgets_values": [
+              1024,
+              1024,
+              0
+            ]
+          },
+          {
+            "id": 231,
+            "type": "ComfyMathExpression",
+            "pos": [
+              -1080,
+              330
+            ],
+            "size": [
+              370,
+              190
+            ],
+            "flags": {},
+            "order": 4,
+            "mode": 0,
+            "inputs": [
+              {
+                "label": "a",
+                "localized_name": "values.a",
+                "name": "values.a",
+                "type": "FLOAT,INT,BOOLEAN",
+                "link": 390
+              },
+              {
+                "label": "b",
+                "localized_name": "values.b",
+                "name": "values.b",
+                "shape": 7,
+                "type": "FLOAT,INT,BOOLEAN",
+                "link": 429
+              },
+              {
+                "label": "c",
+                "localized_name": "values.c",
+                "name": "values.c",
+                "shape": 7,
+                "type": "FLOAT,INT,BOOLEAN",
+                "link": null
+              },
+              {
+                "localized_name": "expression",
+                "name": "expression",
+                "type": "STRING",
+                "widget": {
+                  "name": "expression"
+                },
+                "link": null
+              }
+            ],
+            "outputs": [
+              {
+                "localized_name": "FLOAT",
+                "name": "FLOAT",
+                "type": "FLOAT",
+                "links": null
+              },
+              {
+                "localized_name": "INT",
+                "name": "INT",
+                "type": "INT",
+                "links": [
+                  404
+                ]
+              },
+              {
+                "localized_name": "BOOL",
+                "name": "BOOL",
+                "type": "BOOLEAN",
+                "links": null
+              }
+            ],
+            "title": "Math Expression （Height）",
+            "properties": {
+              "Node name for S&R": "ComfyMathExpression",
+              "cnr_id": "comfy-core",
+              "ver": "0.18.1",
+              "enableTabs": false,
+              "tabWidth": 65,
+              "tabXOffset": 10,
+              "hasSecondTab": false,
+              "secondTabText": "Send Back",
+              "secondTabOffset": 80,
+              "secondTabWidth": 65,
+              "ue_properties": {
+                "widget_ue_connectable": {},
+                "version": "7.7",
+                "input_ue_unconnectable": {}
+              }
+            },
+            "widgets_values": [
+              "max(1, (int(a) + int(b) - 1) // int(b))"
+            ]
+          },
+          {
+            "id": 229,
+            "type": "ComfyMathExpression",
+            "pos": [
+              -1090,
+              -30
+            ],
+            "size": [
+              370,
+              190
+            ],
+            "flags": {},
+            "order": 2,
+            "mode": 0,
+            "inputs": [
+              {
+                "label": "a",
+                "localized_name": "values.a",
+                "name": "values.a",
+                "type": "FLOAT,INT,BOOLEAN",
+                "link": 387
+              },
+              {
+                "label": "b",
+                "localized_name": "values.b",
+                "name": "values.b",
+                "shape": 7,
+                "type": "FLOAT,INT,BOOLEAN",
+                "link": 388
+              },
+              {
+                "label": "c",
+                "localized_name": "values.c",
+                "name": "values.c",
+                "shape": 7,
+                "type": "FLOAT,INT,BOOLEAN",
+                "link": null
+              },
+              {
+                "localized_name": "expression",
+                "name": "expression",
+                "type": "STRING",
+                "widget": {
+                  "name": "expression"
+                },
+                "link": null
+              }
+            ],
+            "outputs": [
+              {
+                "localized_name": "FLOAT",
+                "name": "FLOAT",
+                "type": "FLOAT",
+                "links": null
+              },
+              {
+                "localized_name": "INT",
+                "name": "INT",
+                "type": "INT",
+                "links": [
+                  403
+                ]
+              },
+              {
+                "localized_name": "BOOL",
+                "name": "BOOL",
+                "type": "BOOLEAN",
+                "links": null
+              }
+            ],
+            "title": "Math Expression （Width）",
+            "properties": {
+              "Node name for S&R": "ComfyMathExpression",
+              "cnr_id": "comfy-core",
+              "ver": "0.18.1",
+              "enableTabs": false,
+              "tabWidth": 65,
+              "tabXOffset": 10,
+              "hasSecondTab": false,
+              "secondTabText": "Send Back",
+              "secondTabOffset": 80,
+              "secondTabWidth": 65,
+              "ue_properties": {
+                "widget_ue_connectable": {},
+                "version": "7.7",
+                "input_ue_unconnectable": {}
+              }
+            },
+            "widgets_values": [
+              "max(1, (int(a) + int(b) - 1) // int(b))"
+            ]
+          },
+          {
+            "id": 228,
+            "type": "PrimitiveInt",
+            "pos": [
+              -1380,
+              90
+            ],
+            "size": [
+              230,
+              110
+            ],
+            "flags": {},
+            "order": 1,
+            "mode": 0,
+            "inputs": [
+              {
+                "localized_name": "value",
+                "name": "value",
+                "type": "INT",
+                "widget": {
+                  "name": "value"
+                },
+                "link": 427
+              }
+            ],
+            "outputs": [
+              {
+                "localized_name": "INT",
+                "name": "INT",
+                "type": "INT",
+                "links": [
+                  388
+                ]
+              }
+            ],
+            "title": "Int (grid columns)",
+            "properties": {
+              "Node name for S&R": "Int (grid columns)",
+              "cnr_id": "comfy-core",
+              "ver": "0.18.1",
+              "enableTabs": false,
+              "tabWidth": 65,
+              "tabXOffset": 10,
+              "hasSecondTab": false,
+              "secondTabText": "Send Back",
+              "secondTabOffset": 80,
+              "secondTabWidth": 65,
+              "ue_properties": {
+                "widget_ue_connectable": {},
+                "version": "7.7",
+                "input_ue_unconnectable": {}
+              }
+            },
+            "widgets_values": [
+              2,
+              "fixed"
+            ]
+          },
+          {
+            "id": 230,
+            "type": "GetImageSize",
+            "pos": [
+              -1380,
+              290
+            ],
+            "size": [
+              230,
+              100
+            ],
+            "flags": {},
+            "order": 3,
+            "mode": 0,
+            "inputs": [
+              {
+                "localized_name": "image",
+                "name": "image",
+                "type": "IMAGE",
+                "link": 389
+              }
+            ],
+            "outputs": [
+              {
+                "localized_name": "width",
+                "name": "width",
+                "type": "INT",
+                "links": [
+                  387
+                ]
+              },
+              {
+                "localized_name": "height",
+                "name": "height",
+                "type": "INT",
+                "links": [
+                  390
+                ]
+              },
+              {
+                "localized_name": "batch_size",
+                "name": "batch_size",
+                "type": "INT",
+                "links": null
+              }
+            ],
+            "properties": {
+              "Node name for S&R": "GetImageSize",
+              "cnr_id": "comfy-core",
+              "ver": "0.18.1",
+              "enableTabs": false,
+              "tabWidth": 65,
+              "tabXOffset": 10,
+              "hasSecondTab": false,
+              "secondTabText": "Send Back",
+              "secondTabOffset": 80,
+              "secondTabWidth": 65,
+              "ue_properties": {
+                "widget_ue_connectable": {},
+                "version": "7.7",
+                "input_ue_unconnectable": {}
+              }
+            }
+          },
+          {
+            "id": 252,
+            "type": "PrimitiveInt",
+            "pos": [
+              -1380,
+              470
+            ],
+            "size": [
+              230,
+              110
+            ],
+            "flags": {},
+            "order": 5,
+            "mode": 0,
+            "inputs": [
+              {
+                "localized_name": "value",
+                "name": "value",
+                "type": "INT",
+                "widget": {
+                  "name": "value"
+                },
+                "link": 428
+              }
+            ],
+            "outputs": [
+              {
+                "localized_name": "INT",
+                "name": "INT",
+                "type": "INT",
+                "links": [
+                  429
+                ]
+              }
+            ],
+            "title": "Int (grid rows)",
+            "properties": {
+              "Node name for S&R": "Int (grid rows)",
+              "cnr_id": "comfy-core",
+              "ver": "0.18.1",
+              "enableTabs": false,
+              "tabWidth": 65,
+              "tabXOffset": 10,
+              "hasSecondTab": false,
+              "secondTabText": "Send Back",
+              "secondTabOffset": 80,
+              "secondTabWidth": 65,
+              "ue_properties": {
+                "widget_ue_connectable": {},
+                "version": "7.7",
+                "input_ue_unconnectable": {}
+              }
+            },
+            "widgets_values": [
+              3,
+              "fixed"
+            ]
+          }
+        ],
+        "groups": [],
+        "links": [
+          {
+            "id": 403,
+            "origin_id": 229,
+            "origin_slot": 1,
+            "target_id": 225,
+            "target_slot": 1,
+            "type": "INT"
+          },
+          {
+            "id": 404,
+            "origin_id": 231,
+            "origin_slot": 1,
+            "target_id": 225,
+            "target_slot": 2,
+            "type": "INT"
+          },
+          {
+            "id": 390,
+            "origin_id": 230,
+            "origin_slot": 1,
+            "target_id": 231,
+            "target_slot": 0,
+            "type": "INT"
+          },
+          {
+            "id": 387,
+            "origin_id": 230,
+            "origin_slot": 0,
+            "target_id": 229,
+            "target_slot": 0,
+            "type": "INT"
+          },
+          {
+            "id": 388,
+            "origin_id": 228,
+            "origin_slot": 0,
+            "target_id": 229,
+            "target_slot": 1,
+            "type": "INT"
+          },
+          {
+            "id": 386,
+            "origin_id": -10,
+            "origin_slot": 0,
+            "target_id": 225,
+            "target_slot": 0,
+            "type": "IMAGE"
+          },
+          {
+            "id": 389,
+            "origin_id": -10,
+            "origin_slot": 0,
+            "target_id": 230,
+            "target_slot": 0,
+            "type": "IMAGE"
+          },
+          {
+            "id": 394,
+            "origin_id": 225,
+            "origin_slot": 0,
+            "target_id": -20,
+            "target_slot": 0,
+            "type": "IMAGE"
+          },
+          {
+            "id": 427,
+            "origin_id": -10,
+            "origin_slot": 1,
+            "target_id": 228,
+            "target_slot": 0,
+            "type": "INT"
+          },
+          {
+            "id": 428,
+            "origin_id": -10,
+            "origin_slot": 2,
+            "target_id": 252,
+            "target_slot": 0,
+            "type": "INT"
+          },
+          {
+            "id": 429,
+            "origin_id": 252,
+            "origin_slot": 0,
+            "target_id": 231,
+            "target_slot": 1,
+            "type": "INT"
+          }
+        ],
+        "extra": {},
+        "category": "Image Tools/Crop",
+        "description": "Splits an image into a configurable columns×rows grid of equal tiles for tiled generation or processing."
+      }
+    ]
+  },
+  "extra": {}
+}
--- a/blueprints/Text
+++ b/blueprints/Text
--- a/blueprints/Video
+++ b/blueprints/Video
@ -307,9 +307,9 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Text generation/Video Captioning",
+        "category": "Video Tools",
        "description": "Generates descriptive captions for video input using Google's Gemini multimodal LLM."
      }
    ]
  }
-}
+}
--- a/blueprints/Video
+++ b/blueprints/Video
--- a/(Mediapipe).json
+++ b/(Mediapipe).json
--- a/blueprints/Video
+++ b/blueprints/Video
--- a/blueprints/Video
+++ b/blueprints/Video
--- a/blueprints/Video
+++ b/blueprints/Video
--- a/blueprints/Video
+++ b/blueprints/Video
@ -818,7 +818,7 @@
          }
        ],
        "extra": {},
-        "category": "Video Tools",
+        "category": "Conditioning & Preprocessors/Segmentation & Mask",
        "description": "Segments video into temporally consistent masks using Meta SAM3 from text or interactive prompts."
      }
    ]
--- a/blueprints/Video
+++ b/blueprints/Video
@ -412,7 +412,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Video generation and editing/Enhance video",
+        "category": "Video generation and editing/Upscale",
        "description": "Upscales video to 4× resolution using a GAN-based upscaling model."
      }
    ]
--- a/Multi-Person).json
+++ b/Multi-Person).json
--- a/comfy/background_removal/birefnet.py
+++ b/comfy/background_removal/birefnet.py
@ -105,7 +105,7 @@ class WindowAttention(nn.Module):

        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.long().view(-1)].view(
            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH
-        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
+        relative_position_bias = comfy.ops.cast_to_input(relative_position_bias.permute(2, 0, 1).contiguous(), attn)  # nH, Wh*Ww, Wh*Ww
        attn = attn + relative_position_bias.unsqueeze(0)

        if mask is not None:
--- a/comfy/bg_removal_model.py
+++ b/comfy/bg_removal_model.py
@ -55,12 +55,7 @@ class BackgroundRemovalModel():
        out = torch.nn.functional.interpolate(out, size=(H, W), mode="bicubic", antialias=False)

        mask = out.sigmoid().to(device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype())
-        if mask.ndim == 3:
-            mask = mask.unsqueeze(0)
-        if mask.shape[1] != 1:
-            mask = mask.movedim(-1, 1)
-
-        return mask
+        return mask.squeeze(1)  # (B, 1, H, W) -> (B, H, W)


 def load_background_removal_model(sd):
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@ -49,7 +49,7 @@ parser.add_argument("--temp-directory", type=str, default=None, help="Set the Co
 parser.add_argument("--input-directory", type=str, default=None, help="Set the ComfyUI input directory. Overrides --base-directory.")
 parser.add_argument("--auto-launch", action="store_true", help="Automatically launch ComfyUI in the default browser.")
 parser.add_argument("--disable-auto-launch", action="store_true", help="Disable auto launching the browser.")
-parser.add_argument("--cuda-device", type=int, default=None, metavar="DEVICE_ID", help="Set the id of the cuda device this instance will use. All other devices will not be visible.")
+parser.add_argument("--cuda-device", type=str, default=None, metavar="DEVICE_ID", help="Set the ids of cuda devices this instance will use, as a comma-separated list (e.g. '0' or '0,1'). All other devices will not be visible.")
 parser.add_argument("--default-device", type=int, default=None, metavar="DEFAULT_DEVICE_ID", help="Set the id of the default device, all other devices will stay visible.")
 cm_group = parser.add_mutually_exclusive_group()
 cm_group.add_argument("--cuda-malloc", action="store_true", help="Enable cudaMallocAsync (enabled by default for torch 2.0 and up).")
@ -111,7 +111,7 @@ parser.add_argument("--preview-method", type=LatentPreviewMethod, default=Latent
 parser.add_argument("--preview-size", type=int, default=512, help="Sets the maximum preview size for sampler nodes.")

 cache_group = parser.add_mutually_exclusive_group()
-cache_group.add_argument("--cache-ram", nargs='*', type=float, default=[], metavar="GB", help="Use RAM pressure caching with the specified headroom thresholds. This is the default caching mode. The first value sets the active-cache threshold; the optional second value sets the inactive-cache/pin threshold. Defaults when no values are provided: active 25%% of system RAM (min 4GB, max 32GB), inactive 75%% of system RAM (min 12GB, max 96GB).")
+cache_group.add_argument("--cache-ram", nargs='*', type=float, default=[], metavar="GB", help="Use RAM pressure caching with the specified headroom thresholds. This is the default caching mode. The first value sets the active-cache threshold; the optional second value sets the inactive-cache/pin threshold. Defaults when no values are provided: active 10%% of system RAM (min 2GB, max 10GB), inactive 100%% of system RAM (max 96GB).")
 cache_group.add_argument("--cache-classic", action="store_true", help="Use the old style (aggressive) caching.")
 cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.")
 cache_group.add_argument("--cache-none", action="store_true", help="Reduced RAM/VRAM usage at the expense of executing every node for each run.")
@ -149,6 +149,7 @@ parser.add_argument("--async-offload", nargs='?', const=2, type=int, default=Non
 parser.add_argument("--disable-async-offload", action="store_true", help="Disable async weight offloading.")
 parser.add_argument("--disable-dynamic-vram", action="store_true", help="Disable dynamic VRAM and use estimate based model loading.")
 parser.add_argument("--enable-dynamic-vram", action="store_true", help="Enable dynamic VRAM on systems where it's not enabled by default.")
+parser.add_argument("--fast-disk", action="store_true", help="Prefer disk-backed dynamic loading and offload over unpinned RAM. Can be faster for users with fast NVME disks.")

 parser.add_argument("--force-non-blocking", action="store_true", help="Force ComfyUI to use non-blocking operations for all applicable tensors. This may improve performance on some non-Nvidia systems but can cause issues with some workflows.")

@ -165,6 +166,8 @@ class PerformanceFeature(enum.Enum):

 parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. This is used to test new features so using it might crash your comfyui. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: {}".format(" ".join(map(lambda c: c.value, PerformanceFeature))))

+parser.add_argument("--debug-hang", action="store_true", help="Enable stack trace dumps on Ctrl-C for debugging hangs.")
+
 parser.add_argument("--disable-pinned-memory", action="store_true", help="Disable pinned memory use.")

 parser.add_argument("--mmap-torch-files", action="store_true", help="Use mmap when loading ckpt/pt files.")
--- a/comfy/clip_vision.py
+++ b/comfy/clip_vision.py
@ -24,13 +24,16 @@ IMAGE_ENCODERS = {
    "siglip_vision_model": comfy.clip_model.CLIPVisionModelProjection,
    "siglip2_vision_model": comfy.clip_model.CLIPVisionModelProjection,
    "dinov2": comfy.image_encoders.dino2.Dinov2Model,
-    "dinov3": comfy.image_encoders.dino3.DINOv3ViTModel
+    "dinov3": comfy.image_encoders.dino3.DINOv3ViTModel,
 }

 class ClipVisionModel():
    def __init__(self, json_config):
-        with open(json_config) as f:
-            config = json.load(f)
+        if isinstance(json_config, dict):
+            config = json_config
+        else:
+            with open(json_config) as f:
+                config = json.load(f)

        self.image_size = config.get("image_size", 224)
        self.image_mean = config.get("image_mean", [0.48145466, 0.4578275, 0.40821073])
@ -136,8 +139,10 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
        json_config = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "image_encoders"), "dino2_giant.json")
    elif 'encoder.layer.23.layer_scale2.lambda1' in sd:
        json_config = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "image_encoders"), "dino2_large.json")
-    elif 'layer.9.attention.o_proj.bias' in sd: # dinov3
+    elif 'layer.9.attention.o_proj.bias' in sd: # dinov3 large (24 layers)
        json_config = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "image_encoders"), "dino3_large.json")
+    elif 'layer.0.mlp.gate_proj.weight' in sd and 'layer.31.norm1.weight' in sd: # Dinov3 ViT-H/16+ (SwiGLU gated MLP, 32 layers)
+        json_config = comfy.image_encoders.dino3.DINOV3_VITH_CONFIG
    else:
        return None

--- a/comfy/comfy_types/node_typing.py
+++ b/comfy/comfy_types/node_typing.py
@ -1,6 +1,5 @@
 """Comfy-specific type hinting"""

-from __future__ import annotations
 from typing import Literal, TypedDict, Optional
 from typing_extensions import NotRequired
 from abc import ABC, abstractmethod
--- a/comfy/controlnet.py
+++ b/comfy/controlnet.py
@ -15,13 +15,14 @@
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
 """
-
+from __future__ import annotations

 import torch
 from enum import Enum
 import math
 import os
 import logging
+import copy
 import comfy.utils
 import comfy.model_management
 import comfy.model_detection
@ -38,7 +39,7 @@ import comfy.ldm.hydit.controlnet
 import comfy.ldm.flux.controlnet
 import comfy.ldm.qwen_image.controlnet
 import comfy.cldm.dit_embedder
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Union
 if TYPE_CHECKING:
    from comfy.hooks import HookGroup

@ -64,6 +65,18 @@ class StrengthType(Enum):
    CONSTANT = 1
    LINEAR_UP = 2

+class ControlIsolation:
+    '''Temporarily set a ControlBase object's previous_controlnet to None to prevent cascading calls.'''
+    def __init__(self, control: ControlBase):
+        self.control = control
+        # The link is captured here, at construction time, not when the `with` block is entered.
+        self.orig_previous_controlnet = control.previous_controlnet
+
+    def __enter__(self):
+        # Nothing is returned, so `with ControlIsolation(c) as x` would bind x to None.
+        self.control.previous_controlnet = None
+
+    def __exit__(self, *args):
+        # Restores the saved link unconditionally; returning None lets any exception
+        # raised inside the `with` body propagate.
+        self.control.previous_controlnet = self.orig_previous_controlnet
+
 class ControlBase:
    def __init__(self):
        self.cond_hint_original = None
@ -77,7 +90,7 @@ class ControlBase:
        self.compression_ratio = 8
        self.upscale_algorithm = 'nearest-exact'
        self.extra_args = {}
-        self.previous_controlnet = None
+        self.previous_controlnet: Union[ControlBase, None] = None
        self.extra_conds = []
        self.strength_type = StrengthType.CONSTANT
        self.concat_mask = False
@ -85,6 +98,7 @@ class ControlBase:
        self.extra_concat = None
        self.extra_hooks: HookGroup = None
        self.preprocess_image = lambda a: a
+        self.multigpu_clones: dict[torch.device, ControlBase] = {}

    def set_cond_hint(self, cond_hint, strength=1.0, timestep_percent_range=(0.0, 1.0), vae=None, extra_concat=[]):
        self.cond_hint_original = cond_hint
@ -111,17 +125,38 @@ class ControlBase:
    def cleanup(self):
        if self.previous_controlnet is not None:
            self.previous_controlnet.cleanup()
-
+        # Clean every registered per-device clone with its previous_controlnet link
+        # masked, so the clone's cleanup() does not recurse into its own chain.
+        for device_cnet in self.multigpu_clones.values():
+            with ControlIsolation(device_cnet):
+                device_cnet.cleanup()
        self.cond_hint = None
        self.extra_concat = None
        self.timestep_range = None

    def get_models(self):
        out = []
+        # Registered per-device clones contribute only their own models
+        # (get_models_only_self masks their previous_controlnet during the call).
+        for device_cnet in self.multigpu_clones.values():
+            out += device_cnet.get_models_only_self()
        if self.previous_controlnet is not None:
            out += self.previous_controlnet.get_models()
        return out

+    def get_models_only_self(self):
+        'Calls get_models, but temporarily sets previous_controlnet to None.'
+        # previous_controlnet is put back by ControlIsolation.__exit__ once get_models() has returned.
+        with ControlIsolation(self):
+            return self.get_models()
+
+    def get_instance_for_device(self, device):
+        'Returns instance of this Control object intended for selected device.'
+        # Falls back to this object itself when no clone is registered under `device`.
+        return self.multigpu_clones.get(device, self)
+
+    def deepclone_multigpu(self, load_device, autoregister=False):
+        '''
+        Create deep clone of Control object where model(s) is set to other devices.
+
+        When autoregister is set to True, the deep clone is also added to multigpu_clones dict.
+
+        This base implementation always raises NotImplementedError; subclasses are
+        expected to override it.
+        '''
+        raise NotImplementedError("Classes inheriting from ControlBase should define their own deepclone_multigpu function.")
+
    def get_extra_hooks(self):
        out = []
        if self.extra_hooks is not None:
@ -130,7 +165,7 @@ class ControlBase:
            out += self.previous_controlnet.get_extra_hooks()
        return out

-    def copy_to(self, c):
+    def copy_to(self, c: ControlBase):
        c.cond_hint_original = self.cond_hint_original
        c.strength = self.strength
        c.timestep_percent_range = self.timestep_percent_range
@ -284,6 +319,14 @@ class ControlNet(ControlBase):
        self.copy_to(c)
        return c

+    def deepclone_multigpu(self, load_device, autoregister=False):
+        # Start from a regular copy(), then give the clone its own deep copy of the
+        # model object and wrap it in a new ModelPatcher that targets load_device.
+        c = self.copy()
+        c.control_model = copy.deepcopy(c.control_model)
+        c.control_model_wrapped = comfy.model_patcher.ModelPatcher(c.control_model, load_device=load_device, offload_device=comfy.model_management.unet_offload_device())
+        # Registration happens on the object being cloned (self), keyed by load_device;
+        # an existing entry for the same device is overwritten.
+        if autoregister:
+            self.multigpu_clones[load_device] = c
+        return c
+
    def get_models(self):
        out = super().get_models()
        out.append(self.control_model_wrapped)
@ -314,6 +357,10 @@ class QwenFunControlNet(ControlNet):
        super().pre_run(model, percent_to_timestep_function)
        self.set_extra_arg("base_model", model.diffusion_model)

+    def cleanup(self):
+        # Remove the "base_model" entry that pre_run registers via set_extra_arg
+        # (presumably stored in extra_args -- confirm against set_extra_arg).
+        # pop() with a default makes this a no-op when the key is absent.
+        self.extra_args.pop("base_model", None)
+        super().cleanup()
+
    def copy(self):
        c = QwenFunControlNet(None, global_average_pooling=self.global_average_pooling, load_device=self.load_device, manual_cast_dtype=self.manual_cast_dtype)
        c.control_model = self.control_model
@ -906,6 +953,14 @@ class T2IAdapter(ControlBase):
        self.copy_to(c)
        return c

+    def deepclone_multigpu(self, load_device, autoregister=False):
+        # Start from a regular copy(), then replace the adapter model with a deep copy
+        # and point the clone's `device` attribute at load_device.
+        c = self.copy()
+        c.t2i_model = copy.deepcopy(c.t2i_model)
+        c.device = load_device
+        # Registration happens on the object being cloned (self), keyed by load_device.
+        if autoregister:
+            self.multigpu_clones[load_device] = c
+        return c
+
 def load_t2i_adapter(t2i_data, model_options={}): #TODO: model_options
    compression_ratio = 8
    upscale_algorithm = 'nearest-exact'
--- a/comfy/float.py
+++ b/comfy/float.py
@ -1,5 +1,20 @@
+import logging
+
 import torch

+# Probe for the optional comfy_kitchen FP8 stochastic-rounding kernel at import time.
+# The flag stays False unless both the import and the attribute lookup succeed.
+_CK_STOCHASTIC_ROUNDING_AVAILABLE = False
+try:
+    import comfy_kitchen as ck
+    _ck_stochastic_rounding_fp8 = ck.stochastic_rounding_fp8
+    _CK_STOCHASTIC_ROUNDING_AVAILABLE = True
+except (AttributeError, ImportError):
+    # NOTE: a missing package (ImportError) and a package without
+    # stochastic_rounding_fp8 (AttributeError) both log this same warning.
+    logging.warning("comfy_kitchen does not support stochastic FP8 rounding, please update comfy_kitchen.")
+
+# Fallback stub so the name always exists; it raises if it is ever called.
+if not _CK_STOCHASTIC_ROUNDING_AVAILABLE:
+    def _ck_stochastic_rounding_fp8(value, rng, dtype):
+        raise NotImplementedError("comfy_kitchen does not support stochastic FP8 rounding")
+
+
 def calc_mantissa(abs_x, exponent, normal_mask, MANTISSA_BITS, EXPONENT_BIAS, generator=None):
    mantissa_scaled = torch.where(
        normal_mask,
@ -57,6 +72,10 @@ def stochastic_rounding(value, dtype, seed=0):
    if dtype == torch.float8_e4m3fn or dtype == torch.float8_e5m2:
        generator = torch.Generator(device=value.device)
        generator.manual_seed(seed)
+        if _CK_STOCHASTIC_ROUNDING_AVAILABLE:
+            rng = torch.randint(0, 256, value.size(), dtype=torch.uint8, layout=value.layout, device=value.device, generator=generator)
+            return _ck_stochastic_rounding_fp8(value, rng, dtype)
+
        output = torch.empty_like(value, dtype=dtype)
        num_slices = max(1, (value.numel() / (4096 * 4096)))
        slice_size = max(1, round(value.shape[0] / num_slices))
--- a/comfy/image_encoders/dino2.py
+++ b/comfy/image_encoders/dino2.py
@ -1,7 +1,13 @@
 import torch
+import torch.nn.functional as F
+
 from comfy.text_encoders.bert import BertAttention
 import comfy.model_management
 from comfy.ldm.modules.attention import optimized_attention_for_device
+from comfy.ldm.depth_anything_3.reference_view_selector import (
+    select_reference_view, reorder_by_reference, restore_original_order,
+    THRESH_FOR_REF_SELECTION,
+)


 class Dino2AttentionOutput(torch.nn.Module):
@ -14,13 +20,41 @@ class Dino2AttentionOutput(torch.nn.Module):


 class Dino2AttentionBlock(torch.nn.Module):
-    def __init__(self, embed_dim, heads, layer_norm_eps, dtype, device, operations):
+    def __init__(self, embed_dim, heads, layer_norm_eps, dtype, device, operations,
+                 qk_norm=False):
        super().__init__()
+        self.heads = heads
+        self.head_dim = embed_dim // heads
        self.attention = BertAttention(embed_dim, heads, dtype, device, operations)
        self.output = Dino2AttentionOutput(embed_dim, embed_dim, layer_norm_eps, dtype, device, operations)
+        # Optional LayerNorm over head_dim for queries and keys; both stay None when
+        # qk_norm is False, which is what forward() checks to pick the fast path.
+        if qk_norm:
+            self.q_norm = operations.LayerNorm(self.head_dim, dtype=dtype, device=device)
+            self.k_norm = operations.LayerNorm(self.head_dim, dtype=dtype, device=device)
+        else:
+            self.q_norm = None
+            self.k_norm = None

-    def forward(self, x, mask, optimized_attention):
-        return self.output(self.attention(x, mask, optimized_attention))
+    def forward(self, x, mask, optimized_attention, pos=None, rope=None):
+        # Fast path used by the existing CLIP-vision DINOv2 (no DA3 extensions).
+        if self.q_norm is None and rope is None:
+            return self.output(self.attention(x, mask, optimized_attention))
+
+        # DA3 path: do QKV manually so we can apply per-head QK-norm and 2D RoPE.
+        attn = self.attention
+        B, N, C = x.shape
+        h = self.heads
+        d = self.head_dim
+        # Reuse BertAttention's query/key/value sub-modules, then split into heads:
+        # view + transpose gives (B, h, N, d).
+        q = attn.query(x).view(B, N, h, d).transpose(1, 2)
+        k = attn.key(x).view(B, N, h, d).transpose(1, 2)
+        v = attn.value(x).view(B, N, h, d).transpose(1, 2)
+        if self.q_norm is not None:
+            q = self.q_norm(q)
+            k = self.k_norm(k)
+        # RoPE is applied only when both a rope module and positions are supplied.
+        if rope is not None and pos is not None:
+            q = rope(q, pos)
+            k = rope(k, pos)
+        # skip_reshape=True presumably tells optimized_attention that q/k/v are already
+        # split per head -- confirm against its implementation.
+        out = optimized_attention(q, k, v, h, mask=mask, skip_reshape=True)
+        return self.output(out)


 class LayerScale(torch.nn.Module):
@ -64,9 +98,11 @@ class SwiGLUFFN(torch.nn.Module):


 class Dino2Block(torch.nn.Module):
-    def __init__(self, dim, num_heads, layer_norm_eps, dtype, device, operations, use_swiglu_ffn):
+    def __init__(self, dim, num_heads, layer_norm_eps, dtype, device, operations, use_swiglu_ffn,
+                 qk_norm=False):
        super().__init__()
-        self.attention = Dino2AttentionBlock(dim, num_heads, layer_norm_eps, dtype, device, operations)
+        self.attention = Dino2AttentionBlock(dim, num_heads, layer_norm_eps, dtype, device, operations,
+                                             qk_norm=qk_norm)
        self.layer_scale1 = LayerScale(dim, dtype, device, operations)
        self.layer_scale2 = LayerScale(dim, dtype, device, operations)
        if use_swiglu_ffn:
@ -76,19 +112,90 @@ class Dino2Block(torch.nn.Module):
        self.norm1 = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)
        self.norm2 = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)

-    def forward(self, x, optimized_attention):
-        x = x + self.layer_scale1(self.attention(self.norm1(x), None, optimized_attention))
+    def forward(self, x, optimized_attention, pos=None, rope=None, attn_mask=None):
+        x = x + self.layer_scale1(self.attention(self.norm1(x), attn_mask, optimized_attention,
+                                                 pos=pos, rope=rope))
        x = x + self.layer_scale2(self.mlp(self.norm2(x)))
        return x


-class Dino2Encoder(torch.nn.Module):
-    def __init__(self, dim, num_heads, layer_norm_eps, num_layers, dtype, device, operations, use_swiglu_ffn):
+# -----------------------------------------------------------------------------
+# 2D Rotary position embedding (DA3 extension)
+# -----------------------------------------------------------------------------
+
+
+class _PositionGetter:
+    """Cache (h, w) -> flat (y, x) position grid used to feed ``rope``."""
+
+    def __init__(self):
+        # Keyed by (height, width, device); entries are never evicted.
+        self._cache: dict = {}
+
+    def __call__(self, batch_size: int, height: int, width: int, device) -> torch.Tensor:
+        key = (height, width, device)
+        if key not in self._cache:
+            y = torch.arange(height, device=device)
+            x = torch.arange(width, device=device)
+            # One (y, x) row per grid cell, height * width rows in total.
+            self._cache[key] = torch.cartesian_prod(y, x)
+        cached = self._cache[key]
+        # clone() hands back a fresh (batch_size, height * width, 2) tensor, so callers
+        # can modify the result without touching the cached grid.
+        return cached.view(1, height * width, 2).expand(batch_size, -1, -1).clone()
+
+
+class RotaryPositionEmbedding2D(torch.nn.Module):
+    """2D RoPE used by DA3-Small/Base. No learnable parameters."""
+
+    def __init__(self, frequency: float = 100.0):
        super().__init__()
-        self.layer = torch.nn.ModuleList([Dino2Block(dim, num_heads, layer_norm_eps, dtype, device, operations, use_swiglu_ffn = use_swiglu_ffn)
-                                          for _ in range(num_layers)])
+        self.base_frequency = frequency
+        # Keyed by (dim, seq_len, device, dtype); entries are never evicted.
+        self._freq_cache: dict = {}
+
+    def _components(self, dim: int, seq_len: int, device, dtype):
+        # Returns cached (cos, sin) lookup tables with one row per position in
+        # [0, seq_len); each row has `dim` entries when dim is even.
+        key = (dim, seq_len, device, dtype)
+        if key not in self._freq_cache:
+            exp = torch.arange(0, dim, 2, device=device).float() / dim
+            inv_freq = 1.0 / (self.base_frequency ** exp)
+            pos = torch.arange(seq_len, device=device, dtype=inv_freq.dtype)
+            ang = torch.einsum("i,j->ij", pos, inv_freq)
+            # Angles are cast to the target dtype before cos/sin are taken.
+            ang = ang.to(dtype)
+            # Duplicate the angles so both halves used by _rotate share the same angle.
+            ang = torch.cat((ang, ang), dim=-1)
+            self._freq_cache[key] = (ang.cos().to(dtype), ang.sin().to(dtype))
+        return self._freq_cache[key]
+
+    @staticmethod
+    def _rotate(x: torch.Tensor) -> torch.Tensor:
+        # Split the last dim in half and return (-second_half, first_half).
+        d = x.shape[-1]
+        x1, x2 = x[..., : d // 2], x[..., d // 2:]
+        return torch.cat((-x2, x1), dim=-1)
+
+    def _apply_1d(self, tokens, positions, cos_c, sin_c):
+        # Look up one cos/sin row per position; the inserted axis 1 broadcasts over
+        # the heads axis of `tokens` (Dino2AttentionBlock passes (B, heads, N, head_dim)).
+        cos = F.embedding(positions, cos_c)[:, None, :, :]
+        sin = F.embedding(positions, sin_c)[:, None, :, :]
+        return (tokens * cos) + (self._rotate(tokens) * sin)
+
+    def forward(self, tokens: torch.Tensor, positions: torch.Tensor) -> torch.Tensor:
+        # Each half of the last dim is rotated by one positional axis.
+        feature_dim = tokens.size(-1) // 2
+        # Table length is derived from the largest position present; int() reads the
+        # value back into Python.
+        max_pos = int(positions.max()) + 1
+        cos_c, sin_c = self._components(feature_dim, max_pos, tokens.device, tokens.dtype)
+        v, h = tokens.chunk(2, dim=-1)
+        # First half uses positions[..., 0], second half positions[..., 1]
+        # (the (y, x) order produced by _PositionGetter).
+        v = self._apply_1d(v, positions[..., 0], cos_c, sin_c)
+        h = self._apply_1d(h, positions[..., 1], cos_c, sin_c)
+        return torch.cat((v, h), dim=-1)
+
+
+class Dino2Encoder(torch.nn.Module):
+    def __init__(self, dim, num_heads, layer_norm_eps, num_layers, dtype, device, operations, use_swiglu_ffn,
+                 qknorm_start: int = -1):
+        super().__init__()
+        self.layer = torch.nn.ModuleList([
+            Dino2Block(
+                dim, num_heads, layer_norm_eps, dtype, device, operations,
+                use_swiglu_ffn=use_swiglu_ffn,
+                qk_norm=(qknorm_start != -1 and i >= qknorm_start),
+            )
+            for i in range(num_layers)
+        ])

    def forward(self, x, intermediate_output=None):
+        # Backward-compat path used by ``ClipVisionModel`` (no DA3 extensions).
        optimized_attention = optimized_attention_for_device(x.device, False, small_input=True)

        if intermediate_output is not None:
@ -122,16 +229,27 @@ class Dino2PatchEmbeddings(torch.nn.Module):


 class Dino2Embeddings(torch.nn.Module):
-    def __init__(self, dim, dtype, device, operations):
+    def __init__(self, dim, dtype, device, operations,
+                 patch_size: int = 14, image_size: int = 518,
+                 use_mask_token: bool = True,
+                 num_camera_tokens: int = 0):
        super().__init__()
-        patch_size = 14
-        image_size = 518
        self.patch_size = patch_size
+        self.image_size = image_size

        self.patch_embeddings = Dino2PatchEmbeddings(dim, patch_size=patch_size, image_size=image_size, dtype=dtype, device=device, operations=operations)
        self.position_embeddings = torch.nn.Parameter(torch.empty(1, (image_size // patch_size) ** 2 + 1, dim, dtype=dtype, device=device))
        self.cls_token = torch.nn.Parameter(torch.empty(1, 1, dim, dtype=dtype, device=device)) # mask_token is a pre-training param, kept only so strict loading accepts the key.
-        self.mask_token = torch.nn.Parameter(torch.empty(1, dim, dtype=dtype, device=device))
+        if use_mask_token:
+            self.mask_token = torch.nn.Parameter(torch.empty(1, dim, dtype=dtype, device=device))
+        else:
+            self.mask_token = None
+        if num_camera_tokens > 0:
+            # DA3 stores (ref_token, src_token) pairs that get injected at the
+            # alt-attn boundary; see ``Dinov2Model._inject_camera_token``.
+            self.camera_token = torch.nn.Parameter(torch.empty(1, num_camera_tokens, dim, dtype=dtype, device=device))
+        else:
+            self.camera_token = None

    def interpolate_pos_encoding(self, x, h_pixels, w_pixels):
        pos_embed = comfy.model_management.cast_to_device(self.position_embeddings, x.device, torch.float32)
@ -140,12 +258,22 @@ class Dino2Embeddings(torch.nn.Module):
        patch_pos = pos_embed[:, 1:]
        N = patch_pos.shape[1]
        M = int(N ** 0.5)
+        assert N == M * M, f"DINOv2 position grid must be square, got N={N} patches (sqrt={M})"
        h0 = h_pixels // self.patch_size
        w0 = w_pixels // self.patch_size
-        scale_factor = ((h0 + 0.1) / M, (w0 + 0.1) / M)  # +0.1 matches upstream DINOv2's FP-rounding workaround so the interpolate output size lands on (h0, w0).
+        # +0.1 matches upstream DINOv2's FP-rounding workaround so the interpolate output size lands on (h0, w0).
+        # scale_factor is (height_scale, width_scale) -- height MUST come first;
+        # swapping these only happens to work for square inputs and breaks
+        # non-square paths like DA3-Small / DA3-Base multi-view.
+        scale_factor = ((h0 + 0.1) / M, (w0 + 0.1) / M)

        patch_pos = patch_pos.reshape(1, M, M, -1).permute(0, 3, 1, 2)
        patch_pos = torch.nn.functional.interpolate(patch_pos, scale_factor=scale_factor, mode="bicubic", antialias=False)
+        assert (h0, w0) == patch_pos.shape[-2:], (
+            f"Interpolated pos-embed grid {tuple(patch_pos.shape[-2:])} does not match "
+            f"target patch grid ({h0}, {w0}) for input {h_pixels}x{w_pixels} (patch_size={self.patch_size}); "
+            f"check scale_factor axis order and +0.1 rounding workaround"
+        )
        patch_pos = patch_pos.permute(0, 2, 3, 1).flatten(1, 2)
        return torch.cat((class_pos, patch_pos), dim=1).to(x.dtype)

@ -168,12 +296,51 @@ class Dinov2Model(torch.nn.Module):
        heads = config_dict["num_attention_heads"]
        layer_norm_eps = config_dict["layer_norm_eps"]
        use_swiglu_ffn = config_dict["use_swiglu_ffn"]
+        patch_size = config_dict.get("patch_size", 14)
+        image_size = config_dict.get("image_size", 518)
+        use_mask_token = config_dict.get("use_mask_token", True)

-        self.embeddings = Dino2Embeddings(dim, dtype, device, operations)
-        self.encoder = Dino2Encoder(dim, heads, layer_norm_eps, num_layers, dtype, device, operations, use_swiglu_ffn = use_swiglu_ffn)
+        # DA3 extensions (all default to disabled).
+        self.alt_start = config_dict.get("alt_start", -1)
+        self.qknorm_start = config_dict.get("qknorm_start", -1)
+        self.rope_start = config_dict.get("rope_start", -1)
+        self.cat_token = config_dict.get("cat_token", False)
+        rope_freq = config_dict.get("rope_freq", 100.0)
+
+        self.embed_dim = dim
+        self.patch_size = patch_size
+        self.num_register_tokens = 0
+        self.patch_start_idx = 1
+
+        if self.rope_start != -1 and rope_freq > 0:
+            self.rope = RotaryPositionEmbedding2D(frequency=rope_freq)
+            self._position_getter = _PositionGetter()
+        else:
+            self.rope = None
+            self._position_getter = None
+
+        # camera_token shape: (1, 2, dim) -> (ref_token, src_token).
+        num_cam_tokens = 2 if self.alt_start != -1 else 0
+
+        self.embeddings = Dino2Embeddings(
+            dim, dtype, device, operations,
+            patch_size=patch_size, image_size=image_size,
+            use_mask_token=use_mask_token, num_camera_tokens=num_cam_tokens,
+        )
+        self.encoder = Dino2Encoder(
+            dim, heads, layer_norm_eps, num_layers, dtype, device, operations,
+            use_swiglu_ffn=use_swiglu_ffn,
+            qknorm_start=self.qknorm_start,
+        )
        self.layernorm = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)

    def forward(self, pixel_values, attention_mask=None, intermediate_output=None):
+        if self.alt_start != -1:
+            raise RuntimeError(
+                "Dinov2Model.forward() is the backward-compatible CLIP-vision path and does not "
+                "apply DA3 extensions (RoPE, alternating attention, camera-token injection). "
+                "Use get_intermediate_layers_da3() for Depth Anything 3 models."
+            )
        x = self.embeddings(pixel_values)
        x, i = self.encoder(x, intermediate_output=intermediate_output)
        x = self.layernorm(x)
@ -181,6 +348,7 @@ class Dinov2Model(torch.nn.Module):
        return x, i, pooled_output, None

    def get_intermediate_layers(self, pixel_values, indices, apply_norm=True):
+        """Single-view multi-layer feature extraction."""
        x = self.embeddings(pixel_values)
        optimized_attention = optimized_attention_for_device(x.device, False, small_input=True)
        n_layers = len(self.encoder.layer)
@ -197,3 +365,132 @@ class Dinov2Model(torch.nn.Module):
            if i >= max_idx:
                break
        return [cache[i] for i in resolved]
+
+    # ------------------------------------------------------------------
+    # Depth Anything 3 forward
+    # ------------------------------------------------------------------
+    def _prepare_rope_positions(self, B, S, H, W, device):
+        """Build the RoPE position tensors used by local and global attention.
+
+        Returns ``(None, None)`` when RoPE is disabled (``self.rope is None``).
+        Otherwise returns ``(pos_local, pos_global)``. Both tensors start with
+        ``self.patch_start_idx`` all-zero entries for the leading special
+        token(s), followed by one entry per patch, for each of the ``B * S``
+        views.
+        """
+        if self.rope is None:
+            return None, None
+        # Patch-grid size of a single view.
+        ph, pw = H // self.patch_size, W // self.patch_size
+        # presumably (B*S, ph*pw, 2) grid coordinates -- TODO confirm against _PositionGetter.
+        pos = self._position_getter(B * S, ph, pw, device=device)
+        # Shift so the cls/cam token at position 0 is reserved for "no diff".
+        pos = pos + 1
+        cls_pos = torch.zeros(B * S, self.patch_start_idx, 2, device=device, dtype=pos.dtype)
+        # Per-view local: real grid positions for patches, 0 for cls token.
+        pos_local = torch.cat([cls_pos, pos], dim=1)
+        # Global (across views): cls token still at 0, but every patch gets the
+        # same constant position 1 (the grid coordinates are NOT reused here).
+        pos_global = torch.cat([cls_pos, torch.zeros_like(pos) + 1], dim=1)
+        return pos_local, pos_global
+
+    def _inject_camera_token(self, x: torch.Tensor, B: int, S: int, cam_token: "torch.Tensor | None") -> torch.Tensor:
+        """Return a copy of ``x`` whose token 0 in every view is the camera token.
+
+        Args:
+            x: Token tensor of shape (B, S, N, C).
+            B: Batch size.
+            S: Number of views.
+            cam_token: Optional replacement token(s), assigned as-is to
+                ``x[:, :, 0]`` (presumably (B, S, C) -- TODO confirm with caller).
+                When None, the learned ``self.embeddings.camera_token`` is used.
+
+        Returns:
+            A new tensor; the input ``x`` is not modified (it is cloned first).
+        """
+        # x: (B, S, N, C). Replace token at index 0 with the camera token.
+        if cam_token is not None:
+            inj = cam_token
+        else:
+            ct = comfy.model_management.cast_to_device(self.embeddings.camera_token, x.device, x.dtype)
+            # Slot 0 of the learned token goes to the first (reference) view,
+            # slot 1 is broadcast to the remaining S - 1 views.
+            ref_token = ct[:, :1].expand(B, -1, -1)
+            src_token = ct[:, 1:].expand(B, max(S - 1, 0), -1)
+            inj = torch.cat([ref_token, src_token], dim=1)
+        x = x.clone()
+        x[:, :, 0] = inj
+        return x
+
+    def get_intermediate_layers_da3(self, pixel_values, out_layers, cam_token=None, ref_view_strategy="saddle_balanced", export_feat_layers=None):
+        """Multi-view multi-layer feature extraction used by Depth Anything 3.
+
+        Args:
+            pixel_values: (B, 3, H, W) or (B, S, 3, H, W). A 4-D input is
+                treated as a single view (S = 1).
+            out_layers: Indices of encoder blocks whose outputs are returned.
+                They are collected into a set, so duplicates collapse and the
+                results come back in ascending block order.
+            cam_token: Optional tensor written over token 0 of every view at
+                block ``alt_start``. When None the learned camera token is
+                used. Supplying it also disables reference-view selection.
+            ref_view_strategy: Strategy name forwarded to
+                ``select_reference_view``.
+            export_feat_layers: Optional block indices whose outputs are
+                additionally returned as auxiliary features.
+
+        Returns:
+            ``(features, aux)``. ``features`` is a list of
+            ``(patch_tokens, cls_token)`` tuples, one per collected block:
+            ``patch_tokens`` has the final layernorm applied and the leading
+            ``1 + num_register_tokens`` tokens dropped; ``cls_token`` is token 0
+            taken before the final layernorm. ``aux`` is the list of
+            layer-normed auxiliary features with the same leading tokens
+            dropped.
+
+        Raises:
+            AssertionError: if ``pixel_values`` is not 4-D/5-D with 3 channels.
+            ValueError: if a collected feature has an unexpected channel width.
+        """
+        if pixel_values.ndim == 4:
+            pixel_values = pixel_values.unsqueeze(1)
+        assert pixel_values.ndim == 5 and pixel_values.shape[2] == 3, \
+            f"expected (B,3,H,W) or (B,S,3,H,W); got {tuple(pixel_values.shape)}"
+        B, S, _, H, W = pixel_values.shape
+
+        # Patch + cls + (interpolated) pos embed for each view.
+        x = pixel_values.reshape(B * S, 3, H, W)
+        x = self.embeddings(x)                          # (B*S, 1+N, C)
+        x = x.reshape(B, S, x.shape[-2], x.shape[-1])    # (B, S, 1+N, C)
+
+        pos_local, pos_global = self._prepare_rope_positions(B, S, H, W, x.device)
+        # optimized_attention is only used by blocks without QK-norm/RoPE
+        # (vanilla DINOv2 path); blocks with QK-norm/RoPE enabled fall through
+        # to SDPA.
+        optimized_attention = optimized_attention_for_device(x.device, False, small_input=True)
+
+        out_set = set(out_layers)
+        export_set = set(export_feat_layers) if export_feat_layers else set()
+        outputs: list[torch.Tensor] = []
+        aux_outputs: list[torch.Tensor] = []
+        # local_x tracks the output of the most recent per-view (local) block;
+        # it becomes the left half of the feature when cat_token is set.
+        local_x = x
+        # View permutation chosen by reference-view selection; stays None when
+        # selection never runs.
+        b_idx = None
+
+
+        for i, blk in enumerate(self.encoder.layer):
+            apply_rope = self.rope is not None and i >= self.rope_start
+            block_rope = self.rope if apply_rope else None
+            l_pos = pos_local if apply_rope else None
+            g_pos = pos_global if apply_rope else None
+
+            # Reference-view selection threshold: matches the upstream constant
+            # THRESH_FOR_REF_SELECTION = 3. Skipped when a user-supplied
+            # cam_token is provided (camera info already pins the geometry).
+            if (self.alt_start != -1 and i == self.alt_start - 1 and S >= THRESH_FOR_REF_SELECTION and cam_token is None):
+                b_idx = select_reference_view(x, strategy=ref_view_strategy)
+                x = reorder_by_reference(x, b_idx)
+                local_x = reorder_by_reference(local_x, b_idx)
+
+            if self.alt_start != -1 and i == self.alt_start:
+                x = self._inject_camera_token(x, B, S, cam_token)
+
+            # From alt_start onwards, odd-indexed blocks attend across views;
+            # every other block attends within a single view.
+            if self.alt_start != -1 and i >= self.alt_start and (i % 2 == 1):
+                # Global attention across views: flatten S into the seq dim.
+                t = x.reshape(B, S * x.shape[-2], x.shape[-1])
+                p = g_pos.reshape(B, S * g_pos.shape[-2], g_pos.shape[-1]) if g_pos is not None else None
+                t = blk(t, optimized_attention=optimized_attention, pos=p, rope=block_rope)
+                x = t.reshape(B, S, x.shape[-2], x.shape[-1])
+            else:
+                # Per-view local attention.
+                t = x.reshape(B * S, x.shape[-2], x.shape[-1])
+                p = l_pos.reshape(B * S, l_pos.shape[-2], l_pos.shape[-1]) if l_pos is not None else None
+                t = blk(t, optimized_attention=optimized_attention, pos=p, rope=block_rope)
+                x = t.reshape(B, S, x.shape[-2], x.shape[-1])
+                local_x = x
+
+            if i in out_set:
+                if self.cat_token:
+                    out_x = torch.cat([local_x, x], dim=-1)
+                else:
+                    out_x = x
+                # Restore original view order on the way out so heads see views
+                # in the user's expected order.
+                if b_idx is not None and self.alt_start != -1:
+                    out_x = restore_original_order(out_x, b_idx)
+                outputs.append(out_x)
+
+            if i in export_set:
+                # Auxiliary features are the raw block output (never concatenated
+                # with local_x), also returned in the original view order.
+                aux = x
+                if b_idx is not None and self.alt_start != -1:
+                    aux = restore_original_order(aux, b_idx)
+                aux_outputs.append(aux)
+
+        # Apply final norm. When cat_token is set, only the right half
+        # ("global" features) is normalised; the left half is left as-is to
+        # match the upstream DA3 head signature.
+        normed: list[torch.Tensor] = []
+        cls_tokens: list[torch.Tensor] = []
+        for out_x in outputs:
+            # The cls/cam token is captured before the final layernorm.
+            cls_tokens.append(out_x[:, :, 0])
+            if out_x.shape[-1] == self.embed_dim:
+                normed.append(self.layernorm(out_x))
+            elif out_x.shape[-1] == self.embed_dim * 2:
+                left = out_x[..., :self.embed_dim]
+                right = self.layernorm(out_x[..., self.embed_dim:])
+                normed.append(torch.cat([left, right], dim=-1))
+            else:
+                raise ValueError(f"Unexpected token width: {out_x.shape[-1]}")
+
+        # Drop cls/cam token from the patch sequence.
+        normed = [o[..., 1 + self.num_register_tokens:, :] for o in normed]
+
+        # Final layernorm + drop cls token from auxiliary features too.
+        aux_normed = [self.layernorm(o)[..., 1 + self.num_register_tokens:, :]
+                      for o in aux_outputs]
+        return list(zip(normed, cls_tokens)), aux_normed
--- a/comfy/image_encoders/dino3.py
+++ b/comfy/image_encoders/dino3.py
@ -3,10 +3,31 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F

-import comfy.model_management
+import comfy.ops
 from comfy.ldm.modules.attention import optimized_attention_for_device
 from comfy.image_encoders.dino2 import LayerScale as DINOv3ViTLayerScale

+
+# DINOv3 ViT-H/16+ (SwiGLU)
+# Config dict in the form DINOv3ViTModel reads: "use_gated_mlp" and
+# "gated_mlp_act" select the gated MLP with a SiLU activation. The
+# "model_type" and "image_*" keys are not read by the model code in this
+# file section -- presumably consumed by the loader/preprocessor (verify).
+DINOV3_VITH_CONFIG = {
+    "model_type": "dinov3",
+    "num_hidden_layers": 32,
+    "hidden_size": 1280,
+    "num_attention_heads": 20,
+    "num_register_tokens": 4,
+    "intermediate_size": 5120,
+    "layer_norm_eps": 1e-5,
+    "num_channels": 3,
+    "patch_size": 16,
+    "rope_theta": 100.0,
+    "use_gated_mlp": True,
+    "gated_mlp_act": "silu",
+    "image_size": 1024,
+    "image_mean": [0.485, 0.456, 0.406],
+    "image_std": [0.229, 0.224, 0.225],
+}
+
+
 class DINOv3ViTMLP(nn.Module):
    def __init__(self, hidden_size, intermediate_size, mlp_bias, device, dtype, operations):
        super().__init__()
@ -19,10 +40,13 @@ class DINOv3ViTMLP(nn.Module):
    def forward(self, x):
        return self.down_proj(self.act_fn(self.up_proj(x)))

+
 def rotate_half(x):
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)
+
+
 def apply_rotary_pos_emb(q, k, cos, sin, **kwargs):
    num_tokens = q.shape[-2]
    num_patches = sin.shape[-2]
@ -39,6 +63,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, **kwargs):

    return q, k

+
 class DINOv3ViTAttention(nn.Module):
    def __init__(self, hidden_size, num_attention_heads, device, dtype, operations):
        super().__init__()
@ -46,20 +71,12 @@ class DINOv3ViTAttention(nn.Module):
        self.num_heads = num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads

-        self.k_proj = operations.Linear(self.embed_dim, self.embed_dim, bias=False, device=device, dtype=dtype) # key_bias = False
+        self.k_proj = operations.Linear(self.embed_dim, self.embed_dim, bias=False, device=device, dtype=dtype)  # key_bias = False
        self.v_proj = operations.Linear(self.embed_dim, self.embed_dim, bias=True, device=device, dtype=dtype)
-
        self.q_proj = operations.Linear(self.embed_dim, self.embed_dim, bias=True, device=device, dtype=dtype)
        self.o_proj = operations.Linear(self.embed_dim, self.embed_dim, bias=True, device=device, dtype=dtype)

-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: torch.Tensor | None = None,
-        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
-        **kwargs,
-    ) -> tuple[torch.Tensor, torch.Tensor | None]:
-
+    def forward(self, hidden_states, attention_mask=None, position_embeddings=None, **kwargs):
        batch_size, patches, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
@ -75,7 +92,6 @@ class DINOv3ViTAttention(nn.Module):
            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        attn = optimized_attention_for_device(query_states.device, mask=False)
-
        attn_output = attn(
            query_states, key_states, value_states, self.num_heads, attention_mask,
            skip_reshape=True, skip_output_reshape=True, low_precision_attention=False,
@ -84,27 +100,24 @@ class DINOv3ViTAttention(nn.Module):
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(batch_size, patches, -1).contiguous()
        attn_output = self.o_proj(attn_output)
-
        return attn_output

+
 class DINOv3ViTGatedMLP(nn.Module):
+    """Gated MLP computing ``down_proj(act(gate_proj(x)) * up_proj(x))``.
+
+    ``act`` selects the gate activation: ``"silu"`` gives SiLU, any other
+    value gives GELU.
+    """
-    def __init__(self, hidden_size, intermediate_size, mlp_bias, device, dtype, operations):
+    def __init__(self, hidden_size, intermediate_size, mlp_bias, device, dtype, operations, act="silu"):
         super().__init__()
         self.hidden_size = hidden_size
         self.intermediate_size = intermediate_size
         self.gate_proj = operations.Linear(self.hidden_size, self.intermediate_size, bias=mlp_bias, device=device, dtype=dtype)
         self.up_proj = operations.Linear(self.hidden_size, self.intermediate_size, bias=mlp_bias, device=device, dtype=dtype)
         self.down_proj = operations.Linear(self.intermediate_size, self.hidden_size, bias=mlp_bias, device=device, dtype=dtype)
-        self.act_fn = torch.nn.GELU()
+        # NOTE: the default is now SiLU; GELU (the previous hard-coded choice)
+        # is used only when act is something other than "silu".
+        self.act_fn = torch.nn.SiLU() if act == "silu" else torch.nn.GELU()

     def forward(self, x):
-        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
-        return down_proj
+        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))

-def get_patches_center_coordinates(
-    num_patches_h: int, num_patches_w: int, dtype: torch.dtype, device: torch.device
-) -> torch.Tensor:

+def get_patches_center_coordinates(num_patches_h, num_patches_w, dtype, device):
    coords_h = torch.arange(0.5, num_patches_h, dtype=dtype, device=device)
    coords_w = torch.arange(0.5, num_patches_w, dtype=dtype, device=device)
    coords_h = coords_h / num_patches_h
@ -114,105 +127,79 @@ def get_patches_center_coordinates(
    coords = 2.0 * coords - 1.0
    return coords

+
 class DINOv3ViTRopePositionEmbedding(nn.Module):
+    """Computes the RoPE ``(cos, sin)`` tables for the patch grid of an input image.
+
+    The grid size is derived from the input's height/width on every call, so
+    no fixed image size is stored.
+    """
     inv_freq: torch.Tensor

-    def __init__(self, rope_theta, hidden_size, num_attention_heads, image_size, patch_size, device, dtype):
+    def __init__(self, rope_theta, hidden_size, num_attention_heads, patch_size, device, dtype):
         super().__init__()
         self.base = rope_theta
         self.head_dim = hidden_size // num_attention_heads
-        self.num_patches_h = image_size // patch_size
-        self.num_patches_w = image_size // patch_size
         self.patch_size = patch_size

+        # inv_freq is always float32 and non-persistent (not saved in the state
+        # dict); the dtype argument is accepted but not used here.
         inv_freq = 1 / self.base ** torch.arange(0, 1, 4 / self.head_dim, dtype=torch.float32, device=device)
         self.register_buffer("inv_freq", inv_freq, persistent=False)

-    def forward(self, pixel_values: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+    def forward(self, pixel_values):
         _, _, height, width = pixel_values.shape
         num_patches_h = height // self.patch_size
         num_patches_w = width // self.patch_size

-        device = pixel_values.device
-        device_type = device.type if isinstance(device.type, str) and device.type != "mps" else "cpu"
-        with torch.amp.autocast(device_type = device_type, enabled=False):
-            patch_coords = get_patches_center_coordinates(
-                num_patches_h, num_patches_w, dtype=torch.float32, device=device
-            )
-
-            self.inv_freq = self.inv_freq.to(device)
-            angles = 2 * math.pi * patch_coords[:, :, None] * self.inv_freq[None, None, :]
-            angles = angles.flatten(1, 2)
-            angles = angles.tile(2)
-
-            cos = torch.cos(angles)
-            sin = torch.sin(angles)
-
-        dtype = pixel_values.dtype
-        return cos.to(dtype=dtype), sin.to(dtype=dtype)
+        # Angles are computed in float32; only the final cos/sin are cast to
+        # the input dtype. The explicit autocast-disabled region is gone.
+        patch_coords = get_patches_center_coordinates(num_patches_h, num_patches_w, dtype=torch.float32, device=pixel_values.device)
+        # Moves (and re-assigns) the buffer to the input's device on each call.
+        self.inv_freq = self.inv_freq.to(pixel_values.device)
+        angles = 2 * math.pi * patch_coords[:, :, None] * self.inv_freq[None, None, :]
+        angles = angles.flatten(1, 2)
+        # Repeat the angle vector twice along the last dimension.
+        angles = angles.tile(2)
+        cos = torch.cos(angles).to(dtype=pixel_values.dtype)
+        sin = torch.sin(angles).to(dtype=pixel_values.dtype)
+        return cos, sin


 class DINOv3ViTEmbeddings(nn.Module):
+    """Builds the token sequence ``[cls, register tokens, patch tokens]`` from an image batch."""
     def __init__(self, hidden_size, num_register_tokens, num_channels, patch_size, dtype, device, operations):
         super().__init__()
-        self.cls_token = nn.Parameter(torch.randn(1, 1, hidden_size, device=device, dtype=dtype))
-        self.mask_token = nn.Parameter(torch.zeros(1, 1, hidden_size, device=device, dtype=dtype))
+        # All learned tokens are allocated uninitialised (torch.empty); their
+        # values are expected to come from a loaded checkpoint.
+        self.cls_token = nn.Parameter(torch.empty(1, 1, hidden_size, device=device, dtype=dtype))
+        self.mask_token = nn.Parameter(torch.empty(1, 1, hidden_size, device=device, dtype=dtype))
         self.register_tokens = nn.Parameter(torch.empty(1, num_register_tokens, hidden_size, device=device, dtype=dtype))
         self.patch_embeddings = operations.Conv2d(
             num_channels, hidden_size, kernel_size=patch_size, stride=patch_size, device=device, dtype=dtype
         )

-    def forward(self, pixel_values: torch.Tensor, bool_masked_pos: torch.Tensor | None = None):
+    def forward(self, pixel_values, bool_masked_pos=None):
         batch_size = pixel_values.shape[0]
-        target_dtype = self.patch_embeddings.weight.dtype

-        patch_embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype))
+        # pixel_values is no longer cast to the conv weight dtype here.
+        patch_embeddings = self.patch_embeddings(pixel_values)
         patch_embeddings = patch_embeddings.flatten(2).transpose(1, 2)

         if bool_masked_pos is not None:
-            mask_token = self.mask_token.to(patch_embeddings.dtype)
+            # Positions flagged in bool_masked_pos are replaced by the mask token.
+            mask_token = comfy.ops.cast_to_input(self.mask_token, patch_embeddings)
             patch_embeddings = torch.where(bool_masked_pos.unsqueeze(-1), mask_token, patch_embeddings)

-        cls_token = self.cls_token.expand(batch_size, -1, -1)
-        register_tokens = self.register_tokens.expand(batch_size, -1, -1)
-        device = patch_embeddings.device
-        cls_token = cls_token.to(device)
-        register_tokens = register_tokens.to(device)
+        # Broadcast the learned tokens over the batch. cast_to_input presumably
+        # matches them to patch_embeddings' dtype/device -- TODO confirm.
+        cls_token = comfy.ops.cast_to_input(self.cls_token.expand(batch_size, -1, -1), patch_embeddings)
+        register_tokens = comfy.ops.cast_to_input(self.register_tokens.expand(batch_size, -1, -1), patch_embeddings)
         embeddings = torch.cat([cls_token, register_tokens, patch_embeddings], dim=1)
-
         return embeddings

+
 class DINOv3ViTLayer(nn.Module):
-
-    def __init__(self, hidden_size, layer_norm_eps, use_gated_mlp, mlp_bias, intermediate_size, num_attention_heads,
-                 device, dtype, operations):
+    def __init__(self, hidden_size, layer_norm_eps, use_gated_mlp, mlp_bias, intermediate_size,
+                 num_attention_heads, device, dtype, operations, gated_mlp_act="silu"):
        super().__init__()
-
        self.norm1 = operations.LayerNorm(hidden_size, eps=layer_norm_eps, device=device, dtype=dtype)
        self.attention = DINOv3ViTAttention(hidden_size, num_attention_heads, device=device, dtype=dtype, operations=operations)
        self.layer_scale1 = DINOv3ViTLayerScale(hidden_size, device=device, dtype=dtype, operations=None)

        self.norm2 = operations.LayerNorm(hidden_size, eps=layer_norm_eps, device=device, dtype=dtype)
-
        if use_gated_mlp:
-            self.mlp = DINOv3ViTGatedMLP(hidden_size, intermediate_size, mlp_bias, device=device, dtype=dtype, operations=operations)
+            self.mlp = DINOv3ViTGatedMLP(hidden_size, intermediate_size, mlp_bias, device=device, dtype=dtype, operations=operations, act=gated_mlp_act)
        else:
            self.mlp = DINOv3ViTMLP(hidden_size, intermediate_size=intermediate_size, mlp_bias=mlp_bias, device=device, dtype=dtype, operations=operations)
        self.layer_scale2 = DINOv3ViTLayerScale(hidden_size, device=device, dtype=dtype, operations=None)

-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: torch.Tensor | None = None,
-        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
-    ) -> torch.Tensor:
+    def forward(self, hidden_states, attention_mask=None, position_embeddings=None):
        residual = hidden_states
        hidden_states = self.norm1(hidden_states)
-        hidden_states = self.attention(
-            hidden_states,
-            attention_mask=attention_mask,
-            position_embeddings=position_embeddings,
-        )
+        hidden_states = self.attention(hidden_states, attention_mask=attention_mask, position_embeddings=position_embeddings)
        hidden_states = self.layer_scale1(hidden_states)
        hidden_states = hidden_states + residual

@ -221,18 +208,12 @@ class DINOv3ViTLayer(nn.Module):
        hidden_states = self.mlp(hidden_states)
        hidden_states = self.layer_scale2(hidden_states)
        hidden_states = hidden_states + residual
-
        return hidden_states


 class DINOv3ViTModel(nn.Module):
    def __init__(self, config, dtype, device, operations):
        super().__init__()
-        use_bf16 = comfy.model_management.should_use_bf16(device, prioritize_performance=True)
-        if dtype == torch.float16 and use_bf16:
-            dtype = torch.bfloat16
-        elif dtype == torch.float16 and not use_bf16:
-            dtype = torch.float32
        num_hidden_layers = config["num_hidden_layers"]
        hidden_size = config["hidden_size"]
        num_attention_heads = config["num_attention_heads"]
@ -242,45 +223,37 @@ class DINOv3ViTModel(nn.Module):
        num_channels = config["num_channels"]
        patch_size = config["patch_size"]
        rope_theta = config["rope_theta"]
+        use_gated_mlp = config.get("use_gated_mlp", False)
+        gated_mlp_act = config.get("gated_mlp_act", "silu")

        self.embeddings = DINOv3ViTEmbeddings(
-            hidden_size, num_register_tokens, num_channels=num_channels, patch_size=patch_size, dtype=dtype, device=device, operations=operations
+            hidden_size, num_register_tokens, num_channels=num_channels, patch_size=patch_size,
+            dtype=dtype, device=device, operations=operations
        )
        self.rope_embeddings = DINOv3ViTRopePositionEmbedding(
-            rope_theta, hidden_size, num_attention_heads, image_size=512, patch_size=patch_size, dtype=dtype, device=device
+            rope_theta, hidden_size, num_attention_heads, patch_size=patch_size, dtype=dtype, device=device
        )
-        self.layer = nn.ModuleList(
-            [DINOv3ViTLayer(hidden_size, layer_norm_eps, use_gated_mlp=False, mlp_bias=True,
-                            intermediate_size=intermediate_size,num_attention_heads = num_attention_heads,
-                            dtype=dtype, device=device, operations=operations)
+        self.layer = nn.ModuleList([
+            DINOv3ViTLayer(hidden_size, layer_norm_eps, use_gated_mlp=use_gated_mlp, mlp_bias=True,
+                           intermediate_size=intermediate_size, num_attention_heads=num_attention_heads,
+                           dtype=dtype, device=device, operations=operations, gated_mlp_act=gated_mlp_act)
            for _ in range(num_hidden_layers)])
        self.norm = operations.LayerNorm(hidden_size, eps=layer_norm_eps, dtype=dtype, device=device)

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

-    def forward(
-        self,
-        pixel_values: torch.Tensor,
-        bool_masked_pos: torch.Tensor | None = None,
-        **kwargs,
-    ):
-
-        pixel_values = pixel_values.to(self.embeddings.patch_embeddings.weight.dtype)
+    def forward(self, pixel_values, bool_masked_pos=None, **kwargs):
        hidden_states = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
        position_embeddings = self.rope_embeddings(pixel_values)

-        for i, layer_module in enumerate(self.layer):
-            hidden_states = layer_module(
-                hidden_states,
-                position_embeddings=position_embeddings,
-            )
+        for layer_module in self.layer:
+            hidden_states = layer_module(hidden_states, position_embeddings=position_embeddings)

        if kwargs.get("skip_norm_elementwise", False):
-            sequence_output= F.layer_norm(hidden_states, hidden_states.shape[-1:])
+            sequence_output = F.layer_norm(hidden_states, hidden_states.shape[-1:])
        else:
            norm = self.norm.to(hidden_states.device)
            sequence_output = norm(hidden_states)
        pooled_output = sequence_output[:, 0, :]
-
        return sequence_output, None, pooled_output, None
--- a/comfy/latent_formats.py
+++ b/comfy/latent_formats.py
@ -239,6 +239,16 @@ class Flux2(LatentFormat):
    def process_out(self, latent):
        return latent

+class TripoSplat(LatentFormat):
+    # Sequence latent of shape (B, 8192, 16); the camera token rides alongside
+    # as a second nested latent.
+    latent_channels = 16
+
+    # Both hooks are identity functions: the latent is passed through unchanged.
+    def process_in(self, latent):
+        return latent
+
+    def process_out(self, latent):
+        return latent
+
 class Mochi(LatentFormat):
    latent_channels = 12
    latent_dimensions = 3
@ -802,13 +812,15 @@ class ZImagePixelSpace(ChromaRadiance):
    """
    pass

-
 class HiDreamO1Pixel(ChromaRadiance):
    """Pixel-space latent format for HiDream-O1.
    No VAE — model patches/unpatches raw RGB internally with patch_size=32.
    """
    pass

+class PixelDiTPixel(ChromaRadiance):
+    """Latent format that inherits ChromaRadiance without overrides.
+
+    NOTE(review): presumably the pixel-space format for PixelDiT models, like
+    the sibling ``*Pixel`` classes -- confirm against the model definition.
+    """
+    pass
+
 class CogVideoX(LatentFormat):
    """Latent format for CogVideoX-2b (THUDM/CogVideoX-2b).

--- a/comfy/ldm/audio/dit.py
+++ b/comfy/ldm/audio/dit.py
@ -433,11 +433,11 @@ class Attention(nn.Module):
        if self.differential:
            q, q_diff = q.unbind(dim=1)
            k, k_diff = k.unbind(dim=1)
-            out      = optimized_attention(q,      k,      v, h, skip_reshape=True, transformer_options=transformer_options)
-            out_diff = optimized_attention(q_diff, k_diff, v, h, skip_reshape=True, transformer_options=transformer_options)
+            out      = optimized_attention(q,      k,      v, h, skip_reshape=True, low_precision_attention=False, transformer_options=transformer_options)
+            out_diff = optimized_attention(q_diff, k_diff, v, h, skip_reshape=True, low_precision_attention=False, transformer_options=transformer_options)
            out = out - out_diff
        else:
-            out = optimized_attention(q, k, v, h, skip_reshape=True, transformer_options=transformer_options)
+            out = optimized_attention(q, k, v, h, skip_reshape=True, low_precision_attention=False, transformer_options=transformer_options)

        out = self.to_out(out)

--- a/comfy/ldm/audio/vae_sa3.py
+++ b/comfy/ldm/audio/vae_sa3.py
@ -138,11 +138,11 @@ class Attention(nn.Module):
                k_diff = _apply_rotary_pos_emb(k_diff.float(), freqs).to(k_dtype)

        if self.differential:
-            out = (optimized_attention(q, k, v, h, mask=mask, skip_reshape=True)
-                   - optimized_attention(q_diff, k_diff, v, h, mask=mask, skip_reshape=True))
+            out = (optimized_attention(q, k, v, h, mask=mask, skip_reshape=True, low_precision_attention=False)
+                   - optimized_attention(q_diff, k_diff, v, h, mask=mask, skip_reshape=True, low_precision_attention=False))
            del q, k, v, q_diff, k_diff
        else:
-            out = optimized_attention(q, k, v, h, mask=mask, skip_reshape=True)
+            out = optimized_attention(q, k, v, h, mask=mask, skip_reshape=True, low_precision_attention=False)
            del q, k, v

        return self.to_out(out)
--- a/comfy/ldm/chroma_radiance/model.py
+++ b/comfy/ldm/chroma_radiance/model.py
@ -38,6 +38,8 @@ class ChromaRadianceParams(ChromaParams):
    # None means use the same dtype as the model.
    nerf_embedder_dtype: Optional[torch.dtype]
    use_x0: bool
+    # Use sequential txt_ids instead of zeros
+    use_sequential_txt_ids: bool

 class ChromaRadiance(Chroma):
    """
@ -162,6 +164,9 @@ class ChromaRadiance(Chroma):
        if params.use_x0:
            self.register_buffer("__x0__", torch.tensor([]))

+        if params.use_sequential_txt_ids:
+            self.register_buffer("__sequential__", torch.tensor([]))
+
    @property
    def _nerf_final_layer(self) -> nn.Module:
        if self.params.nerf_final_head_type == "linear":
@ -313,6 +318,9 @@ class ChromaRadiance(Chroma):
        img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0)
        img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)
        txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
+        # Radiance after 2026-05-22 uses sequential txt_ids instead of zeros
+        if params.use_sequential_txt_ids:
+            txt_ids[:, :, 0] = torch.arange(context.shape[1], device=x.device, dtype=x.dtype).unsqueeze(0).expand(bs, -1)

        img_out = self.forward_orig(
            img,
--- a/comfy/ldm/colormap.py
+++ b/comfy/ldm/colormap.py
@ -0,0 +1,25 @@
+"""Colormap utilities for depth and geometry visualisation."""
+
+from __future__ import annotations
+
+import torch
+
+
+def turbo(x: torch.Tensor) -> torch.Tensor:
+    """Anton Mikhailov polynomial approximation of the Turbo colormap.
+
+    Args:
+        x: Float tensor with values in [0, 1]. Values outside that range are
+            clamped before evaluation.
+
+    Returns:
+        RGB tensor of the same shape as ``x`` with a trailing size-3 dimension,
+        clamped to [0, 1].
+    """
+    x = x.clamp(0.0, 1.0)
+    # Powers of x shared by the three channel polynomials.
+    x2 = x * x
+    x3 = x2 * x
+    x4 = x2 * x2
+    x5 = x4 * x
+    # One degree-5 polynomial per channel, coefficients listed lowest order first.
+    r = 0.13572138 + 4.61539260*x - 42.66032258*x2 + 132.13108234*x3 - 152.94239396*x4 + 59.28637943*x5
+    g = 0.09140261 + 2.19418839*x + 4.84296658*x2 - 14.18503333*x3 +   4.27729857*x4 +  2.82956604*x5
+    b = 0.10667330 + 12.64194608*x - 60.58204836*x2 + 110.36276771*x3 - 89.90310912*x4 + 27.34824973*x5
+    # The final clamp keeps the result a valid colour if the fit leaves [0, 1].
+    return torch.stack([r, g, b], dim=-1).clamp(0.0, 1.0)
--- a/comfy/ldm/cosmos/predict2.py
+++ b/comfy/ldm/cosmos/predict2.py
@ -14,15 +14,7 @@ from torchvision import transforms
 import comfy.patcher_extension
 from comfy.ldm.modules.attention import optimized_attention
 import comfy.ldm.common_dit
-
-def apply_rotary_pos_emb(
-    t: torch.Tensor,
-    freqs: torch.Tensor,
-) -> torch.Tensor:
-    t_ = t.reshape(*t.shape[:-1], 2, -1).movedim(-2, -1).unsqueeze(-2).float()
-    t_out = freqs[..., 0] * t_[..., 0] + freqs[..., 1] * t_[..., 1]
-    t_out = t_out.movedim(-1, -2).reshape(*t.shape).type_as(t)
-    return t_out
+import comfy.quant_ops


 # ---------------------- Feed Forward Network -----------------------
@ -173,8 +165,7 @@ class Attention(nn.Module):
            k = self.k_norm(k)
            v = self.v_norm(v)
            if self.is_selfattn and rope_emb is not None:  # only apply to self-attention!
-                q = apply_rotary_pos_emb(q, rope_emb)
-                k = apply_rotary_pos_emb(k, rope_emb)
+                q, k = comfy.quant_ops.ck.apply_rope_split_half(q, k, rope_emb)
            return q, k, v

        q, k, v = apply_norm_and_rotary_pos_emb(q, k, v, rope_emb)
--- a/comfy/ldm/depth_anything_3/camera.py
+++ b/comfy/ldm/depth_anything_3/camera.py
@ -0,0 +1,177 @@
+"""Camera-token encoder and decoder for Depth Anything 3."""
+
+from __future__ import annotations
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from comfy.ldm.modules.attention import optimized_attention_for_device
+from .transform import affine_inverse, extri_intri_to_pose_encoding
+
+
+# -----------------------------------------------------------------------
+# Building blocks (mirror depth_anything_3.model.utils.{attention,block})
+# -----------------------------------------------------------------------
+
+
+class _Mlp(nn.Module):
+    """Two-layer perceptron computing ``fc2(gelu(fc1(x)))``.
+
+    The submodule names ``fc1`` / ``fc2`` are kept so the layout keeps matching
+    upstream ``utils.attention.Mlp``.
+
+    Args:
+        in_features: Width of the input features.
+        hidden_features: Width of the hidden layer; a falsy value (``None`` or
+            0) falls back to ``in_features``.
+        out_features: Width of the output; a falsy value falls back to
+            ``in_features``.
+        device, dtype: Forwarded to the linear layers.
+        operations: Namespace providing the ``Linear`` class to instantiate.
+    """
+
+    def __init__(self, in_features, hidden_features=None, out_features=None, *, device=None, dtype=None, operations=None):
+        super().__init__()
+        width_hidden = hidden_features if hidden_features else in_features
+        width_out = out_features if out_features else in_features
+        linear = operations.Linear
+        self.fc1 = linear(in_features, width_hidden, bias=True, device=device, dtype=dtype)
+        self.fc2 = linear(width_hidden, width_out, bias=True, device=device, dtype=dtype)
+
+    def forward(self, x):
+        """Apply the first projection, GELU, then the second projection."""
+        hidden = self.fc1(x)
+        activated = F.gelu(hidden)
+        return self.fc2(activated)
+
+
+class _LayerScale(nn.Module):
+    """Learnable per-channel multiplier (``x * gamma``). Matches upstream LayerScale.
+
+    ``gamma`` is allocated with ``torch.empty`` and is therefore uninitialised
+    until values are assigned to it (presumably from a checkpoint).
+    """
+
+    def __init__(self, dim, *, device=None, dtype=None):
+        super().__init__()
+        uninitialised = torch.empty(dim, device=device, dtype=dtype)
+        self.gamma = nn.Parameter(uninitialised)
+
+    def forward(self, x):
+        """Scale ``x`` by ``gamma`` after moving gamma to ``x``'s dtype and device."""
+        scale = self.gamma.to(dtype=x.dtype, device=x.device)
+        return x * scale
+
+
+class _Attention(nn.Module):
+    """Multi-head self-attention with a single fused QKV projection.
+
+    Mirrors upstream utils.attention.Attention; the parameter layout matches the
+    HF safetensors (attn.qkv.{weight,bias} and attn.proj.{weight,bias}).
+
+    Args:
+        dim: Token width; must be divisible by ``num_heads``.
+        num_heads: Number of attention heads.
+        device, dtype: Forwarded to the linear layers.
+        operations: Namespace providing the ``Linear`` class to instantiate.
+    """
+
+    def __init__(self, dim, num_heads, *, device=None, dtype=None, operations=None):
+        super().__init__()
+        assert dim % num_heads == 0
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        linear = operations.Linear
+        self.qkv = linear(dim, dim * 3, bias=True, device=device, dtype=dtype)
+        self.proj = linear(dim, dim, bias=True, device=device, dtype=dtype)
+
+    def forward(self, x):
+        """Attend over the token axis of ``x`` with shape (B, N, C)."""
+        batch, tokens, channels = x.shape
+        fused = self.qkv(x).reshape(batch, tokens, 3, channels)
+        # Axis 2 indexes query / key / value; each slice is (B, N, C).
+        query, key, value = fused.unbind(2)
+        attend = optimized_attention_for_device(x.device, small_input=True)
+        attended = attend(query, key, value, heads=self.num_heads)
+        return self.proj(attended)
+
+
+class _Block(nn.Module):
+    """Pre-norm transformer block with optional LayerScale on both residual branches.
+
+    Used by :class:`CameraEnc`. Layout follows upstream utils.block.Block.
+
+    Args:
+        dim: Token width.
+        num_heads: Number of attention heads.
+        mlp_ratio: Hidden width of the MLP as a multiple of ``dim``.
+        init_values: When truthy, both branches are scaled by a ``_LayerScale``;
+            otherwise the scaling is an identity. The numeric value itself is
+            not used to initialise anything here.
+        device, dtype, operations: Forwarded to the submodules.
+    """
+
+    def __init__(self, dim, num_heads, mlp_ratio=4, init_values=0.01, *, device=None, dtype=None, operations=None):
+        super().__init__()
+        use_scale = bool(init_values)
+        placement = dict(device=device, dtype=dtype)
+        # Submodules are registered in the order norm1, attn, ls1, norm2, mlp, ls2.
+        self.norm1 = operations.LayerNorm(dim, **placement)
+        self.attn = _Attention(dim, num_heads, operations=operations, **placement)
+        self.ls1 = _LayerScale(dim, **placement) if use_scale else nn.Identity()
+        self.norm2 = operations.LayerNorm(dim, **placement)
+        self.mlp = _Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), operations=operations, **placement)
+        self.ls2 = _LayerScale(dim, **placement) if use_scale else nn.Identity()
+
+    def forward(self, x):
+        """Apply the attention branch, then the MLP branch, each as a residual."""
+        attn_branch = self.attn(self.norm1(x))
+        x = x + self.ls1(attn_branch)
+        mlp_branch = self.mlp(self.norm2(x))
+        return x + self.ls2(mlp_branch)
+
+
+class CameraEnc(nn.Module):
+    """Turn per-view (extrinsics, intrinsics) into one camera token per view.
+
+    A pose-encoding vector (``dim_in`` wide, 9 by default) is lifted to
+    ``dim_out`` by a small MLP, normalised, passed through ``trunk_depth``
+    transformer blocks and normalised again. According to the original notes
+    the ``(B, S, embed_dim)`` result is injected at block ``alt_start`` of the
+    DINOv2 backbone in place of the cls token.
+
+    Parameters mirror the upstream ``cam_enc.py`` so HF weights load directly.
+    Extra keyword arguments are accepted and ignored.
+    """
+
+    def __init__(
+        self,
+        dim_out: int = 1024,
+        dim_in: int = 9,
+        trunk_depth: int = 4,
+        target_dim: int = 9,
+        num_heads: int = 16,
+        mlp_ratio: int = 4,
+        init_values: float = 0.01,
+        *,
+        device=None, dtype=None, operations=None,
+        **_kwargs,
+    ):
+        super().__init__()
+        self.target_dim = target_dim
+        self.trunk_depth = trunk_depth
+        shared = dict(device=device, dtype=dtype, operations=operations)
+        blocks = [
+            _Block(dim_out, num_heads=num_heads, mlp_ratio=mlp_ratio, init_values=init_values, **shared)
+            for _ in range(trunk_depth)
+        ]
+        # Registration order: trunk, token_norm, trunk_norm, pose_branch.
+        self.trunk = nn.Sequential(*blocks)
+        self.token_norm = operations.LayerNorm(dim_out, device=device, dtype=dtype)
+        self.trunk_norm = operations.LayerNorm(dim_out, device=device, dtype=dtype)
+        self.pose_branch = _Mlp(in_features=dim_in, hidden_features=dim_out // 2, out_features=dim_out, **shared)
+
+    def forward(self, extrinsics: torch.Tensor, intrinsics: torch.Tensor,
+                image_size_hw) -> torch.Tensor:
+        """Encode camera parameters into ``(B, S, dim_out)`` tokens."""
+        # NOTE(review): the inverse is presumably camera-to-world, as the original
+        # local name ``c2ws`` suggested -- confirm against ``affine_inverse``.
+        inverted = affine_inverse(extrinsics)
+        pose = extri_intri_to_pose_encoding(inverted, intrinsics, image_size_hw)
+        # Cast the encoding to the dtype of the first projection's weights.
+        weight_dtype = self.pose_branch.fc1.weight.dtype
+        tokens = self.token_norm(self.pose_branch(pose.to(weight_dtype)))
+        return self.trunk_norm(self.trunk(tokens))
+
+
+class CameraDec(nn.Module):
+    """Regress a 9-D pose encoding from the final camera token.
+
+    Output layout: ``[T(3), quat_xyzw(4), fov_h, fov_w]``. Translation always
+    comes from the network. Quaternion and FoV are predicted unless
+    ``camera_encoding`` is given, in which case they are copied from it (the
+    original notes describe this as a training-time path with GT cameras that
+    is not exercised at inference here).
+
+    Parameters mirror the upstream ``cam_dec.py`` so HF weights load directly.
+    Extra keyword arguments are accepted and ignored.
+    """
+
+    def __init__(self, dim_in: int = 1536,
+                 *, device=None, dtype=None, operations=None, **_kwargs):
+        super().__init__()
+        placement = dict(device=device, dtype=dtype)
+        linear = operations.Linear
+        # Registration order: backbone, fc_t, fc_qvec, fc_fov.
+        self.backbone = nn.Sequential(
+            linear(dim_in, dim_in, **placement),
+            nn.ReLU(),
+            linear(dim_in, dim_in, **placement),
+            nn.ReLU(),
+        )
+        self.fc_t = linear(dim_in, 3, **placement)
+        self.fc_qvec = linear(dim_in, 4, **placement)
+        # The trailing ReLU keeps the predicted FoV values non-negative.
+        self.fc_fov = nn.Sequential(linear(dim_in, 2, **placement), nn.ReLU())
+
+    def forward(self, feat: torch.Tensor,
+                camera_encoding: "torch.Tensor | None" = None) -> torch.Tensor:
+        """Decode ``(B, N, dim_in)`` cam tokens into ``(B, N, 9)`` pose enc."""
+        batch, views = feat.shape[:2]
+        hidden = self.backbone(feat.reshape(batch * views, -1))
+        # The regression heads are fed a float32 copy of the features.
+        hidden32 = hidden.float()
+        translation = self.fc_t(hidden32).reshape(batch, views, 3)
+        if camera_encoding is not None:
+            rotation = camera_encoding[..., 3:7]
+            fov = camera_encoding[..., -2:]
+        else:
+            rotation = self.fc_qvec(hidden32).reshape(batch, views, 4)
+            fov = self.fc_fov(hidden32).reshape(batch, views, 2)
+        return torch.cat([translation, rotation, fov], dim=-1)
--- a/comfy/ldm/depth_anything_3/dpt.py
+++ b/comfy/ldm/depth_anything_3/dpt.py
@ -0,0 +1,489 @@
+"""DPT / DualDPT heads for Depth Anything 3."""
+
+from __future__ import annotations
+
+from typing import List, Optional, Sequence, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Permute(nn.Module):
+    """Module form of ``Tensor.permute`` with a fixed axis order.
+
+    Lets an axis reordering sit inside an ``nn.Sequential``.
+    """
+
+    def __init__(self, dims: Tuple[int, ...]):
+        super().__init__()
+        # Target axis order, applied unchanged on every call.
+        self.dims = dims
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Return ``x`` with its axes reordered according to ``self.dims``."""
+        axis_order = self.dims
+        return x.permute(*axis_order)
+
+
+def _custom_interpolate(
+    x: torch.Tensor,
+    size: Optional[Tuple[int, int]] = None,
+    scale_factor: Optional[float] = None,
+    mode: str = "bilinear",
+    align_corners: bool = True,
+) -> torch.Tensor:
+    """``F.interpolate`` that splits very large outputs along the batch axis.
+
+    Args:
+        x: Input of shape (B, C, h, w).
+        size: Target (height, width). When ``None`` it is derived from
+            ``scale_factor`` by truncating ``h * scale_factor`` and
+            ``w * scale_factor`` to int.
+        scale_factor: Only used when ``size`` is ``None``; required then.
+        mode, align_corners: Forwarded to ``F.interpolate``.
+
+    Returns:
+        The resized tensor. On the chunked path the result is made contiguous.
+    """
+    # Output element budget (1.5 * 2**30); presumably chosen to keep a single
+    # interpolate call within 32-bit indexing limits -- TODO confirm.
+    element_budget = 1610612736
+    if size is None:
+        assert scale_factor is not None
+        src_h, src_w = x.shape[-2], x.shape[-1]
+        size = (int(src_h * scale_factor), int(src_w * scale_factor))
+    out_elements = size[0] * size[1] * x.shape[0] * x.shape[1]
+    if out_elements <= element_budget:
+        return F.interpolate(x, size=size, mode=mode, align_corners=align_corners)
+    # Too large for one call: resize batch slices separately and re-join them.
+    n_chunks = (out_elements // element_budget) + 1
+    pieces = []
+    for part in torch.chunk(x, chunks=n_chunks, dim=0):
+        pieces.append(F.interpolate(part, size=size, mode=mode, align_corners=align_corners))
+    return torch.cat(pieces, dim=0).contiguous()
+
+
+def _create_uv_grid(width: int, height: int, aspect_ratio: float, dtype, device) -> torch.Tensor:
+    """Build a centred, aspect-aware UV coordinate grid.
+
+    The half-extents are ``aspect_ratio / d`` along x and ``1 / d`` along y
+    with ``d = sqrt(aspect_ratio**2 + 1)``. Each axis is then shrunk by
+    ``(n - 1) / n`` so the outermost samples sit inside the nominal span.
+
+    Returns:
+        Tensor of shape (height, width, 2) holding (u, v) per cell.
+    """
+    diagonal = (aspect_ratio ** 2 + 1.0) ** 0.5
+    half_w = aspect_ratio / diagonal
+    half_h = 1.0 / diagonal
+    # Symmetric end points of the sampled range on each axis.
+    x_edge = half_w * (width - 1) / width
+    y_edge = half_h * (height - 1) / height
+    xs = torch.linspace(-x_edge, x_edge, steps=width, dtype=dtype, device=device)
+    ys = torch.linspace(-y_edge, y_edge, steps=height, dtype=dtype, device=device)
+    # "xy" indexing yields (height, width) grids: u varies along columns.
+    grid_u, grid_v = torch.meshgrid(xs, ys, indexing="xy")
+    return torch.stack((grid_u, grid_v), dim=-1)
+
+
+def _make_sincos_pos_embed(embed_dim: int, pos: torch.Tensor, omega_0: float = 100.0) -> torch.Tensor:
+    """Sine/cosine embedding of a set of scalar positions.
+
+    Uses ``embed_dim // 2`` frequencies ``omega_0 ** (-k / (embed_dim / 2))``.
+
+    Args:
+        embed_dim: Output width; sines fill the first half, cosines the second.
+        pos: Positions of any shape; flattened to M values.
+        omega_0: Base of the frequency progression.
+
+    Returns:
+        float32 tensor of shape (M, 2 * (embed_dim // 2)).
+    """
+    n_freqs = embed_dim // 2
+    index = torch.arange(n_freqs, dtype=torch.float32, device=pos.device)
+    exponent = index / (embed_dim / 2.0)
+    freqs = 1.0 / omega_0 ** exponent
+    flat_pos = pos.reshape(-1)
+    # Outer product: one row per position, one column per frequency.
+    angles = torch.einsum("m,d->md", flat_pos, freqs)
+    return torch.cat([angles.sin(), angles.cos()], dim=1).float()
+
+
+def _position_grid_to_embed(pos_grid: torch.Tensor, embed_dim: int, omega_0: float = 100.0) -> torch.Tensor:
+    """Embed an (H, W, 2) coordinate grid into (H, W, embed_dim) features.
+
+    Each of the two coordinates gets ``embed_dim // 2`` sine/cosine channels;
+    the first-coordinate channels come first.
+    """
+    rows, cols, _ = pos_grid.shape
+    coords = pos_grid.reshape(-1, 2)
+    per_axis = embed_dim // 2
+    halves = [
+        _make_sincos_pos_embed(per_axis, coords[:, axis], omega_0=omega_0)
+        for axis in (0, 1)
+    ]
+    return torch.cat(halves, dim=-1).view(rows, cols, embed_dim)
+
+
+def _add_pos_embed(x: torch.Tensor, W: int, H: int, ratio: float = 0.1) -> torch.Tensor:
+    """Add a stateless UV positional embedding to a feature map (B, C, h, w).
+
+    Args:
+        x: Feature map; its own spatial size defines the embedding grid.
+        W, H: Only their ratio ``W / H`` is used, as the grid aspect ratio.
+        ratio: Scale applied to the embedding before it is added.
+    """
+    batch, channels = x.shape[0], x.shape[1]
+    grid_h, grid_w = x.shape[-2], x.shape[-1]
+    uv = _create_uv_grid(grid_w, grid_h, aspect_ratio=W / H, dtype=x.dtype, device=x.device)
+    embed = _position_grid_to_embed(uv, channels) * ratio
+    # (h, w, C) -> (1, C, h, w), broadcast over the batch, cast back to x's dtype.
+    embed = embed.permute(2, 0, 1)[None]
+    embed = embed.expand(batch, -1, -1, -1).to(dtype=x.dtype)
+    return x + embed
+
+
+def _apply_activation(x: torch.Tensor, activation: str) -> torch.Tensor:
+    """Apply the activation selected by name (case-insensitive).
+
+    Recognised names: ``exp``, ``expp1`` (exp + 1), ``expm1``, ``relu``,
+    ``sigmoid``, ``softplus`` and ``tanh``. A falsy name is treated as
+    ``"linear"``; ``"linear"`` and any unrecognised name return ``x`` unchanged.
+    """
+    handlers = {
+        "exp": torch.exp,
+        "expp1": lambda t: torch.exp(t) + 1,
+        "expm1": torch.expm1,
+        "relu": torch.relu,
+        "sigmoid": torch.sigmoid,
+        "softplus": F.softplus,
+        "tanh": torch.tanh,
+    }
+    key = (activation or "linear").lower()
+    handler = handlers.get(key)
+    if handler is None:
+        return x
+    return handler(x)
+
+
+# -----------------------------------------------------------------------------
+# Fusion building blocks
+# -----------------------------------------------------------------------------
+
+
+class ResidualConvUnit(nn.Module):
+    """Residual unit: ``x + conv2(relu(conv1(relu(x))))`` with 3x3 convolutions.
+
+    Both convolutions keep the channel count and spatial size (stride 1,
+    padding 1).
+    """
+
+    def __init__(self, features: int, device=None, dtype=None, operations=None):
+        super().__init__()
+        conv_kwargs = dict(bias=True, device=device, dtype=dtype)
+        self.conv1 = operations.Conv2d(features, features, 3, 1, 1, **conv_kwargs)
+        self.conv2 = operations.Conv2d(features, features, 3, 1, 1, **conv_kwargs)
+        # Non-inplace ReLU so the input stays intact for the skip connection.
+        self.activation = nn.ReLU(inplace=False)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Return the residual branch added to the untouched input."""
+        branch = self.conv1(self.activation(x))
+        branch = self.conv2(self.activation(branch))
+        return branch + x
+
+
+class FeatureFusionBlock(nn.Module):
+    """Fuse a coarse feature map with an optional skip input, then upsample.
+
+    Args:
+        features: Channel count of every input and of the output.
+        has_residual: When true, a second positional input is refined by
+            ``resConfUnit1`` and added to the first before fusion.
+        align_corners: Forwarded to the bilinear upsampling.
+        device, dtype, operations: Forwarded to the convolution layers.
+    """
+
+    def __init__(self, features: int, has_residual: bool = True, align_corners: bool = True, device=None, dtype=None, operations=None):
+        super().__init__()
+        self.align_corners = align_corners
+        self.has_residual = has_residual
+
+        def new_unit():
+            return ResidualConvUnit(features, device=device, dtype=dtype, operations=operations)
+
+        # Without a residual input the first unit is left as a plain ``None``.
+        self.resConfUnit1 = new_unit() if has_residual else None
+        self.resConfUnit2 = new_unit()
+        self.out_conv = operations.Conv2d(features, features, 1, 1, 0, bias=True, device=device, dtype=dtype)
+
+    def forward(self, *xs: torch.Tensor, size: Optional[Tuple[int, int]] = None) -> torch.Tensor:
+        """Fuse ``xs[0]`` (and ``xs[1]`` if used) and upsample the result.
+
+        Upsamples to ``size`` when it is given, otherwise by a factor of 2.
+        """
+        fused = xs[0]
+        skip_usable = self.has_residual and len(xs) > 1 and self.resConfUnit1 is not None
+        if skip_usable:
+            fused = fused + self.resConfUnit1(xs[1])
+        fused = self.resConfUnit2(fused)
+        fused = _custom_interpolate(
+            fused,
+            size=size,
+            scale_factor=2.0 if size is None else None,
+            mode="bilinear",
+            align_corners=self.align_corners,
+        )
+        return self.out_conv(fused)
+
+
+class _Scratch(nn.Module):
+    """Container that mirrors upstream ``scratch`` attribute layout.
+
+    It defines no parameters or forward logic of its own; submodules are
+    attached to instances as attributes after construction.
+    """
+
+
+def _make_scratch(in_shape: List[int], out_shape: int, device=None, dtype=None, operations=None) -> _Scratch:
+    """Create a ``_Scratch`` holding four bias-free 3x3 channel-reduction convs.
+
+    Args:
+        in_shape: Input channel counts; entries 0..3 feed ``layer1_rn`` ..
+            ``layer4_rn`` respectively.
+        out_shape: Output channel count shared by all four convolutions.
+        device, dtype: Forwarded to each convolution.
+        operations: Namespace providing the ``Conv2d`` class to instantiate.
+    """
+    container = _Scratch()
+    for level in range(4):
+        reducer = operations.Conv2d(in_shape[level], out_shape, 3, 1, 1, bias=False, device=device, dtype=dtype)
+        setattr(container, f"layer{level + 1}_rn", reducer)
+    return container
+
+
+def _make_fusion_block(features: int, has_residual: bool = True, device=None, dtype=None, operations=None) -> FeatureFusionBlock:
+    """Build a ``FeatureFusionBlock`` with ``align_corners`` fixed to ``True``."""
+    block_kwargs = dict(
+        has_residual=has_residual,
+        align_corners=True,
+        device=device,
+        dtype=dtype,
+        operations=operations,
+    )
+    return FeatureFusionBlock(features, **block_kwargs)
+
+
+# -----------------------------------------------------------------------------
+# DPT (single head + optional sky head) -- used by DA3Mono/Metric
+# -----------------------------------------------------------------------------
+
+
+class DPT(nn.Module):
+    """Single-head DPT used by DA3Mono-Large and DA3Metric-Large.
+
+    Four backbone feature levels are projected to ``out_channels`` widths,
+    resized to a 4-level pyramid (x4 up, x2 up, unchanged, x2 down), fused
+    top-down by the ``refinenet*`` blocks and decoded into a main prediction,
+    an optional confidence map and an optional sky map.
+
+    Args:
+        dim_in: Channel width of the incoming backbone tokens.
+        patch_size: Backbone patch size; ``H // patch_size`` by ``W // patch_size``
+            is the token grid the tokens are reshaped to.
+        output_dim: Channels of the final projection. When greater than 1 the
+            last channel is treated as confidence.
+        activation: Activation name applied to the main prediction.
+        conf_activation: Activation name applied to the confidence channel.
+        features: Channel width used inside the fusion pyramid.
+        out_channels: Per-level widths of the 1x1 projections (four entries).
+        pos_embed: Add a UV positional embedding after each projection and to
+            the fused map.
+        down_ratio: The output resolution is the patch-aligned input size
+            divided by this value.
+        head_name: Key of the main prediction in the output dict.
+        use_sky_head: Build and run the additional sky head.
+        sky_name: Key of the sky prediction in the output dict.
+        sky_activation: ``"sigmoid"`` or ``"relu"``; any other value returns
+            the raw sky logits.
+        norm_type: ``"layer"`` applies a LayerNorm to the tokens; any other
+            value uses an identity.
+        device, dtype, operations: Forwarded to every layer constructor.
+    """
+
+    def __init__(
+        self,
+        dim_in: int,
+        patch_size: int = 14,
+        output_dim: int = 1,
+        activation: str = "exp",
+        conf_activation: str = "expp1",
+        features: int = 256,
+        out_channels: Sequence[int] = (256, 512, 1024, 1024),
+        pos_embed: bool = False,
+        down_ratio: int = 1,
+        head_name: str = "depth",
+        use_sky_head: bool = True,
+        sky_name: str = "sky",
+        sky_activation: str = "relu",
+        norm_type: str = "idt",
+        device=None, dtype=None, operations=None,
+    ):
+        super().__init__()
+        self.patch_size = patch_size
+        self.activation = activation
+        self.conf_activation = conf_activation
+        self.pos_embed = pos_embed
+        self.down_ratio = down_ratio
+        self.head_main = head_name
+        self.sky_name = sky_name
+        self.out_dim = output_dim
+        # A confidence channel exists only when more than one channel is predicted.
+        self.has_conf = output_dim > 1
+        self.use_sky_head = use_sky_head
+        self.sky_activation = sky_activation
+        # Which entry of ``feats`` feeds each of the four pyramid stages.
+        self.intermediate_layer_idx: Tuple[int, int, int, int] = (0, 1, 2, 3)
+
+        if norm_type == "layer":
+            self.norm = operations.LayerNorm(dim_in, device=device, dtype=dtype)
+        else:
+            self.norm = nn.Identity()
+
+        out_channels = list(out_channels)
+        # 1x1 projections from the token width to each level's width.
+        self.projects = nn.ModuleList([
+            operations.Conv2d(dim_in, oc, kernel_size=1, stride=1, padding=0, device=device, dtype=dtype)
+            for oc in out_channels
+        ])
+        # Spatial rescaling per level: x4 up, x2 up, unchanged, x2 down.
+        self.resize_layers = nn.ModuleList([
+            operations.ConvTranspose2d(out_channels[0], out_channels[0], kernel_size=4, stride=4, padding=0, device=device, dtype=dtype),
+            operations.ConvTranspose2d(out_channels[1], out_channels[1], kernel_size=2, stride=2, padding=0, device=device, dtype=dtype),
+            nn.Identity(),
+            operations.Conv2d(out_channels[3], out_channels[3], kernel_size=3, stride=2, padding=1, device=device, dtype=dtype),
+        ])
+
+        self.scratch = _make_scratch(out_channels, features, device=device, dtype=dtype, operations=operations)
+        self.scratch.refinenet1 = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
+        self.scratch.refinenet2 = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
+        self.scratch.refinenet3 = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
+        # The coarsest level starts the chain, so it has no skip input to merge.
+        self.scratch.refinenet4 = _make_fusion_block(features, has_residual=False, device=device, dtype=dtype, operations=operations)
+
+        head_features_1 = features
+        head_features_2 = 32
+        self.scratch.output_conv1 = operations.Conv2d(
+            head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1,
+            device=device, dtype=dtype,
+        )
+        self.scratch.output_conv2 = nn.Sequential(
+            operations.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1, device=device, dtype=dtype),
+            nn.ReLU(inplace=False),
+            operations.Conv2d(head_features_2, output_dim, kernel_size=1, stride=1, padding=0, device=device, dtype=dtype),
+        )
+
+        if self.use_sky_head:
+            # Same shape as the main projection but with a single output channel.
+            self.scratch.sky_output_conv2 = nn.Sequential(
+                operations.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1, device=device, dtype=dtype),
+                nn.ReLU(inplace=False),
+                operations.Conv2d(head_features_2, 1, kernel_size=1, stride=1, padding=0, device=device, dtype=dtype),
+            )
+
+    def forward(self, feats: List[torch.Tensor], H: int, W: int, patch_start_idx: int = 0, **_kwargs) -> dict:
+        """Decode backbone features into per-view prediction maps.
+
+        Args:
+            feats: Sequence of feature levels; ``feats[i][0]`` is the token
+                tensor of level ``i`` with shape (B, S, N, C).
+            H, W: Input image height and width; they define the token grid
+                and the aspect ratio of the positional embedding.
+            patch_start_idx: Tokens before this index are dropped before the
+                tokens are reshaped to the patch grid.
+            **_kwargs: Accepted and ignored.
+
+        Returns:
+            Dict with the main prediction under ``head_name``, plus
+            ``f"{head_name}_conf"`` when ``output_dim > 1`` and ``sky_name``
+            when the sky head is enabled. Every value is viewed with leading
+            (B, S) dimensions.
+        """
+        # feats[i][0] is the patch-token tensor with shape (B, S, N_patch, C)
+        B, S, N, C = feats[0][0].shape
+        feats_flat = [feat[0].reshape(B * S, N, C) for feat in feats]
+
+        # Token grid size implied by the input resolution.
+        ph, pw = H // self.patch_size, W // self.patch_size
+        resized = []
+        for stage_idx, take_idx in enumerate(self.intermediate_layer_idx):
+            x = feats_flat[take_idx][:, patch_start_idx:]
+            x = self.norm(x)
+            # (B*S, N, C) tokens -> (B*S, C, ph, pw) feature map.
+            x = x.permute(0, 2, 1).contiguous().reshape(B * S, C, ph, pw)
+            x = self.projects[stage_idx](x)
+            if self.pos_embed:
+                x = _add_pos_embed(x, W, H)
+            x = self.resize_layers[stage_idx](x)
+            resized.append(x)
+
+        l1_rn = self.scratch.layer1_rn(resized[0])
+        l2_rn = self.scratch.layer2_rn(resized[1])
+        l3_rn = self.scratch.layer3_rn(resized[2])
+        l4_rn = self.scratch.layer4_rn(resized[3])
+
+        # Top-down fusion: each step upsamples to the next finer level's size;
+        # the last step uses the fusion block's default x2 upsampling.
+        out = self.scratch.refinenet4(l4_rn, size=l3_rn.shape[2:])
+        out = self.scratch.refinenet3(out, l3_rn, size=l2_rn.shape[2:])
+        out = self.scratch.refinenet2(out, l2_rn, size=l1_rn.shape[2:])
+        out = self.scratch.refinenet1(out, l1_rn)
+
+        # Output resolution: patch-aligned input size divided by down_ratio.
+        h_out = int(ph * self.patch_size / self.down_ratio)
+        w_out = int(pw * self.patch_size / self.down_ratio)
+
+        fused = self.scratch.output_conv1(out)
+        fused = _custom_interpolate(fused, (h_out, w_out), mode="bilinear", align_corners=True)
+        if self.pos_embed:
+            fused = _add_pos_embed(fused, W, H)
+        feat = fused
+
+        main_logits = self.scratch.output_conv2(feat)
+        outs = {}
+        if self.has_conf:
+            # Channels-last view: every channel but the last is the prediction,
+            # the last channel is the confidence.
+            fmap = main_logits.permute(0, 2, 3, 1)
+            pred = _apply_activation(fmap[..., :-1], self.activation)
+            conf = _apply_activation(fmap[..., -1], self.conf_activation)
+            outs[self.head_main] = pred.squeeze(-1).view(B, S, *pred.shape[1:-1])
+            outs[f"{self.head_main}_conf"] = conf.view(B, S, *conf.shape[1:])
+        else:
+            pred = _apply_activation(main_logits, self.activation)
+            outs[self.head_main] = pred.squeeze(1).view(B, S, *pred.shape[2:])
+
+        if self.use_sky_head:
+            # The sky head shares the fused features with the main head.
+            sky_logits = self.scratch.sky_output_conv2(feat)
+            if self.sky_activation.lower() == "sigmoid":
+                sky = torch.sigmoid(sky_logits)
+            elif self.sky_activation.lower() == "relu":
+                sky = F.relu(sky_logits)
+            else:
+                sky = sky_logits
+            outs[self.sky_name] = sky.squeeze(1).view(B, S, *sky.shape[2:])
+
+        return outs
+
+
+# -----------------------------------------------------------------------------
+# DualDPT (depth + auxiliary "ray" head) -- used by DA3-Small / DA3-Base
+# -----------------------------------------------------------------------------
+
+
+class DualDPT(nn.Module):
+    """Two-head DPT used by DA3-Small / DA3-Base.
+
+    The main head predicts depth plus a confidence channel. An auxiliary "ray"
+    head with its own fusion chain is built as well, but it only runs when
+    ``enable_aux`` is set to ``True`` on the instance.
+
+    Args:
+        dim_in: Channel width of the incoming backbone tokens.
+        patch_size: Backbone patch size; ``H // patch_size`` by ``W // patch_size``
+            is the token grid the tokens are reshaped to.
+        output_dim: Channels of the main projection; the last channel is the
+            confidence.
+        activation: Activation name applied to the depth prediction.
+        conf_activation: Activation name applied to both confidence channels.
+        features: Channel width used inside the fusion pyramids.
+        out_channels: Per-level widths of the 1x1 projections (four entries).
+        pos_embed: Add a UV positional embedding after each projection and to
+            the maps fed to the final projections.
+        down_ratio: The output resolution is the patch-aligned input size
+            divided by this value.
+        aux_pyramid_levels: Number of auxiliary pre-head / projection modules
+            to build.
+        aux_out1_conv_num: Stored only; the auxiliary pre-head is always built
+            with five convolutions.
+        head_names: Output keys for the (main, auxiliary) predictions.
+        device, dtype, operations: Forwarded to every layer constructor.
+    """
+
+    def __init__(
+        self,
+        dim_in: int,
+        patch_size: int = 14,
+        output_dim: int = 2,
+        activation: str = "exp",
+        conf_activation: str = "expp1",
+        features: int = 256,
+        out_channels: Sequence[int] = (256, 512, 1024, 1024),
+        pos_embed: bool = True,
+        down_ratio: int = 1,
+        aux_pyramid_levels: int = 4,
+        aux_out1_conv_num: int = 5,
+        head_names: Tuple[str, str] = ("depth", "ray"),
+        device=None, dtype=None, operations=None,
+    ):
+        super().__init__()
+        self.patch_size = patch_size
+        self.activation = activation
+        self.conf_activation = conf_activation
+        self.pos_embed = pos_embed
+        self.down_ratio = down_ratio
+        self.aux_levels = aux_pyramid_levels
+        self.aux_out1_conv_num = aux_out1_conv_num
+        self.head_main, self.head_aux = head_names
+        # Which entry of ``feats`` feeds each of the four pyramid stages.
+        self.intermediate_layer_idx: Tuple[int, int, int, int] = (0, 1, 2, 3)
+        # Toggle the auxiliary ray branch at runtime. Default off (mono path).
+        # DepthAnything3Net flips this on when running multi-view + ray-pose.
+        self.enable_aux: bool = False
+
+        self.norm = operations.LayerNorm(dim_in, device=device, dtype=dtype)
+        out_channels = list(out_channels)
+        # 1x1 projections from the token width to each level's width.
+        self.projects = nn.ModuleList([
+            operations.Conv2d(dim_in, oc, kernel_size=1, stride=1, padding=0, device=device, dtype=dtype)
+            for oc in out_channels
+        ])
+        # Spatial rescaling per level: x4 up, x2 up, unchanged, x2 down.
+        self.resize_layers = nn.ModuleList([
+            operations.ConvTranspose2d(out_channels[0], out_channels[0], kernel_size=4, stride=4, padding=0, device=device, dtype=dtype),
+            operations.ConvTranspose2d(out_channels[1], out_channels[1], kernel_size=2, stride=2, padding=0, device=device, dtype=dtype),
+            nn.Identity(),
+            operations.Conv2d(out_channels[3], out_channels[3], kernel_size=3, stride=2, padding=1, device=device, dtype=dtype),
+        ])
+
+        self.scratch = _make_scratch(out_channels, features, device=device, dtype=dtype, operations=operations)
+        # Main fusion chain
+        self.scratch.refinenet1 = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
+        self.scratch.refinenet2 = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
+        self.scratch.refinenet3 = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
+        self.scratch.refinenet4 = _make_fusion_block(features, has_residual=False, device=device, dtype=dtype, operations=operations)
+        # Auxiliary fusion chain (separate copies)
+        self.scratch.refinenet1_aux = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
+        self.scratch.refinenet2_aux = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
+        self.scratch.refinenet3_aux = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
+        self.scratch.refinenet4_aux = _make_fusion_block(features, has_residual=False, device=device, dtype=dtype, operations=operations)
+
+        head_features_1 = features
+        head_features_2 = 32
+
+        # Main head neck + final projection
+        self.scratch.output_conv1 = operations.Conv2d(
+            head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1,
+            device=device, dtype=dtype,
+        )
+        self.scratch.output_conv2 = nn.Sequential(
+            operations.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1, device=device, dtype=dtype),
+            nn.ReLU(inplace=False),
+            operations.Conv2d(head_features_2, output_dim, kernel_size=1, stride=1, padding=0, device=device, dtype=dtype),
+        )
+
+        # Aux pre-head per level (multi-level pyramid)
+        self.scratch.output_conv1_aux = nn.ModuleList([
+            self._make_aux_out1_block(head_features_1, device=device, dtype=dtype, operations=operations)
+            for _ in range(self.aux_levels)
+        ])
+
+        # Aux final projection per level (includes LayerNorm permute path).
+        # NOTE: the same three module instances are spliced into every level's
+        # Sequential, so the LayerNorm object is shared between levels.
+        ln_seq = [Permute((0, 2, 3, 1)),
+                  operations.LayerNorm(head_features_2, device=device, dtype=dtype),
+                  Permute((0, 3, 1, 2))]
+        self.scratch.output_conv2_aux = nn.ModuleList([
+            nn.Sequential(
+                operations.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1, device=device, dtype=dtype),
+                *ln_seq,
+                nn.ReLU(inplace=False),
+                operations.Conv2d(head_features_2, 7, kernel_size=1, stride=1, padding=0, device=device, dtype=dtype),
+            )
+            for _ in range(self.aux_levels)
+        ])
+
+    @staticmethod
+    def _make_aux_out1_block(in_ch: int, *, device=None, dtype=None, operations=None) -> nn.Sequential:
+        """Build the 5-conv auxiliary pre-head ending at ``in_ch // 2`` channels.
+
+        The convolutions alternate between ``in_ch // 2`` and ``in_ch`` output
+        channels with no activation in between.
+        """
+        # aux_out1_conv_num=5 in all Apache-2.0 variants.
+        return nn.Sequential(
+            operations.Conv2d(in_ch, in_ch // 2, 3, 1, 1, device=device, dtype=dtype),
+            operations.Conv2d(in_ch // 2, in_ch, 3, 1, 1, device=device, dtype=dtype),
+            operations.Conv2d(in_ch, in_ch // 2, 3, 1, 1, device=device, dtype=dtype),
+            operations.Conv2d(in_ch // 2, in_ch, 3, 1, 1, device=device, dtype=dtype),
+            operations.Conv2d(in_ch, in_ch // 2, 3, 1, 1, device=device, dtype=dtype),
+        )
+
+    def forward(self, feats: List[torch.Tensor], H: int, W: int, patch_start_idx: int = 0, **_kwargs) -> dict:
+        """Decode backbone features into depth (and optionally ray) maps.
+
+        Args:
+            feats: Sequence of feature levels; ``feats[i][0]`` is the token
+                tensor of level ``i`` with shape (B, S, N, C).
+            H, W: Input image height and width; they define the token grid
+                and the aspect ratio of the positional embedding.
+            patch_start_idx: Tokens before this index are dropped before the
+                tokens are reshaped to the patch grid.
+            **_kwargs: Accepted and ignored.
+
+        Returns:
+            Dict with the main prediction and ``f"{main}_conf"``; when
+            ``enable_aux`` is set it also holds the auxiliary prediction
+            (6 channels, channels-last) and ``f"{aux}_conf"``. Every value is
+            viewed with leading (B, S) dimensions.
+        """
+        B, S, N, C = feats[0][0].shape
+        feats_flat = [feat[0].reshape(B * S, N, C) for feat in feats]
+
+        # Token grid size implied by the input resolution.
+        ph, pw = H // self.patch_size, W // self.patch_size
+        resized = []
+        for stage_idx, take_idx in enumerate(self.intermediate_layer_idx):
+            x = feats_flat[take_idx][:, patch_start_idx:]
+            x = self.norm(x)
+            # (B*S, N, C) tokens -> (B*S, C, ph, pw) feature map.
+            x = x.permute(0, 2, 1).contiguous().reshape(B * S, C, ph, pw)
+            x = self.projects[stage_idx](x)
+            if self.pos_embed:
+                x = _add_pos_embed(x, W, H)
+            x = self.resize_layers[stage_idx](x)
+            resized.append(x)
+
+        l1_rn = self.scratch.layer1_rn(resized[0])
+        l2_rn = self.scratch.layer2_rn(resized[1])
+        l3_rn = self.scratch.layer3_rn(resized[2])
+        l4_rn = self.scratch.layer4_rn(resized[3])
+
+        # Main pyramid (output_conv1 is applied inside the upstream `_fuse`,
+        # before interpolation -- replicate that order here).
+        m = self.scratch.refinenet4(l4_rn, size=l3_rn.shape[2:])
+        if self.enable_aux:
+            a4 = self.scratch.refinenet4_aux(l4_rn, size=l3_rn.shape[2:])
+            aux_pyr = [a4]
+        m = self.scratch.refinenet3(m, l3_rn, size=l2_rn.shape[2:])
+        if self.enable_aux:
+            aux_pyr.append(self.scratch.refinenet3_aux(aux_pyr[-1], l3_rn, size=l2_rn.shape[2:]))
+        m = self.scratch.refinenet2(m, l2_rn, size=l1_rn.shape[2:])
+        if self.enable_aux:
+            aux_pyr.append(self.scratch.refinenet2_aux(aux_pyr[-1], l2_rn, size=l1_rn.shape[2:]))
+        m = self.scratch.refinenet1(m, l1_rn)
+        if self.enable_aux:
+            aux_pyr.append(self.scratch.refinenet1_aux(aux_pyr[-1], l1_rn))
+        m = self.scratch.output_conv1(m)
+
+        # Output resolution: patch-aligned input size divided by down_ratio.
+        h_out = int(ph * self.patch_size / self.down_ratio)
+        w_out = int(pw * self.patch_size / self.down_ratio)
+
+        m = _custom_interpolate(m, (h_out, w_out), mode="bilinear", align_corners=True)
+        if self.pos_embed:
+            m = _add_pos_embed(m, W, H)
+        main_logits = self.scratch.output_conv2(m)
+        # Channels-last view: every channel but the last is depth, the last
+        # channel is the confidence.
+        fmap = main_logits.permute(0, 2, 3, 1)
+        depth_pred = _apply_activation(fmap[..., :-1], self.activation)
+        depth_conf = _apply_activation(fmap[..., -1], self.conf_activation)
+
+        outs = {
+            self.head_main: depth_pred.squeeze(-1).view(B, S, *depth_pred.shape[1:-1]),
+            f"{self.head_main}_conf": depth_conf.view(B, S, *depth_conf.shape[1:]),
+        }
+
+        if self.enable_aux:
+            # Auxiliary "ray" head -- only the last pyramid level is returned.
+            # Upstream ``DualDPT._fuse`` pushes every level through its
+            # ``output_conv1_aux[i]`` stack, but the other levels' results are
+            # never read here, so only the stack paired with the last level is
+            # run. The last level then optionally gets a pos-embed and finally
+            # ``output_conv2_aux[-1]``.
+            last_level = len(aux_pyr) - 1
+            last_aux = self.scratch.output_conv1_aux[last_level](aux_pyr[last_level])
+            if self.pos_embed:
+                last_aux = _add_pos_embed(last_aux, W, H)
+            last_aux_logits = self.scratch.output_conv2_aux[-1](last_aux)
+            fmap_last = last_aux_logits.permute(0, 2, 3, 1)
+            # Channels: [ray(6), ray_conf(1)]; ray uses 'linear' activation.
+            aux_pred = fmap_last[..., :-1]
+            aux_conf = _apply_activation(fmap_last[..., -1], self.conf_activation)
+            outs[self.head_aux] = aux_pred.view(B, S, *aux_pred.shape[1:])
+            outs[f"{self.head_aux}_conf"] = aux_conf.view(B, S, *aux_conf.shape[1:])
+
+        return outs
--- a/comfy/ldm/depth_anything_3/model.py
+++ b/comfy/ldm/depth_anything_3/model.py
@ -0,0 +1,236 @@
+from __future__ import annotations
+
+from typing import Dict, Optional, Sequence
+
+import torch
+import torch.nn as nn
+
+from comfy.image_encoders.dino2 import Dinov2Model
+
+from .camera import CameraDec, CameraEnc
+from .dpt import DPT, DualDPT
+from .ray_pose import get_extrinsic_from_camray
+from .transform import affine_inverse, pose_encoding_to_extri_intri
+
+
+# Lookup table from the lower-cased ``head_type`` string to the head class
+# that ``DepthAnything3Net.__init__`` instantiates.
+_HEAD_REGISTRY = {
+    "dpt": DPT,
+    "dualdpt": DualDPT,
+}
+
+
+# Backbone presets (mirror the upstream DINOv2 ViT variants).
+# Each entry is copied by ``_build_backbone_config`` and extended with the
+# settings shared by all variants before being passed to ``Dinov2Model``.
+_BACKBONE_PRESETS = {
+    "vits": dict(hidden_size=384,  num_hidden_layers=12, num_attention_heads=6,  use_swiglu_ffn=False),
+    "vitb": dict(hidden_size=768,  num_hidden_layers=12, num_attention_heads=12, use_swiglu_ffn=False),
+    "vitl": dict(hidden_size=1024, num_hidden_layers=24, num_attention_heads=16, use_swiglu_ffn=False),
+    "vitg": dict(hidden_size=1536, num_hidden_layers=40, num_attention_heads=24, use_swiglu_ffn=True),
+}
+
+
+def _build_backbone_config(
+    backbone_name: str,
+    *,
+    alt_start: int,
+    qknorm_start: int,
+    rope_start: int,
+    cat_token: bool,
+) -> dict:
+    if backbone_name not in _BACKBONE_PRESETS:
+        raise ValueError(f"Unknown DINOv2 backbone variant: {backbone_name!r}")
+    cfg = dict(_BACKBONE_PRESETS[backbone_name])
+    cfg.update(dict(
+        layer_norm_eps=1e-6,
+        patch_size=14,
+        image_size=518,
+        # No mask_token in DA3 weights; omit param to avoid load warnings.
+        use_mask_token=False,
+        alt_start=alt_start,
+        qknorm_start=qknorm_start,
+        rope_start=rope_start,
+        cat_token=cat_token,
+        rope_freq=100.0,
+    ))
+    return cfg
+
+
+class DepthAnything3Net(nn.Module):
+    """Depth Anything 3 network: a DINOv2 backbone feeding a DPT or DualDPT head.
+
+    ``CameraEnc`` / ``CameraDec`` modules are attached only when requested via
+    ``has_cam_enc`` / ``has_cam_dec``; otherwise the attributes are ``None``.
+    """
+
+    # Patch size handed to the head; ``forward`` requires H and W to be
+    # multiples of this value.
+    PATCH_SIZE = 14
+
+    def __init__(
+        self,
+        # --- Backbone ---
+        backbone_name: str = "vitl",
+        out_layers: Sequence[int] = (4, 11, 17, 23),
+        alt_start: int = -1,
+        qknorm_start: int = -1,
+        rope_start: int = -1,
+        cat_token: bool = False,
+        # --- Head ---
+        head_type: str = "dpt",  # dpt or dualdpt
+        head_dim_in: int = 1024,
+        head_output_dim: int = 1,  # 1 = depth only, 2 = depth+conf
+        head_features: int = 256,
+        head_out_channels: Sequence[int] = (256, 512, 1024, 1024),
+        head_use_sky_head: bool = True, # ignored by DualDPT
+        head_pos_embed: Optional[bool] = None,  # default: True for DualDPT, False for DPT
+        # --- Camera (multi-view) ---
+        has_cam_enc: bool = False,
+        has_cam_dec: bool = False,
+        cam_dim_out: Optional[int] = None,  # CameraEnc dim_out (defaults to embed_dim)
+        cam_dec_dim_in: Optional[int] = None,  # CameraDec dim_in  (defaults to 2*embed_dim with cat_token)
+        # ComfyUI plumbing
+        device=None, dtype=None, operations=None,
+        **_ignored,
+    ):
+        """Build the backbone, the prediction head and the optional camera modules.
+
+        Args:
+            backbone_name: key into ``_BACKBONE_PRESETS`` ("vits", "vitb", "vitl", "vitg").
+            out_layers: layer indices handed to the backbone's
+                ``get_intermediate_layers_da3`` in ``forward``.
+            alt_start, qknorm_start, rope_start, cat_token: copied verbatim into
+                the backbone config.
+            head_type: key into ``_HEAD_REGISTRY``; matched case-insensitively.
+            head_dim_in, head_output_dim, head_features, head_out_channels:
+                forwarded to the head as ``dim_in``, ``output_dim``, ``features``
+                and ``out_channels``.
+            head_use_sky_head: forwarded to the head only when ``head_type`` is "dpt".
+            head_pos_embed: ``None`` resolves to ``False`` for "dpt" and ``True``
+                otherwise.
+            has_cam_enc, has_cam_dec: whether to construct ``CameraEnc`` / ``CameraDec``.
+            cam_dim_out: ``CameraEnc`` ``dim_out``; defaults to the backbone hidden size.
+            cam_dec_dim_in: ``CameraDec`` ``dim_in``; defaults to the hidden size,
+                doubled when ``cat_token`` is set.
+            device, dtype, operations: forwarded to every submodule.
+            **_ignored: extra keyword arguments are accepted and discarded.
+
+        Raises:
+            KeyError: if ``head_type`` is not in ``_HEAD_REGISTRY``.
+            ValueError: if ``backbone_name`` is unknown (raised by
+                ``_build_backbone_config``).
+        """
+        super().__init__()
+        head_cls = _HEAD_REGISTRY[head_type.lower()]
+        self.head_type = head_type.lower()
+        # Capability flags derived from the head configuration.
+        self.has_sky = (self.head_type == "dpt") and head_use_sky_head
+        self.has_conf = head_output_dim > 1
+        self.out_layers = list(out_layers)
+
+        backbone_cfg = _build_backbone_config(
+            backbone_name,
+            alt_start=alt_start,
+            qknorm_start=qknorm_start,
+            rope_start=rope_start,
+            cat_token=cat_token,
+        )
+        self.backbone = Dinov2Model(backbone_cfg, dtype, device, operations)
+
+        # Keyword arguments shared by both head classes.
+        head_kwargs = dict(
+            dim_in=head_dim_in,
+            patch_size=self.PATCH_SIZE,
+            output_dim=head_output_dim,
+            features=head_features,
+            out_channels=tuple(head_out_channels),
+            device=device, dtype=dtype, operations=operations,
+        )
+        if self.head_type == "dpt":
+            head_kwargs.update(
+                use_sky_head=head_use_sky_head,
+                pos_embed=(False if head_pos_embed is None else head_pos_embed),
+            )
+        else:  # dualdpt
+            head_kwargs.update(
+                pos_embed=(True if head_pos_embed is None else head_pos_embed),
+            )
+        self.head = head_cls(**head_kwargs)
+
+        # Built only if checkpoint has weights; cam_enc output dim == embed_dim.
+        embed_dim = backbone_cfg["hidden_size"]
+        if has_cam_enc:
+            self.cam_enc = CameraEnc(
+                dim_out=cam_dim_out if cam_dim_out is not None else embed_dim,
+                # One head per 64 channels, but never fewer than one.
+                num_heads=max(1, embed_dim // 64),
+                device=device, dtype=dtype, operations=operations,
+            )
+        else:
+            self.cam_enc = None
+        if has_cam_dec:
+            default_dim = embed_dim * (2 if cat_token else 1)
+            self.cam_dec = CameraDec(
+                dim_in=cam_dec_dim_in if cam_dec_dim_in is not None else default_dim,
+                device=device, dtype=dtype, operations=operations,
+            )
+        else:
+            self.cam_dec = None
+
+        self.dtype = dtype
+
+    def forward(
+        self,
+        image: torch.Tensor,
+        extrinsics: Optional[torch.Tensor] = None,
+        intrinsics: Optional[torch.Tensor] = None,
+        *,
+        use_ray_pose: bool = False,
+        ref_view_strategy: str = "saddle_balanced",
+        export_feat_layers: Optional[Sequence[int]] = None,
+        **_unused,
+    ) -> Dict[str, torch.Tensor]:
+        """Run depth and optionally pose prediction.
+
+        Args:
+            image: ``(B, 3, H, W)`` or ``(B, S, 3, H, W)``; a 4-D input gets a
+                view axis of size 1 inserted. H and W must be multiples of
+                ``PATCH_SIZE``.
+            extrinsics, intrinsics: when both are given and a camera encoder
+                exists, they are encoded into a camera token for the backbone.
+            use_ray_pose: enables the DualDPT auxiliary ray output and, if the
+                head returns "ray"/"ray_conf", derives the pose from the rays.
+            ref_view_strategy: forwarded to the backbone.
+            export_feat_layers: forwarded to the backbone; when truthy the
+                reshaped features are returned under "aux_features".
+            **_unused: extra keyword arguments are accepted and discarded.
+
+        Returns:
+            Dict holding every head output (tensors whose two leading axes are
+            ``(B, S)`` are flattened to ``B*S``, except "ray"/"ray_conf"), plus
+            "extrinsics"/"intrinsics" when a pose was predicted and
+            "aux_features" when requested.
+
+        Raises:
+            AssertionError: on an unsupported image shape or size.
+        """
+        if image.ndim == 4:
+            image = image.unsqueeze(1)  # (B, 1, 3, H, W)
+        assert image.ndim == 5 and image.shape[2] == 3, \
+            f"image must be (B,3,H,W) or (B,S,3,H,W); got {tuple(image.shape)}"
+
+        B, S, _, H, W = image.shape
+        assert H % self.PATCH_SIZE == 0 and W % self.PATCH_SIZE == 0, \
+            f"image H,W must be multiples of {self.PATCH_SIZE}; got {(H, W)}"
+
+        # Camera-token preparation (multi-view path).
+        cam_token = None
+        if extrinsics is not None and intrinsics is not None and self.cam_enc is not None:
+            cam_token = self.cam_enc(extrinsics, intrinsics, (H, W))
+
+        # Toggle aux ray output on/off depending on what the caller asked for.
+        # NOTE: this writes to the head module, so the setting persists until
+        # the next call overwrites it.
+        if isinstance(self.head, DualDPT):
+            self.head.enable_aux = bool(use_ray_pose)
+
+        feats, aux_feats = self.backbone.get_intermediate_layers_da3(
+            image, self.out_layers, cam_token=cam_token,
+            ref_view_strategy=ref_view_strategy,
+            export_feat_layers=export_feat_layers,
+        )
+        head_out = self.head(feats, H=H, W=W, patch_start_idx=0)
+
+        # Pose prediction.
+        # Two mutually exclusive sources: the ray head (preferred when
+        # requested and available) or the camera decoder (multi-view only).
+        out: Dict[str, torch.Tensor] = {}
+        if use_ray_pose and "ray" in head_out and "ray_conf" in head_out:
+            ray = head_out["ray"]
+            ray_conf = head_out["ray_conf"]
+            extr_c2w, focal, pp = get_extrinsic_from_camray(
+                ray, ray_conf, ray.shape[-3], ray.shape[-2],
+            )
+            # Match the upstream output: w2c, drop the homogeneous row.
+            extr_w2c = affine_inverse(extr_c2w)[:, :, :3, :]
+            # Build pixel-space intrinsics from the normalised focal/pp output.
+            intr = torch.eye(3, device=ray.device, dtype=ray.dtype)
+            # ``clone`` materialises the expanded view so it can be written to.
+            intr = intr[None, None].expand(extr_c2w.shape[0], extr_c2w.shape[1], 3, 3).clone()
+            intr[:, :, 0, 0] = focal[:, :, 0] / 2 * W
+            intr[:, :, 1, 1] = focal[:, :, 1] / 2 * H
+            intr[:, :, 0, 2] = pp[:, :, 0] * W * 0.5
+            intr[:, :, 1, 2] = pp[:, :, 1] * H * 0.5
+            out["extrinsics"] = extr_w2c
+            out["intrinsics"] = intr
+        elif self.cam_dec is not None and S > 1:
+            # Decode the cam-token of the final out_layer into a pose encoding.
+            cam_feat = feats[-1][1]  # (B, S, dim_in_to_cam_dec)
+            pose_enc = self.cam_dec(cam_feat)
+            c2w_3x4, intr = pose_encoding_to_extri_intri(pose_enc, (H, W))
+            # Match the upstream output convention: w2c (world->camera), 3x4.
+            # Append the [0, 0, 0, 1] row so ``affine_inverse`` gets a 4x4 input.
+            c2w_4x4 = torch.cat([
+                c2w_3x4,
+                torch.tensor([0, 0, 0, 1], device=c2w_3x4.device, dtype=c2w_3x4.dtype)
+                    .view(1, 1, 1, 4).expand(B, S, 1, 4),
+            ], dim=-2)
+            out["extrinsics"] = affine_inverse(c2w_4x4)[:, :, :3, :]
+            out["intrinsics"] = intr
+
+        # Flatten the views axis for per-pixel outputs (depth/conf/sky) so the
+        # per-image consumer keeps its (B*S, H, W) interface.
+        for k, v in head_out.items():
+            if k in ("ray", "ray_conf"):
+                # Keep multi-view shape for downstream pose work.
+                out[k] = v
+            elif v.ndim >= 3 and v.shape[0] == B and v.shape[1] == S:
+                out[k] = v.reshape(B * S, *v.shape[2:])
+            else:
+                # Anything without leading (B, S) axes is passed through as-is.
+                out[k] = v
+
+        if export_feat_layers:
+            out["aux_features"] = self._reshape_aux_features(aux_feats, H, W)
+        return out
+
+    def _reshape_aux_features(self, aux_feats, H: int, W: int):
+        """Reshape (B, S, N, C) aux features into (B, S, h_p, w_p, C).
+
+        ``h_p`` / ``w_p`` are ``H`` / ``W`` divided by ``PATCH_SIZE``. Returns a
+        list with one reshaped tensor per entry of ``aux_feats``.
+
+        Raises:
+            AssertionError: if a feature's token count N differs from ``h_p * w_p``.
+        """
+        ph, pw = H // self.PATCH_SIZE, W // self.PATCH_SIZE
+        out = []
+        for f in aux_feats:
+            B, S, N, C = f.shape
+            assert N == ph * pw, f"aux feature seq mismatch: {N} != {ph}*{pw}"
+            out.append(f.reshape(B, S, ph, pw, C))
+        return out
--- a/comfy/ldm/depth_anything_3/preprocess.py
+++ b/comfy/ldm/depth_anything_3/preprocess.py
@ -0,0 +1,128 @@
+"""Input/output preprocessing helpers for Depth Anything 3."""
+
+from __future__ import annotations
+
+from typing import Tuple
+
+import torch
+
+import comfy.utils
+
+# Default multiple that ``_round_to_patch`` rounds image sides to.
+PATCH_SIZE = 14
+
+# ImageNet normalization constants used during DA3 training.
+# One value per colour channel; ``preprocess_image`` reshapes them to
+# (1, 3, 1, 1) and moves them to the input's device/dtype before use.
+_IMAGENET_MEAN = torch.tensor([0.485, 0.456, 0.406])
+_IMAGENET_STD = torch.tensor([0.229, 0.224, 0.225])
+
+
+def _round_to_patch(x: int, patch: int = PATCH_SIZE) -> int:
+    down = (x // patch) * patch
+    up = down + patch
+    return up if abs(up - x) <= abs(x - down) else down
+
+
+def compute_target_size(orig_h: int, orig_w: int, process_res: int, method: str = "upper_bound_resize") -> Tuple[int, int]:
+    """Compute (target_h, target_w) for a single image.
+    upper_bound_resize: scale longest side to process_res, then round each dim to nearest multiple of 14 (default upstream method).
+    lower_bound_resize: scale shortest side to process_res, then round."""
+
+    if method == "upper_bound_resize":
+        longest = max(orig_h, orig_w)
+        scale = process_res / float(longest)
+    elif method == "lower_bound_resize":
+        shortest = min(orig_h, orig_w)
+        scale = process_res / float(shortest)
+    else:
+        raise ValueError(f"Unsupported process_res_method: {method}")
+
+    new_w = max(1, _round_to_patch(int(round(orig_w * scale))))
+    new_h = max(1, _round_to_patch(int(round(orig_h * scale))))
+    return new_h, new_w
+
+
+def preprocess_image(image: torch.Tensor, process_res: int = 504, method: str = "upper_bound_resize") -> torch.Tensor:
+    assert image.ndim == 4 and image.shape[-1] == 3, f"expected (B,H,W,3) IMAGE; got {tuple(image.shape)}"
+    B, H, W, _ = image.shape
+    target_h, target_w = compute_target_size(H, W, process_res, method)
+
+    # (B, H, W, 3) -> (B, 3, H, W)
+    x = image.movedim(-1, 1).contiguous()
+    if (target_h, target_w) != (H, W):
+        # Upstream uses cv2 INTER_CUBIC (upscale) / INTER_AREA (downscale).
+        # Lanczos in ``common_upscale`` is anti-aliased and produces the
+        # closest pixel-wise match in a sweep across {bilinear, bicubic,
+        # area, lanczos, bislerp}. Used in both directions for simplicity.
+        x = comfy.utils.common_upscale(x.float(), target_w, target_h, "lanczos", "disabled",)
+    x = x.clamp(0.0, 1.0)
+
+    mean = _IMAGENET_MEAN.to(device=x.device, dtype=x.dtype).view(1, 3, 1, 1)
+    std = _IMAGENET_STD.to(device=x.device, dtype=x.dtype).view(1, 3, 1, 1)
+    x = (x - mean) / std
+    return x
+
+
+# -----------------------------------------------------------------------------
+# Output post-processing (sky-aware clipping for Mono/Metric variants)
+# -----------------------------------------------------------------------------
+
+
+def compute_non_sky_mask(sky_prediction: torch.Tensor, threshold: float = 0.3) -> torch.Tensor:
+    """Boolean mask: True for non-sky pixels (sky probability < threshold)."""
+    return sky_prediction < threshold
+
+
+def apply_sky_aware_clip(depth: torch.Tensor, sky: torch.Tensor, threshold: float = 0.3, quantile: float = 0.99) -> torch.Tensor:
+    """Clips sky regions to the 99th percentile of non-sky depth. Returns a new depth tensor."""
+    non_sky = compute_non_sky_mask(sky, threshold=threshold)
+    if non_sky.sum() <= 10 or (~non_sky).sum() <= 10:
+        return depth.clone()
+
+    non_sky_depth = depth[non_sky]
+    if non_sky_depth.numel() > 100_000:
+        idx = torch.randint(0, non_sky_depth.numel(), (100_000,), device=non_sky_depth.device)
+        sampled = non_sky_depth[idx]
+    else:
+        sampled = non_sky_depth
+
+    max_depth = torch.quantile(sampled, quantile)
+    out = depth.clone()
+    out[~non_sky] = max_depth
+    return out
+
+
+def normalize_depth_v2_style(depth: torch.Tensor, sky: torch.Tensor | None = None, low_quantile: float = 0.01, high_quantile: float = 0.99) -> torch.Tensor:
+    """V2-style normalization computes percentile bounds over non-sky pixels (when available), then maps depth into [0, 1] with near = white (1.0)."""
+    if sky is not None:
+        mask = compute_non_sky_mask(sky)
+        if mask.any():
+            valid = depth[mask]
+        else:
+            valid = depth.flatten()
+    else:
+        valid = depth.flatten()
+
+    if valid.numel() > 100_000:
+        idx = torch.randint(0, valid.numel(), (100_000,), device=valid.device)
+        sample = valid[idx]
+    else:
+        sample = valid
+
+    lo = torch.quantile(sample, low_quantile)
+    hi = torch.quantile(sample, high_quantile)
+    rng = (hi - lo).clamp(min=1e-6)
+    norm = ((depth - lo) / rng).clamp(0.0, 1.0)
+    # Nearer pixels are brighter (1.0)
+    norm = 1.0 - norm
+    if sky is not None:
+        # Sky pixels become black (far / unknown)
+        sky_mask = ~compute_non_sky_mask(sky)
+        norm = torch.where(sky_mask, torch.zeros_like(norm), norm)
+    return norm
+
+
+def normalize_depth_min_max(depth: torch.Tensor) -> torch.Tensor:
+    """Simple per-frame min/max normalization with near=1.0 convention."""
+    lo = depth.amin(dim=(-2, -1), keepdim=True)
+    hi = depth.amax(dim=(-2, -1), keepdim=True)
+    rng = (hi - lo).clamp(min=1e-6)
+    return 1.0 - ((depth - lo) / rng).clamp(0.0, 1.0)
--- a/comfy/ldm/depth_anything_3/ray_pose.py
+++ b/comfy/ldm/depth_anything_3/ray_pose.py
@ -0,0 +1,272 @@
+"""Ray-to-pose conversion for the multi-view path of Depth Anything 3."""
+
+from __future__ import annotations
+
+from typing import Optional, Tuple
+
+import torch
+
+
+# qr/svd use fp32: CUDA often has no fp16/bf16 kernels for these ops.
+
+
+def _ql_decomposition(A: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Decompose A = Q @ L with Q orthogonal and L lower-triangular.
+    Implemented in terms of QR by reversing the columns/rows; the standard
+    trick from the upstream reference. Inputs A are (3, 3)."""
+    P = torch.tensor([[0, 0, 1], [0, 1, 0], [1, 0, 0]], device=A.device, dtype=A.dtype)
+    A_tilde = A @ P
+    # CUDA QR is not implemented for fp16/bf16; upcast just for this call.
+    Q_tilde, R_tilde = torch.linalg.qr(A_tilde.float())
+    Q_tilde = Q_tilde.to(A.dtype)
+    R_tilde = R_tilde.to(A.dtype)
+    Q = Q_tilde @ P
+    L = P @ R_tilde @ P
+    d = torch.diag(L)
+    sign = torch.sign(d)
+    Q = Q * sign[None, :]  # scale columns of Q
+    L = L * sign[:, None]  # scale rows of L
+    return Q, L
+
+
+def _homogenize_points(points: torch.Tensor) -> torch.Tensor:
+    return torch.cat([points, torch.ones_like(points[..., :1])], dim=-1)
+
+
+# -----------------------------------------------------------------------------
+# Weighted-LSQ + RANSAC homography (batched)
+# -----------------------------------------------------------------------------
+
+
+def _find_homography_weighted_lsq(src_pts: torch.Tensor, dst_pts: torch.Tensor, confident_weight: torch.Tensor,) -> torch.Tensor:
+    """Solve a single H with weighted least-squares (DLT)."""
+    N = src_pts.shape[0]
+    if N < 4:
+        raise ValueError("At least 4 points are required to compute a homography.")
+    w = confident_weight.sqrt().unsqueeze(1)  # (N, 1)
+    x = src_pts[:, 0:1]
+    y = src_pts[:, 1:2]
+    u = dst_pts[:, 0:1]
+    v = dst_pts[:, 1:2]
+    zeros = torch.zeros_like(x)
+    A1 = torch.cat([-x * w, -y * w, -w, zeros, zeros, zeros, x * u * w, y * u * w, u * w], dim=1)
+    A2 = torch.cat([zeros, zeros, zeros, -x * w, -y * w, -w, x * v * w, y * v * w, v * w], dim=1)
+    A = torch.cat([A1, A2], dim=0)        # (2N, 9)
+    # CUDA SVD is not implemented for fp16/bf16; upcast just for this call.
+    _, _, Vh = torch.linalg.svd(A.float())
+    Vh = Vh.to(A.dtype)
+    H = Vh[-1].reshape(3, 3)
+    return H / H[-1, -1]
+
+
+def _find_homography_weighted_lsq_batched(src_pts_batch: torch.Tensor, dst_pts_batch: torch.Tensor, confident_weight_batch: torch.Tensor) -> torch.Tensor:
+    """Batched DLT solver. Inputs (B, K, 2) / (B, K); output (B, 3, 3)."""
+    B, K, _ = src_pts_batch.shape
+    w = confident_weight_batch.sqrt().unsqueeze(2)
+    x = src_pts_batch[:, :, 0:1]
+    y = src_pts_batch[:, :, 1:2]
+    u = dst_pts_batch[:, :, 0:1]
+    v = dst_pts_batch[:, :, 1:2]
+    zeros = torch.zeros_like(x)
+    A1 = torch.cat([-x * w, -y * w, -w, zeros, zeros, zeros, x * u * w, y * u * w, u * w], dim=2)
+    A2 = torch.cat([zeros, zeros, zeros, -x * w, -y * w, -w, x * v * w, y * v * w, v * w], dim=2)
+    A = torch.cat([A1, A2], dim=1)        # (B, 2K, 9)
+    # CUDA SVD is not implemented for fp16/bf16; upcast just for this call.
+    _, _, Vh = torch.linalg.svd(A.float())
+    Vh = Vh.to(A.dtype)
+    H = Vh[:, -1].reshape(B, 3, 3)
+    return H / H[:, 2:3, 2:3]
+
+
+def _ransac_find_homography_weighted_batched(
+    src_pts: torch.Tensor,                # (B, N, 2)
+    dst_pts: torch.Tensor,                # (B, N, 2)
+    confident_weight: torch.Tensor,       # (B, N)
+    n_sample: int,
+    n_iter: int = 100,
+    reproj_threshold: float = 3.0,
+    num_sample_for_ransac: int = 8,
+    max_inlier_num: int = 10000,
+    rand_sample_iters_idx: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    """Batched weighted-RANSAC homography estimator. Returns (B, 3, 3) homography matrices.
+
+    Args:
+        src_pts, dst_pts: corresponding 2D points per batch element.
+        confident_weight: per-point weight, used for candidate ranking, for the
+            DLT fits and for scoring hypotheses.
+        n_sample: number of highest-weight points kept as the candidate pool.
+        n_iter: number of hypotheses evaluated per batch element.
+        reproj_threshold: Euclidean reprojection error below which a point
+            counts as an inlier.
+        num_sample_for_ransac: number of candidates used to fit each hypothesis.
+        max_inlier_num: cap on the number of inliers used in the final refit.
+        rand_sample_iters_idx: optional (n_iter, num_sample_for_ransac) indices
+            into the candidate pool; drawn with ``torch.randperm`` when omitted.
+
+    Raises:
+        AssertionError: if fewer than 4 points are supplied.
+    """
+    B, N, _ = src_pts.shape
+    assert N >= 4
+    device = src_pts.device
+
+    # Candidate pool: the n_sample most confident points of each batch element.
+    sorted_idx = torch.argsort(confident_weight, descending=True, dim=1)
+    candidate_idx = sorted_idx[:, :n_sample]                  # (B, n_sample)
+
+    if rand_sample_iters_idx is None:
+        # One random subset of the pool per iteration, shared by all batch elements.
+        rand_sample_iters_idx = torch.stack(
+            [torch.randperm(n_sample, device=device)[:num_sample_for_ransac]
+             for _ in range(n_iter)],
+            dim=0,
+        )
+
+    # NOTE(review): the indices address positions in the candidate pool, so
+    # this presumably requires n_sample <= N -- confirm against callers.
+    rand_idx = candidate_idx[:, rand_sample_iters_idx]        # (B, n_iter, k)
+    b_idx = (
+        torch.arange(B, device=device)
+        .view(B, 1, 1)
+        .expand(B, n_iter, num_sample_for_ransac)
+    )
+    src_b = src_pts[b_idx, rand_idx]
+    dst_b = dst_pts[b_idx, rand_idx]
+    w_b = confident_weight[b_idx, rand_idx]
+
+    # Fit every (batch, iteration) hypothesis in a single batched DLT call.
+    cB, cN = src_b.shape[:2]
+    H_batch = _find_homography_weighted_lsq_batched(
+        src_b.flatten(0, 1), dst_b.flatten(0, 1), w_b.flatten(0, 1),
+    ).unflatten(0, (cB, cN))                                  # (B, n_iter, 3, 3)
+
+    # Reproject all N source points through every hypothesis.
+    src_homo = torch.cat([src_pts, torch.ones(B, N, 1, device=device, dtype=src_pts.dtype)], dim=2)
+    proj = torch.bmm(
+        src_homo.unsqueeze(1).expand(B, n_iter, N, 3).reshape(-1, N, 3),
+        H_batch.reshape(-1, 3, 3).transpose(1, 2),
+    )                                                          # (B*n_iter, N, 3)
+    proj_xy = (proj[:, :, :2] / proj[:, :, 2:3]).reshape(B, n_iter, N, 2)
+    err = ((proj_xy - dst_pts.unsqueeze(1)) ** 2).sum(-1).sqrt()  # (B, n_iter, N)
+    inlier_mask = err < reproj_threshold
+    # A hypothesis scores the summed weight of its inliers; the best one wins.
+    score = (inlier_mask * confident_weight.unsqueeze(1)).sum(dim=2)
+    best_idx = torch.argmax(score, dim=1)
+    best_inlier_mask = inlier_mask[torch.arange(B, device=device), best_idx]
+
+    # Refit with the inlier set (per-batch, since the inlier counts vary).
+    H_inlier_list = []
+    for b in range(B):
+        mask = best_inlier_mask[b]
+        in_src = src_pts[b][mask]
+        in_dst = dst_pts[b][mask]
+        in_w = confident_weight[b][mask]
+        if in_src.shape[0] < 4:
+            # Fall back to identity when RANSAC fails to find enough inliers.
+            H_inlier_list.append(torch.eye(3, device=device, dtype=src_pts.dtype))
+            continue
+        sorted_w = torch.argsort(in_w, descending=True)
+        if len(sorted_w) > max_inlier_num:
+            # Keep the top 95% by weight (never fewer than max_inlier_num),
+            # then draw max_inlier_num of them at random.
+            keep = max(int(len(sorted_w) * 0.95), max_inlier_num)
+            sorted_w = sorted_w[:keep][torch.randperm(keep, device=device)[:max_inlier_num]]
+        H_inlier_list.append(
+            _find_homography_weighted_lsq(in_src[sorted_w], in_dst[sorted_w], in_w[sorted_w])
+        )
+    return torch.stack(H_inlier_list, dim=0)
+
+
+# -----------------------------------------------------------------------------
+# Camera-ray utilities
+# -----------------------------------------------------------------------------
+
+
+def _unproject_identity(num_y: int, num_x: int, B: int, S: int, device, dtype) -> torch.Tensor:
+    """Camera-space unit rays for an identity intrinsic on a 2x2 image plane."""
+    dx = 1.0 / num_x
+    dy = 1.0 / num_y
+    # Centered camera-space coords directly (skip the K^-1 step since it's
+    # just a translation by -1 on x and y when K is identity-with-center=1).
+    y = torch.linspace(-(1 - dy), (1 - dy), num_y, device=device, dtype=dtype)
+    x = torch.linspace(-(1 - dx), (1 - dx), num_x, device=device, dtype=dtype)
+    yy, xx = torch.meshgrid(y, x, indexing="ij")
+    grid = torch.stack((xx, yy), dim=-1)            # (h, w, 2)
+    grid = grid.unsqueeze(0).unsqueeze(0).expand(B, S, num_y, num_x, 2)
+    return torch.cat([grid, torch.ones_like(grid[..., :1])], dim=-1)
+
+
+def _camray_to_caminfo(
+    camray: torch.Tensor,  # (B, S, h, w, 6)
+    confidence: Optional[torch.Tensor] = None,  # (B, S, h, w)
+    reproj_threshold: float = 0.2,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Convert per-pixel camera rays to per-view (R, T, focal, principal).
+
+    The first three channels of ``camray`` are matched against an identity ray
+    grid with a weighted RANSAC homography, which is then split by a QL
+    decomposition into a rotation and a lower-triangular factor carrying focal
+    and principal point. The last three channels are confidence-averaged into
+    the translation.
+
+    Args:
+        camray: per-pixel rays; channels 0-2 feed the homography, 3-5 the
+            translation.
+        confidence: per-pixel weights; all ones when omitted.
+        reproj_threshold: inlier threshold forwarded to the RANSAC routine.
+
+    Returns:
+        ``(R, T, focal, pp)`` with shapes (B, S, 3, 3), (B, S, 3), (B, S, 2)
+        and (B, S, 2).
+    """
+    if confidence is None:
+        confidence = torch.ones_like(camray[..., 0])
+    B, S, h, w, _ = camray.shape
+    device = camray.device
+    dtype = camray.dtype
+
+    rays_target = camray[..., :3]                           # (B, S, h, w, 3)
+    rays_origin = _unproject_identity(h, w, B, S, device, dtype)
+
+    # Flatten (B*S, h*w, *) for the RANSAC routine.
+    rays_target = rays_target.flatten(0, 1).flatten(1, 2)
+    rays_origin = rays_origin.flatten(0, 1).flatten(1, 2)
+    weights = confidence.flatten(0, 1).flatten(1, 2).clone()
+
+    # Project to 2D in homogeneous form (the upstream calls this "perspective division").
+    # Rays whose z is too close to zero get weight 0 and are left undivided.
+    z_thresh = 1e-4
+    mask = (rays_target[:, :, 2].abs() > z_thresh) & (rays_origin[:, :, 2].abs() > z_thresh)
+    weights = torch.where(mask, weights, torch.zeros_like(weights))
+    src = rays_origin.clone()
+    dst = rays_target.clone()
+    src[..., 0] = torch.where(mask, src[..., 0] / src[..., 2], src[..., 0])
+    src[..., 1] = torch.where(mask, src[..., 1] / src[..., 2], src[..., 1])
+    dst[..., 0] = torch.where(mask, dst[..., 0] / dst[..., 2], dst[..., 0])
+    dst[..., 1] = torch.where(mask, dst[..., 1] / dst[..., 2], dst[..., 1])
+    src = src[..., :2]
+    dst = dst[..., :2]
+
+    # RANSAC settings: 100 hypotheses of 8 points each, drawn from the top 30%
+    # most confident points (but never from fewer than 8).
+    N = src.shape[1]
+    n_iter = 100
+    sample_ratio = 0.3
+    num_sample_for_ransac = 8
+    n_sample = max(num_sample_for_ransac, int(N * sample_ratio))
+    # Drawn once here so every chunk below reuses the same sample indices.
+    rand_idx = torch.stack(
+        [torch.randperm(n_sample, device=device)[:num_sample_for_ransac] for _ in range(n_iter)],
+        dim=0,
+    )
+
+    # Chunk along the view axis to keep peak memory predictable.
+    chunk = 2
+    A_list = []
+    for i in range(0, src.shape[0], chunk):
+        A = _ransac_find_homography_weighted_batched(
+            src[i:i + chunk], dst[i:i + chunk], weights[i:i + chunk],
+            n_sample=n_sample, n_iter=n_iter,
+            num_sample_for_ransac=num_sample_for_ransac,
+            reproj_threshold=reproj_threshold,
+            rand_sample_iters_idx=rand_idx,
+            max_inlier_num=8000,
+        )
+        # Flip sign on dets that come out < 0 (so that the QL produces a
+        # right-handed rotation). ``det`` lacks fp16/bf16 CUDA kernels, so
+        # do the comparison in fp32.
+        flip = torch.linalg.det(A.float()) < 0
+        A = torch.where(flip[:, None, None], -A, A)
+        A_list.append(A)
+    A = torch.cat(A_list, dim=0)                            # (B*S, 3, 3)
+
+    # Per-view QL split: Q is the rotation; L, scaled so L[2][2] == 1, holds
+    # the focal terms on its diagonal and the principal point in its last row.
+    R_list, f_list, pp_list = [], [], []
+    for i in range(A.shape[0]):
+        R, L = _ql_decomposition(A[i])
+        L = L / L[2][2]
+        f_list.append(torch.stack((L[0][0], L[1][1])))
+        pp_list.append(torch.stack((L[2][0], L[2][1])))
+        R_list.append(R)
+    R = torch.stack(R_list).reshape(B, S, 3, 3)
+    focal = torch.stack(f_list).reshape(B, S, 2)
+    pp = torch.stack(pp_list).reshape(B, S, 2)
+
+    # Translation: confidence-weighted average of camray direction(s).
+    # This uses the raw confidence, not the z-masked ``weights`` from above.
+    # NOTE(review): the division assumes the confidences of a view do not sum
+    # to zero -- confirm the producer guarantees that.
+    cf = confidence.flatten(0, 1).flatten(1, 2)
+    T = (camray.flatten(0, 1).flatten(1, 2)[..., 3:] * cf.unsqueeze(-1)).sum(dim=1)
+    T = T / cf.sum(dim=-1, keepdim=True)
+    T = T.reshape(B, S, 3)
+
+    # Match upstream output convention: focal -> 1/focal, pp + 1.
+    return R, T, 1.0 / focal, pp + 1.0
+
+
+def get_extrinsic_from_camray(
+    camray: torch.Tensor,  # (B, S, h, w, 6)
+    conf: torch.Tensor,  # (B, S, h, w, 1) or (B, S, h, w)
+    patch_size_y: int,
+    patch_size_x: int,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Wrap a 4x4 extrinsic + per-view focal + principal-point output."""
+    if conf.ndim == 5 and conf.shape[-1] == 1:
+        conf = conf.squeeze(-1)
+    R, T, focal, pp = _camray_to_caminfo(camray, confidence=conf)
+    extr = torch.cat([R, T.unsqueeze(-1)], dim=-1)           # (B, S, 3, 4)
+    homo_row = torch.tensor([0, 0, 0, 1], dtype=R.dtype, device=R.device)
+    homo_row = homo_row.view(1, 1, 1, 4).expand(R.shape[0], R.shape[1], 1, 4)
+    extr = torch.cat([extr, homo_row], dim=-2)               # (B, S, 4, 4)
+    return extr, focal, pp
--- a/comfy/ldm/depth_anything_3/reference_view_selector.py
+++ b/comfy/ldm/depth_anything_3/reference_view_selector.py
@ -0,0 +1,87 @@
+"""Reference-view selection for the multi-view path of Depth Anything 3."""
+
+from __future__ import annotations
+
+from typing import Literal
+
+import torch
+
+
+# Strategy names accepted by ``select_reference_view``.
+RefViewStrategy = Literal["first", "middle", "saddle_balanced", "saddle_sim_range"]
+
+
+# Per the upstream constants module: ``THRESH_FOR_REF_SELECTION = 3``.
+# Reference selection only runs when there are at least this many views.
+# NOTE(review): nothing in this module reads the constant; presumably the
+# caller enforces the threshold -- confirm.
+THRESH_FOR_REF_SELECTION: int = 3
+
+
+def select_reference_view(x: torch.Tensor, strategy: RefViewStrategy = "saddle_balanced") -> torch.Tensor:
+    """Pick a reference view index per batch element."""
+    B, S, _, _ = x.shape
+    if S <= 1:
+        return torch.zeros(B, dtype=torch.long, device=x.device)
+    if strategy == "first":
+        return torch.zeros(B, dtype=torch.long, device=x.device)
+    if strategy == "middle":
+        return torch.full((B,), S // 2, dtype=torch.long, device=x.device)
+
+    # Feature-based strategies: normalised cls/cam token per view.
+    img_class_feat = x[:, :, 0] / x[:, :, 0].norm(dim=-1, keepdim=True)  # (B,S,C)
+
+    if strategy == "saddle_balanced":
+        sim = torch.matmul(img_class_feat, img_class_feat.transpose(1, 2))  # (B,S,S)
+        sim_no_diag = sim - torch.eye(S, device=sim.device).unsqueeze(0)
+        sim_score = sim_no_diag.sum(dim=-1) / (S - 1)               # (B,S)
+        feat_norm = x[:, :, 0].norm(dim=-1)                          # (B,S)
+        feat_var = img_class_feat.var(dim=-1)                        # (B,S)
+
+        def _normalize(metric):
+            mn = metric.min(dim=1, keepdim=True).values
+            mx = metric.max(dim=1, keepdim=True).values
+            return (metric - mn) / (mx - mn + 1e-8)
+
+        sim_n, norm_n, var_n = _normalize(sim_score), _normalize(feat_norm), _normalize(feat_var)
+        balance = (sim_n - 0.5).abs() + (norm_n - 0.5).abs() + (var_n - 0.5).abs()
+        return balance.argmin(dim=1)
+
+    if strategy == "saddle_sim_range":
+        sim = torch.matmul(img_class_feat, img_class_feat.transpose(1, 2))
+        sim_no_diag = sim - torch.eye(S, device=sim.device).unsqueeze(0)
+        sim_max = sim_no_diag.max(dim=-1).values
+        sim_min = sim_no_diag.min(dim=-1).values
+        return (sim_max - sim_min).argmax(dim=1)
+
+    raise ValueError(
+        f"Unknown reference view selection strategy: {strategy!r}. "
+        f"Must be one of: 'first', 'middle', 'saddle_balanced', 'saddle_sim_range'"
+    )
+
+
+def reorder_by_reference(x: torch.Tensor, b_idx: torch.Tensor) -> torch.Tensor:
+    """Reorder x so the reference view is at position 0 in axis S."""
+    B, S = x.shape[0], x.shape[1]
+    if S <= 1:
+        return x
+    positions = torch.arange(S, device=x.device).unsqueeze(0).expand(B, -1)
+    b_idx_exp = b_idx.unsqueeze(1)
+    reorder = torch.where(
+        (positions > 0) & (positions <= b_idx_exp),
+        positions - 1,
+        positions,
+    )
+    reorder[:, 0] = b_idx
+    batch = torch.arange(B, device=x.device).unsqueeze(1)
+    return x[batch, reorder]
+
+
+def restore_original_order(x: torch.Tensor, b_idx: torch.Tensor) -> torch.Tensor:
+    """Inverse of reorder_by_reference."""
+    B, S = x.shape[0], x.shape[1]
+    if S <= 1:
+        return x
+    target_positions = torch.arange(S, device=x.device).unsqueeze(0).expand(B, -1)
+    b_idx_exp = b_idx.unsqueeze(1)
+    restore = torch.where(target_positions < b_idx_exp, target_positions + 1, target_positions)
+    restore = torch.scatter(restore, dim=1, index=b_idx_exp, src=torch.zeros_like(b_idx_exp))
+    batch = torch.arange(B, device=x.device).unsqueeze(1)
+    return x[batch, restore]
--- a/comfy/ldm/depth_anything_3/transform.py
+++ b/comfy/ldm/depth_anything_3/transform.py
@ -0,0 +1,160 @@
+"""Geometry / camera transform helpers for Depth Anything 3."""
+
+from __future__ import annotations
+
+from typing import Tuple
+
+import torch
+import torch.nn.functional as F
+
+
+# -----------------------------------------------------------------------------
+# Affine 4x4 helpers
+# -----------------------------------------------------------------------------
+
+
+def as_homogeneous(ext: torch.Tensor) -> torch.Tensor:
+    """Return extrinsics as (...,4,4) homogeneous matrices.
+
+    A ``(...,4,4)`` input is returned as-is. A ``(...,3,4)`` input gets a
+    ``[0, 0, 0, 1]`` bottom row appended. Any other trailing shape raises
+    ``ValueError``.
+    """
+    trailing = ext.shape[-2:]
+    if trailing == (4, 4):
+        return ext
+    if trailing != (3, 4):
+        raise ValueError(f"Invalid affine shape: {ext.shape}")
+    bottom_row = torch.zeros_like(ext[..., :1, :4])
+    bottom_row[..., 0, 3] = 1.0
+    return torch.cat([ext, bottom_row], dim=-2)
+
+
+def affine_inverse(A: torch.Tensor) -> torch.Tensor:
+    """Invert an affine matrix ``[R|T; 0 0 0 1]`` in closed form.
+
+    Builds ``[R^T | -R^T T]`` for the top three rows, which is the true inverse
+    only when ``R`` is orthonormal. Rows from index 3 onward are copied from
+    the input unchanged.
+    """
+    rot_t = A[..., :3, :3].mT
+    trans = A[..., :3, 3:]
+    top_rows = torch.cat([rot_t, -rot_t @ trans], dim=-1)
+    return torch.cat([top_rows, A[..., 3:, :]], dim=-2)
+
+
+# -----------------------------------------------------------------------------
+# Quaternion <-> rotation matrix (xyzw / scalar-last)
+# -----------------------------------------------------------------------------
+
+
+def _sqrt_positive_part(x: torch.Tensor) -> torch.Tensor:
+    """Return ``sqrt(max(0, x))``, yielding 0 (with zero subgradient) wherever ``x <= 0``."""
+    out = torch.zeros_like(x)
+    is_positive = x > 0
+    if not torch.is_grad_enabled():
+        # No autograd to protect: evaluate sqrt everywhere and mask the result.
+        return torch.where(is_positive, torch.sqrt(x), out)
+    # With autograd on, only take sqrt of the strictly positive entries so the
+    # non-positive ones never enter the sqrt backward pass.
+    out[is_positive] = torch.sqrt(x[is_positive])
+    return out
+
+
+def standardize_quaternion(quaternions: torch.Tensor) -> torch.Tensor:
+    """Flip the sign of xyzw quaternions whose real part (last component) is negative."""
+    real_is_negative = quaternions[..., 3:4] < 0
+    return torch.where(real_is_negative, -quaternions, quaternions)
+
+
+def quat_to_mat(quaternions: torch.Tensor) -> torch.Tensor:
+    """Convert xyzw (scalar-last) quaternions to (...,3,3) rotation matrices.
+
+    The input does not need to be unit length: every term is scaled by
+    ``2 / |q|^2``. A zero quaternion therefore divides by zero.
+    """
+    x, y, z, w = quaternions.unbind(-1)
+    scale = 2.0 / (quaternions * quaternions).sum(-1)
+    xx, yy, zz = x * x, y * y, z * z
+    xy, xz, yz = x * y, x * z, y * z
+    xw, yw, zw = x * w, y * w, z * w
+    # Row-major entries of the 3x3 matrix.
+    entries = (
+        1 - scale * (yy + zz), scale * (xy - zw), scale * (xz + yw),
+        scale * (xy + zw), 1 - scale * (xx + zz), scale * (yz - xw),
+        scale * (xz - yw), scale * (yz + xw), 1 - scale * (xx + yy),
+    )
+    return torch.stack(entries, -1).reshape(quaternions.shape[:-1] + (3, 3))
+
+
+def mat_to_quat(matrix: torch.Tensor) -> torch.Tensor:
+    """Convert (...,3,3) rotation matrices to quaternions (xyzw).
+
+    Four candidate quaternions are built per matrix, one per component used as
+    the divisor, and the candidate with the largest divisor is kept. The result
+    is reordered to scalar-last form and sign-standardized so the real part is
+    non-negative.
+
+    Raises:
+        ValueError: if the last two dimensions of ``matrix`` are not 3x3.
+    """
+    if matrix.size(-1) != 3 or matrix.size(-2) != 3:
+        raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")
+
+    batch_dim = matrix.shape[:-2]
+    m00, m01, m02, m10, m11, m12, m20, m21, m22 = torch.unbind(
+        matrix.reshape(batch_dim + (9,)), dim=-1
+    )
+
+    # One magnitude per component in (r, i, j, k) order, each computed as
+    # sqrt(max(0, 1 +/- diagonal terms)).
+    q_abs = _sqrt_positive_part(
+        torch.stack(
+            [
+                1.0 + m00 + m11 + m22,
+                1.0 + m00 - m11 - m22,
+                1.0 - m00 + m11 - m22,
+                1.0 - m00 - m11 + m22,
+            ],
+            dim=-1,
+        )
+    )
+
+    # Row n is a full (r, i, j, k) candidate that still has to be divided by
+    # 2 * q_abs[n]; its diagonal entry is q_abs[n] ** 2.
+    quat_by_rijk = torch.stack(
+        [
+            torch.stack([q_abs[..., 0] ** 2, m21 - m12, m02 - m20, m10 - m01], dim=-1),
+            torch.stack([m21 - m12, q_abs[..., 1] ** 2, m10 + m01, m02 + m20], dim=-1),
+            torch.stack([m02 - m20, m10 + m01, q_abs[..., 2] ** 2, m12 + m21], dim=-1),
+            torch.stack([m10 - m01, m20 + m02, m21 + m12, q_abs[..., 3] ** 2], dim=-1),
+        ],
+        dim=-2,
+    )
+
+    # Floor the divisor at 0.1 so rows with a tiny q_abs do not divide by ~0.
+    flr = torch.tensor(0.1).to(dtype=q_abs.dtype, device=q_abs.device)
+    quat_candidates = quat_by_rijk / (2.0 * q_abs[..., None].max(flr))
+
+    # Keep, per matrix, the candidate row whose q_abs entry is the largest.
+    out = quat_candidates[F.one_hot(q_abs.argmax(dim=-1), num_classes=4) > 0.5, :].reshape(
+        batch_dim + (4,)
+    )
+    # Reorder rijk -> xyzw (i.e. ijkr).
+    out = out[..., [1, 2, 3, 0]]
+    return standardize_quaternion(out)
+
+
+# -----------------------------------------------------------------------------
+# Pose-encoding <-> extrinsics + intrinsics
+# -----------------------------------------------------------------------------
+
+
+def extri_intri_to_pose_encoding(extrinsics: torch.Tensor, intrinsics: torch.Tensor, image_size_hw: Tuple[int, int]) -> torch.Tensor:
+    """Encode cameras as 9-D vectors ``[T (3), quat xyzw (4), fov_h, fov_w]``.
+
+    extrinsics: camera-to-world (c2w) (B,S,4,4) matrices,
+    intrinsics: pixel-space (B,S,3,3) matrices; only the ``[1, 1]`` and ``[0, 0]`` focal entries are read,
+    image_size_hw: an (H, W) pair used to turn focal lengths into fields of view.
+    The result is cast to float32.
+    """
+    translation = extrinsics[..., :3, 3]
+    rotation_quat = mat_to_quat(extrinsics[..., :3, :3])
+    height, width = image_size_hw
+    focal_y = intrinsics[..., 1, 1]
+    focal_x = intrinsics[..., 0, 0]
+    fov_h = 2 * torch.atan((height / 2) / focal_y)
+    fov_w = 2 * torch.atan((width / 2) / focal_x)
+    parts = [translation, rotation_quat, fov_h.unsqueeze(-1), fov_w.unsqueeze(-1)]
+    return torch.cat(parts, dim=-1).float()
+
+
+def pose_encoding_to_extri_intri(pose_encoding: torch.Tensor, image_size_hw: Tuple[int, int]) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Unpack a 9-D pose encoding into extrinsics and pixel-space intrinsics.
+
+    The last axis of ``pose_encoding`` is read as translation (3), xyzw
+    quaternion (4), vertical FoV and horizontal FoV. Any number of leading
+    batch axes is accepted. ``image_size_hw`` is an (H, W) pair.
+
+    Returns ``(extrinsics, intrinsics)`` shaped ``(...,3,4)`` and ``(...,3,3)``.
+    The extrinsics are ``[R|T]`` without the homogeneous bottom row, so this is
+    the inverse of ``extri_intri_to_pose_encoding`` only up to that row. The
+    principal point is placed at the image centre ``(W/2, H/2)``.
+    """
+    T = pose_encoding[..., :3]
+    quat = pose_encoding[..., 3:7]
+    fov_h = pose_encoding[..., 7]
+    fov_w = pose_encoding[..., 8]
+    # Normalize to unit quaternion. CameraDec outputs raw values; a near-zero
+    # quaternion causes two_s = 2/norm² → inf in quat_to_mat → NaN extrinsics.
+    quat = quat / quat.norm(dim=-1, keepdim=True).clamp(min=1e-6)
+    R = quat_to_mat(quat)
+    extrinsics = torch.cat([R, T[..., None]], dim=-1)
+    H, W = image_size_hw
+    # Clamp tan(fov/2) from below so a zero or negative value cannot blow up the focal length.
+    fy = (H / 2.0) / torch.clamp(torch.tan(fov_h / 2.0), 1e-6)
+    fx = (W / 2.0) / torch.clamp(torch.tan(fov_w / 2.0), 1e-6)
+    # Size the buffer from all leading axes rather than assuming exactly (B, S, 9).
+    intrinsics = torch.zeros(pose_encoding.shape[:-1] + (3, 3), device=pose_encoding.device, dtype=pose_encoding.dtype)
+    intrinsics[..., 0, 0] = fx
+    intrinsics[..., 1, 1] = fy
+    intrinsics[..., 0, 2] = W / 2
+    intrinsics[..., 1, 2] = H / 2
+    intrinsics[..., 2, 2] = 1.0
+    return extrinsics, intrinsics
--- a/comfy/ldm/ernie/model.py
+++ b/comfy/ldm/ernie/model.py
@ -5,6 +5,7 @@ import torch.nn.functional as F

 from comfy.ldm.modules.attention import optimized_attention
 import comfy.model_management
+import comfy.quant_ops

 def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
    assert dim % 2 == 0
@ -19,15 +20,6 @@ def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
    out = torch.stack([torch.cos(out), torch.sin(out)], dim=0)
    return out.to(dtype=torch.float32, device=pos.device)

-def apply_rotary_emb(x_in: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
-    rot_dim = freqs_cis.shape[-1]
-    x, x_pass = x_in[..., :rot_dim], x_in[..., rot_dim:]
-    cos_ = freqs_cis[0]
-    sin_ = freqs_cis[1]
-    x1, x2 = x.chunk(2, dim=-1)
-    x_rotated = torch.cat((-x2, x1), dim=-1)
-    return torch.cat((x * cos_ + x_rotated * sin_, x_pass), dim=-1)
-
 class ErnieImageEmbedND3(nn.Module):
    def __init__(self, dim: int, theta: int, axes_dim: tuple):
        super().__init__()
@ -37,8 +29,16 @@ class ErnieImageEmbedND3(nn.Module):

    def forward(self, ids: torch.Tensor) -> torch.Tensor:
        emb = torch.cat([rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(3)], dim=-1)
-        emb = emb.unsqueeze(3)  # [2, B, S, 1, head_dim//2]
-        return torch.stack([emb, emb], dim=-1).reshape(*emb.shape[:-1], -1)  # [B, S, 1, head_dim]
+        cos_ = emb[0]
+        sin_ = emb[1]
+        N = cos_.shape[-1]
+        half = N // 2
+        cos_top = cos_[..., :half].repeat_interleave(2, dim=-1)
+        sin_top = sin_[..., :half].repeat_interleave(2, dim=-1)
+        cos_bot = cos_[..., half:].repeat_interleave(2, dim=-1)
+        sin_bot = sin_[..., half:].repeat_interleave(2, dim=-1)
+        rot = torch.stack([cos_top, -sin_top, sin_bot, cos_bot], dim=-1)
+        return rot.reshape(*rot.shape[:-1], 2, 2).unsqueeze(2)

 class ErnieImagePatchEmbedDynamic(nn.Module):
    def __init__(self, in_channels: int, embed_dim: int, patch_size: int, operations, device=None, dtype=None):
@ -115,8 +115,7 @@ class ErnieImageAttention(nn.Module):
        key = self.norm_k(key)

        if image_rotary_emb is not None:
-            query = apply_rotary_emb(query, image_rotary_emb)
-            key = apply_rotary_emb(key, image_rotary_emb)
+            query, key = comfy.quant_ops.ck.apply_rope_split_half(query, key, image_rotary_emb)

        q_flat = query.reshape(B, S, -1)
        k_flat = key.reshape(B, S, -1)
@ -274,7 +273,7 @@ class ErnieImageModel(nn.Module):

        image_ids = image_ids.view(1, N_img, 3).expand(B, -1, -1)

-        rotary_pos_emb = self.pos_embed(torch.cat([image_ids, text_ids], dim=1)).to(x.dtype)
+        rotary_pos_emb = self.pos_embed(torch.cat([image_ids, text_ids], dim=1))
        del image_ids, text_ids

        sample = self.time_proj(timesteps).to(dtype)
--- a/comfy/ldm/flux/math.py
+++ b/comfy/ldm/flux/math.py
@ -4,7 +4,7 @@ from torch import Tensor

 from comfy.ldm.modules.attention import optimized_attention
 import comfy.model_management
-import logging
+import comfy.quant_ops


 def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None, transformer_options={}) -> Tensor:
@ -44,21 +44,15 @@ def _apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor):
    return apply_rope1(xq, freqs_cis), apply_rope1(xk, freqs_cis)


-try:
-    import comfy.quant_ops
-    q_apply_rope = comfy.quant_ops.ck.apply_rope
-    q_apply_rope1 = comfy.quant_ops.ck.apply_rope1
-    def apply_rope(xq, xk, freqs_cis):
-        if comfy.model_management.in_training:
-            return _apply_rope(xq, xk, freqs_cis)
-        else:
-            return apply_rope1(xq, freqs_cis), apply_rope1(xk, freqs_cis)
-    def apply_rope1(x, freqs_cis):
-        if comfy.model_management.in_training:
-            return _apply_rope1(x, freqs_cis)
-        else:
-            return q_apply_rope1(x, freqs_cis)
-except:
-    logging.warning("No comfy kitchen, using old apply_rope functions.")
-    apply_rope = _apply_rope
-    apply_rope1 = _apply_rope1
+def apply_rope(xq, xk, freqs_cis):
+    # Inference dispatches to comfy.quant_ops.ck; training uses the local _apply_rope.
+    if not comfy.model_management.in_training:
+        return comfy.quant_ops.ck.apply_rope(xq, xk, freqs_cis)
+    return _apply_rope(xq, xk, freqs_cis)
+
+
+def apply_rope1(x, freqs_cis):
+    # Inference dispatches to comfy.quant_ops.ck; training uses the local _apply_rope1.
+    if not comfy.model_management.in_training:
+        return comfy.quant_ops.ck.apply_rope1(x, freqs_cis)
+    return _apply_rope1(x, freqs_cis)
--- a/comfy/ldm/hunyuan3dv2_1/hunyuandit.py
+++ b/comfy/ldm/hunyuan3dv2_1/hunyuandit.py
@ -607,9 +607,13 @@ class HunYuanDiTPlain(nn.Module):
    def forward(self, x, t, context, transformer_options = {}, **kwargs):

        x = x.movedim(-1, -2)
-        if context.shape[0] >= 2:
-            uncond_emb, cond_emb = context.chunk(2, dim = 0)
-            context = torch.cat([cond_emb, uncond_emb], dim = 0)
+
+        swap_cfg_halves = context.shape[0] >= 2
+
+        if swap_cfg_halves:
+            first_half, second_half = context.chunk(2, dim = 0)
+            context = torch.cat([second_half, first_half], dim = 0)
+
        main_condition = context

        t = 1.0 - t
@ -657,8 +661,8 @@ class HunYuanDiTPlain(nn.Module):
        output = self.final_layer(combined)
        output =  output.movedim(-2, -1) * (-1.0)

-        if output.shape[0] >= 2:
-            cond_emb, uncond_emb = output.chunk(2, dim = 0)
-            return torch.cat([uncond_emb, cond_emb])
-        else:
-            return output
+        if swap_cfg_halves:
+            first_half, second_half = output.chunk(2, dim = 0)
+            output = torch.cat([second_half, first_half], dim = 0)
+
+        return output
--- a/comfy/ldm/ideogram4/model.py
+++ b/comfy/ldm/ideogram4/model.py
@ -0,0 +1,297 @@
+"""
+The Ideogram 4 transformer is a NextDiT/Lumina2-family single-stream model that
+consumes Qwen3-VL hidden-state features (concatenated from 13 layers -> 53248 dims) and
+packs ``[text tokens, image tokens]`` into one sequence with block-diagonal segment attention and 3D interleaved MRoPE.
+"""
+
+from __future__ import annotations
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import comfy.patcher_extension
+from comfy.ldm.lumina.model import FeedForward
+from comfy.ldm.modules.attention import optimized_attention_masked
+from comfy.text_encoders.llama import apply_rope, precompute_freqs_cis
+
+# Per-token role indicators
+SEQUENCE_PADDING_INDICATOR = -1
+OUTPUT_IMAGE_INDICATOR = 2
+LLM_TOKEN_INDICATOR = 3
+# Image grid coordinates are offset so they never collide with text positions
+IMAGE_POSITION_OFFSET = 65536
+
+
+class Ideogram4Attention(nn.Module):
+    """Self-attention with a fused QKV projection, per-head RMSNorm on q/k, and RoPE."""
+
+    def __init__(self, hidden_size, num_heads, eps=1e-5, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = hidden_size // num_heads
+        self.hidden_size = hidden_size
+
+        factory = {"dtype": dtype, "device": device}
+        self.qkv = operations.Linear(hidden_size, hidden_size * 3, bias=False, **factory)
+        self.norm_q = operations.RMSNorm(self.head_dim, eps=eps, elementwise_affine=True, **factory)
+        self.norm_k = operations.RMSNorm(self.head_dim, eps=eps, elementwise_affine=True, **factory)
+        self.o = operations.Linear(hidden_size, hidden_size, bias=False, **factory)
+
+    def forward(self, x, attn_mask, freqs_cis, transformer_options={}):
+        """Attend over ``x`` (batch, tokens, hidden) and return the output projection."""
+        bsz, n_tokens, _ = x.shape
+        fused = self.qkv(x).view(bsz, n_tokens, 3, self.num_heads, self.head_dim)
+        q, k, v = fused.unbind(dim=2)
+
+        # Normalize q/k per head, then move heads ahead of tokens: (B, heads, L, head_dim).
+        q = self.norm_q(q).transpose(1, 2)
+        k = self.norm_k(k).transpose(1, 2)
+        v = v.transpose(1, 2)
+
+        q, k = apply_rope(q, k, freqs_cis)
+
+        attended = optimized_attention_masked(q, k, v, self.num_heads, attn_mask, skip_reshape=True, transformer_options=transformer_options)
+        return self.o(attended)
+
+
+class Ideogram4TransformerBlock(nn.Module):
+    """Attention + feed-forward block with pre/post RMSNorm and adaLN scale/gate modulation."""
+
+    def __init__(self, hidden_size, intermediate_size, num_heads, norm_eps, adaln_dim, dtype=None, device=None, operations=None):
+        super().__init__()
+
+        def make_norm():
+            return operations.RMSNorm(hidden_size, eps=norm_eps, elementwise_affine=True, dtype=dtype, device=device)
+
+        # The attention's internal q/k norms use a fixed eps of 1e-5, independent of norm_eps.
+        self.attention = Ideogram4Attention(hidden_size, num_heads, eps=1e-5, dtype=dtype, device=device, operations=operations)
+        self.feed_forward = FeedForward(
+            dim=hidden_size, hidden_dim=intermediate_size, multiple_of=1, ffn_dim_multiplier=None,
+            operation_settings={"operations": operations, "dtype": dtype, "device": device},
+        )
+
+        self.attention_norm1 = make_norm()
+        self.ffn_norm1 = make_norm()
+        self.attention_norm2 = make_norm()
+        self.ffn_norm2 = make_norm()
+
+        self.adaln_modulation = operations.Linear(adaln_dim, 4 * hidden_size, bias=True, dtype=dtype, device=device)
+
+    def forward(self, x, attn_mask, freqs_cis, adaln_input, transformer_options={}):
+        """Apply the modulated attention and feed-forward residual branches to ``x``."""
+        scale_msa, gate_msa, scale_mlp, gate_mlp = self.adaln_modulation(adaln_input).chunk(4, dim=-1)
+        # Gates are squashed to (-1, 1); scales are offsets around 1.
+        gate_msa = torch.tanh(gate_msa)
+        gate_mlp = torch.tanh(gate_mlp)
+
+        attn_in = self.attention_norm1(x) * (1.0 + scale_msa)
+        attn_out = self.attention(attn_in, attn_mask, freqs_cis, transformer_options=transformer_options)
+        x = x + gate_msa * self.attention_norm2(attn_out)
+
+        ffn_in = self.ffn_norm1(x) * (1.0 + scale_mlp)
+        return x + gate_mlp * self.ffn_norm2(self.feed_forward(ffn_in))
+
+
+def _sinusoidal_embedding(t, dim, scale=1e4):
+    """Return a ``[sin | cos]`` sinusoidal embedding of ``t`` with ``dim`` features.
+
+    ``t`` is cast to float32 and gains one trailing axis of size ``dim``. The
+    ``dim // 2`` frequencies decay geometrically from 1 down to ``1 / scale``.
+    An odd ``dim`` is zero-padded by one trailing feature.
+    """
+    t = t.to(torch.float32)
+    half = dim // 2
+    # With dim of 2 or 3 there is a single frequency and half - 1 is 0, which
+    # used to raise ZeroDivisionError; clamp the denominator to at least 1.
+    step = math.log(scale) / max(half - 1, 1)
+    freq = torch.exp(torch.arange(half, dtype=torch.float32, device=t.device) * -step)
+    emb = t.unsqueeze(-1) * freq
+    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
+    if dim % 2 == 1:
+        emb = F.pad(emb, (0, 1))
+    return emb
+
+
+class Ideogram4EmbedScalar(nn.Module):
+    """Embed a scalar: rescale to [0, 1e4] over ``input_range``, sinusoid-encode, then a two-layer MLP."""
+
+    def __init__(self, dim, input_range=(0.0, 1.0), dtype=None, device=None, operations=None):
+        super().__init__()
+        self.dim = dim
+        self.range_min, self.range_max = input_range
+        self.mlp_in = operations.Linear(dim, dim, bias=True, dtype=dtype, device=device)
+        self.mlp_out = operations.Linear(dim, dim, bias=True, dtype=dtype, device=device)
+
+    def forward(self, x):
+        """Return the ``dim``-wide embedding of ``x``; the sinusoid is computed in float32."""
+        span = self.range_max - self.range_min
+        scaled = 1e4 * (x.to(torch.float32) - self.range_min) / span
+        # Match the MLP weight dtype before the projection.
+        emb = _sinusoidal_embedding(scaled, self.dim).to(self.mlp_in.weight.dtype)
+        hidden = F.silu(self.mlp_in(emb))
+        return self.mlp_out(hidden)
+
+
+class Ideogram4FinalLayer(nn.Module):
+    """Output head: non-affine LayerNorm, adaLN scale from the conditioning, then a linear projection."""
+
+    def __init__(self, hidden_size, out_channels, adaln_dim, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.norm_final = operations.LayerNorm(hidden_size, eps=1e-6, elementwise_affine=False, dtype=dtype, device=device)
+        self.linear = operations.Linear(hidden_size, out_channels, bias=True, dtype=dtype, device=device)
+        self.adaln_modulation = operations.Linear(adaln_dim, hidden_size, bias=True, dtype=dtype, device=device)
+
+    def forward(self, x, c):
+        """Project tokens ``x`` to ``out_channels``, scaled by ``1 + adaln_modulation(silu(c))``."""
+        modulation = self.adaln_modulation(F.silu(c))
+        normed = self.norm_final(x)
+        return self.linear(normed * (1.0 + modulation))
+
+
+class Ideogram4Transformer(nn.Module):
+    """A single Ideogram 4 backbone operating on a packed token sequence.
+
+    Holds the input/conditioning projections, the timestep embedding, the
+    stack of ``Ideogram4TransformerBlock`` layers and the final output layer.
+    Callers supply the packed tokens together with per-token position ids,
+    role indicators and an optional attention mask.
+    """
+
+    def __init__(self, emb_dim, num_layers, num_heads, intermediate_size, adaln_dim,
+                 in_channels, llm_features_dim, rope_theta, mrope_section, norm_eps,
+                 dtype=None, device=None, operations=None):
+        super().__init__()
+        self.head_dim = emb_dim // num_heads
+        self.rope_theta = rope_theta
+        self.mrope_section = tuple(mrope_section)
+
+        self.input_proj = operations.Linear(in_channels, emb_dim, bias=True, dtype=dtype, device=device)
+        self.llm_cond_norm = operations.RMSNorm(llm_features_dim, eps=1e-6, elementwise_affine=True, dtype=dtype, device=device)
+        self.llm_cond_proj = operations.Linear(llm_features_dim, emb_dim, bias=True, dtype=dtype, device=device)
+        self.t_embedding = Ideogram4EmbedScalar(emb_dim, input_range=(0.0, 1.0), dtype=dtype, device=device, operations=operations)
+        self.adaln_proj = operations.Linear(emb_dim, adaln_dim, bias=True, dtype=dtype, device=device)
+
+        # Two-entry table indexed by "is this token an output-image token" (0 or 1).
+        self.embed_image_indicator = operations.Embedding(2, emb_dim, dtype=dtype, device=device)
+
+        self.layers = nn.ModuleList([
+            Ideogram4TransformerBlock(emb_dim, intermediate_size, num_heads, norm_eps, adaln_dim,
+                                      dtype=dtype, device=device, operations=operations)
+            for _ in range(num_layers)
+        ])
+
+        self.final_layer = Ideogram4FinalLayer(emb_dim, in_channels, adaln_dim, dtype=dtype, device=device, operations=operations)
+
+    def _backbone(self, llm_features, x, t, position_ids, attn_mask, indicator, transformer_options={}):
+        """Run the packed sequence through every layer and the final projection.
+
+        ``x`` holds per-token latent features and ``indicator`` the per-token
+        role ids. ``llm_features`` may be ``None``; otherwise it supplies text
+        features for the first ``llm_features.shape[1]`` tokens. A boolean
+        ``attn_mask`` is converted to an additive mask before use.
+        """
+        indicator = indicator.to(torch.long)
+        output_image_mask = (indicator == OUTPUT_IMAGE_INDICATOR).to(x.dtype).unsqueeze(-1)
+
+        # Mask before and after the projection: input_proj has a bias, so the
+        # second multiply zeroes the non-image rows again.
+        x = x * output_image_mask
+        h = self.input_proj(x) * output_image_mask
+
+        t_cond = self.t_embedding(t)
+        if t.dim() == 1:
+            # Insert a token axis so the conditioning broadcasts over the sequence.
+            t_cond = t_cond.unsqueeze(1)
+        adaln_input = F.silu(self.adaln_proj(t_cond))
+
+        # h is zero on the text rows (content lives only on image rows), add writes the text features in place
+        if llm_features is not None:
+            L_text = llm_features.shape[1]
+            # Only rows tagged LLM_TOKEN_INDICATOR contribute; others are zeroed before and after the projection.
+            text_mask = (indicator[:, :L_text] == LLM_TOKEN_INDICATOR).to(x.dtype).unsqueeze(-1)
+            llm = self.llm_cond_norm(llm_features * text_mask)
+            llm = self.llm_cond_proj(llm) * text_mask
+            h[:, :L_text] = h[:, :L_text] + llm
+
+        # Every token gets a learned role embedding: index 1 for output-image tokens, 0 otherwise.
+        h = h + self.embed_image_indicator((indicator == OUTPUT_IMAGE_INDICATOR).to(torch.long), out_dtype=h.dtype)
+
+        # Qwen3-VL interleaved MRoPE; position_ids (B, L, 3) -> (3, L) (same across batch).
+        freqs_cis = precompute_freqs_cis(
+            self.head_dim, position_ids[0].transpose(0, 1), self.rope_theta,
+            rope_dims=self.mrope_section, interleaved_mrope=True, device=position_ids.device,
+        )
+
+        # Convert a boolean "may attend" mask into an additive one: 0 where
+        # allowed, the most negative finite value of h's dtype where blocked.
+        if attn_mask is not None and attn_mask.dtype == torch.bool:
+            attn_mask = torch.zeros_like(attn_mask, dtype=h.dtype).masked_fill_(~attn_mask, -torch.finfo(h.dtype).max)
+
+        for layer in self.layers:
+            h = layer(h, attn_mask, freqs_cis, adaln_input, transformer_options=transformer_options)
+
+        return self.final_layer(h, adaln_input)
+
+
+class Ideogram4Transformer2DModel(Ideogram4Transformer):
+    """Ideogram 4 single-stream DiT.
+
+    Runs a packed ``[text, image]`` sequence when text context is supplied, or an image-only sequence when ``context is None``.
+    """
+
+    def __init__(self, image_model=None, in_channels=128, num_layers=34, num_attention_heads=18, attention_head_dim=256, intermediate_size=12288,
+                 adaln_dim=512, llm_features_dim=53248, rope_theta=5000000, mrope_section=(24, 20, 20), norm_eps=1e-5,
+                 dtype=None, device=None, operations=None, **kwargs):
+        # image_model and **kwargs are accepted but not read in this constructor.
+        emb_dim = num_attention_heads * attention_head_dim
+        super().__init__(
+            emb_dim=emb_dim, num_layers=num_layers, num_heads=num_attention_heads,
+            intermediate_size=intermediate_size, adaln_dim=adaln_dim, in_channels=in_channels,
+            llm_features_dim=llm_features_dim, rope_theta=rope_theta, mrope_section=mrope_section,
+            norm_eps=norm_eps, dtype=dtype, device=device, operations=operations)
+        self.dtype = dtype
+        self.in_channels = in_channels
+        self.out_channels = in_channels
+        # 128-dim token = patch (2x2) * ae_channels (32).
+        self.patch_size = 2
+        self.ae_channels = in_channels // (self.patch_size * self.patch_size)
+
+    def _img_to_tokens(self, x):
+        """Turn a ``(B, C, gh, gw)`` latent into ``(B, gh*gw, C)`` tokens.
+
+        The channel axis is split as ``(ae_channels, patch, patch)`` and
+        re-laid out channel-last within each token.
+        """
+        B, C, gh, gw = x.shape
+        x = x.view(B, self.ae_channels, self.patch_size, self.patch_size, gh, gw)
+        x = x.permute(0, 4, 5, 2, 3, 1)  # (B, gh, gw, pi, pj, c)
+        return x.reshape(B, gh * gw, C)
+
+    def _tokens_to_img(self, tokens, gh, gw):
+        """Inverse of ``_img_to_tokens``: ``(B, gh*gw, C)`` tokens back to ``(B, C, gh, gw)``."""
+        B = tokens.shape[0]
+        C = tokens.shape[-1]
+        x = tokens.reshape(B, gh, gw, self.patch_size, self.patch_size, self.ae_channels)
+        x = x.permute(0, 5, 3, 4, 1, 2)  # (B, c, pi, pj, gh, gw)
+        return x.reshape(B, C, gh, gw)
+
+    def _image_position_ids(self, gh, gw, device):
+        """Return ``(gh*gw, 3)`` position ids ``(t, h, w)`` in row-major grid order.
+
+        ``t`` is 0 for every token, and ``IMAGE_POSITION_OFFSET`` is added to
+        all three axes.
+        """
+        h_idx = torch.arange(gh, device=device).view(-1, 1).expand(gh, gw).reshape(-1)
+        w_idx = torch.arange(gw, device=device).view(1, -1).expand(gh, gw).reshape(-1)
+        t_idx = torch.zeros_like(h_idx)
+        return torch.stack([t_idx, h_idx, w_idx], dim=1) + IMAGE_POSITION_OFFSET  # (L_img, 3)
+
+    def _run_conditional(self, x_chunk, context_chunk, attn_mask_chunk, t_chunk, gh, gw, transformer_options):
+        """Run the backbone on a packed ``[text, image]`` sequence; return the image part as a latent.
+
+        ``attn_mask_chunk`` is optional; entries equal to 0 are treated as text
+        padding (assumed to be shaped ``(B, L_text)`` -- TODO confirm against the caller).
+        """
+        B = x_chunk.shape[0]
+        device = x_chunk.device
+        img_tokens = self._img_to_tokens(x_chunk)
+        L_img = img_tokens.shape[1]
+        L_text = context_chunk.shape[1]
+        L = L_text + L_img
+        latent_dim = img_tokens.shape[-1]
+
+        # Text rows carry no latent content; they stay zero here.
+        x_full = torch.zeros(B, L, latent_dim, dtype=img_tokens.dtype, device=device)
+        x_full[:, L_text:] = img_tokens
+
+        # Text tokens use the same running index on all three axes.
+        text_pos = torch.arange(L_text, device=device).view(-1, 1).expand(L_text, 3)
+        img_pos = self._image_position_ids(gh, gw, device)
+        position_ids = torch.cat([text_pos, img_pos], dim=0).unsqueeze(0).expand(B, L, 3)
+
+        indicator = torch.empty(B, L, dtype=torch.long, device=device)
+        indicator[:, :L_text] = LLM_TOKEN_INDICATOR
+        indicator[:, L_text:] = OUTPUT_IMAGE_INDICATOR
+
+        attn_mask = None
+        if attn_mask_chunk is not None:
+            # All real tokens share segment 1; padded text tokens get their own
+            # segment, so they only attend among themselves.
+            segment_ids = torch.ones(B, L, dtype=torch.long, device=device)
+            pad = (attn_mask_chunk == 0)
+            segment_ids[:, :L_text][pad] = SEQUENCE_PADDING_INDICATOR
+            # Indicator 0 matches neither the text nor the image role, so the
+            # backbone zeroes the text features of padded rows.
+            indicator[:, :L_text][pad] = 0
+            # Block-diagonal mask from segment ids: (B, 1, L, L), True = attend.
+            attn_mask = (segment_ids.unsqueeze(2) == segment_ids.unsqueeze(1)).unsqueeze(1)
+
+        out = self._backbone(context_chunk, x_full, t_chunk, position_ids, attn_mask, indicator,
+                             transformer_options=transformer_options)
+        return self._tokens_to_img(out[:, L_text:], gh, gw)
+
+    def _run_image_only(self, x_chunk, t_chunk, gh, gw, transformer_options):
+        """Run the backbone on image tokens alone (no text features, no attention mask)."""
+        B = x_chunk.shape[0]
+        device = x_chunk.device
+        img_tokens = self._img_to_tokens(x_chunk)
+        L_img = img_tokens.shape[1]
+
+        position_ids = self._image_position_ids(gh, gw, device).unsqueeze(0).expand(B, L_img, 3)
+        indicator = torch.full((B, L_img), OUTPUT_IMAGE_INDICATOR, dtype=torch.long, device=device)
+
+        # Image-only sequence is a single segment -> no mask, full attention, no LLM context.
+        out = self._backbone(None, img_tokens, t_chunk, position_ids, None, indicator, transformer_options=transformer_options)
+        return self._tokens_to_img(out, gh, gw)
+
+    def forward(self, x, timesteps, context=None, attention_mask=None, transformer_options={}, **kwargs):
+        """Entry point: runs ``_forward`` through any DIFFUSION_MODEL wrappers found in ``transformer_options``."""
+        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
+            self._forward,
+            self,
+            comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options),
+        ).execute(x, timesteps, context, attention_mask, transformer_options, **kwargs)
+
+    def _forward(self, x, timesteps, context=None, attention_mask=None, transformer_options={}, **kwargs):
+        """Dispatch to the conditional or image-only path and negate the result.
+
+        The timestep is flipped to ``1 - t`` and the output sign is inverted,
+        presumably to bridge the sampler's and the model's conventions -- TODO confirm.
+        """
+        # bs and c are unpacked for shape validation only; they are not used below.
+        bs, c, gh, gw = x.shape
+
+        timesteps = 1.0 - timesteps
+
+        # unconditional pass
+        if context is None:
+            return -self._run_image_only(x, timesteps, gh, gw, transformer_options)
+
+        return -self._run_conditional(x, context, attention_mask, timesteps, gh, gw, transformer_options)
--- a/comfy/ldm/lens/model.py
+++ b/comfy/ldm/lens/model.py
@ -0,0 +1,510 @@
+"""Lens denoising transformer (DiT)"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import comfy.ldm.flux.layers
+import comfy.patcher_extension
+from comfy.ldm.flux.layers import EmbedND
+from comfy.ldm.flux.math import apply_rope
+from comfy.ldm.modules.attention import optimized_attention
+
+
+def _lens_time_proj(t: torch.Tensor, dim: int = 256) -> torch.Tensor:  # thin wrapper over the Flux-layers timestep_embedding helper
+    return comfy.ldm.flux.layers.timestep_embedding(t, dim)  # helper's remaining parameters stay at their defaults
+
+
+def _lens_position_ids(
+    frame: int, height: int, width: int, text_seq_len: int,
+    scale_rope: bool = True, device=None,
+) -> torch.Tensor:
+    """Lens axial (frame, h, w) position ids for joint image + text sequence.
+
+    With ``scale_rope=True`` h/w are centered around 0 (negative + positive
+    halves) and text starts at ``max(h//2, w//2)``. Result shape ``[seq, 3]``;
+    caller adds a batch dim for ``EmbedND``.
+    """
+    if scale_rope:
+        h_pos = torch.cat([torch.arange(-(height - height // 2), 0, device=device),  # e.g. height=5 -> [-3, -2, -1] ...
+                           torch.arange(0, height // 2, device=device)])  # ... followed by [0, 1]; odd sizes put the extra slot on the negative side
+        w_pos = torch.cat([torch.arange(-(width - width // 2), 0, device=device),
+                           torch.arange(0, width // 2, device=device)])
+        text_start = max(height // 2, width // 2)  # first id past the largest positive image coordinate
+    else:
+        h_pos = torch.arange(height, device=device)
+        w_pos = torch.arange(width, device=device)
+        text_start = max(height, width)
+
+    f_pos = torch.arange(frame, device=device)
+    img_ids = torch.zeros(frame, height, width, 3, device=device)  # default float dtype; the integer aranges are cast on assignment
+    img_ids[..., 0] = f_pos[:, None, None]  # frame index, broadcast over (height, width)
+    img_ids[..., 1] = h_pos[None, :, None]  # row coordinate
+    img_ids[..., 2] = w_pos[None, None, :]  # column coordinate
+    img_ids = img_ids.reshape(-1, 3)  # flatten to [frame * height * width, 3], width varying fastest
+
+    # Text positions replicate across all 3 axes (matches original packing).
+    txt_pos = torch.arange(text_start, text_start + text_seq_len, device=device).float()  # cast to float32 before stacking with img_ids
+    txt_ids = txt_pos[:, None].expand(text_seq_len, 3)
+
+    return torch.cat([img_ids, txt_ids], dim=0)  # image ids first, text ids after
+
+
+class _TimestepEmbedder(nn.Module):  # two-layer MLP: Linear -> SiLU -> Linear
+    def __init__(self, in_channels: int, time_embed_dim: int, dtype=None, device=None, operations=None) -> None:  # layers are built from the supplied operations namespace
+        super().__init__()
+        self.linear_1 = operations.Linear(in_channels, time_embed_dim, dtype=dtype, device=device)
+        self.linear_2 = operations.Linear(time_embed_dim, time_embed_dim, dtype=dtype, device=device)  # keeps the width at time_embed_dim
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.linear_1(x)
+        x = F.silu(x)
+        return self.linear_2(x)
+
+
+class LensTimestepProjEmbeddings(nn.Module):  # timestep -> 256-feature projection -> _TimestepEmbedder MLP
+    def __init__(self, embedding_dim: int, dtype=None, device=None, operations=None) -> None:
+        super().__init__()
+        self.timestep_embedder = _TimestepEmbedder(256, embedding_dim, dtype=dtype, device=device, operations=operations)  # 256 must match the projection width used in forward
+
+    def forward(self, timestep: torch.Tensor, hidden_states: torch.Tensor) -> torch.Tensor:  # hidden_states is consulted only for its dtype
+        proj = _lens_time_proj(timestep, 256)
+        return self.timestep_embedder(proj.to(dtype=hidden_states.dtype))
+
+
+class GateMLP(nn.Module):
+    """SwiGLU MLP."""
+
+    def __init__(self, dim: int, hidden_dim: int, dtype=None, device=None, operations=None) -> None:
+        super().__init__()
+        self.w1 = operations.Linear(dim, hidden_dim, bias=False, dtype=dtype, device=device)  # gate branch (goes through SiLU)
+        self.w2 = operations.Linear(hidden_dim, dim, bias=False, dtype=dtype, device=device)  # projection back to dim
+        self.w3 = operations.Linear(dim, hidden_dim, bias=False, dtype=dtype, device=device)  # linear branch multiplied into the gate
+
+    def forward(self, x):
+        return self.w2(F.silu(self.w1(x), inplace=True).mul_(self.w3(x)))  # w2(silu(w1 x) * w3 x); in-place ops reuse w1's output buffer - NOTE(review): presumably inference-only, confirm
+
+
+class LensJointAttention(nn.Module):
+    """Joint image+text attention with fused QKV per stream."""
+
+    def __init__(
+        self,
+        query_dim: int,
+        added_kv_proj_dim: int,
+        dim_head: int = 64,
+        heads: int = 8,
+        out_dim: Optional[int] = None,
+        eps: float = 1e-5,
+        dtype=None,
+        device=None,
+        operations=None,
+    ) -> None:
+        super().__init__()
+        self.inner_dim = out_dim if out_dim is not None else dim_head * heads  # out_dim, when given, takes precedence over dim_head * heads
+        self.heads = self.inner_dim // dim_head  # derived from inner_dim, so the heads argument is overridden whenever out_dim is set
+        self.dim_head = dim_head
+        self.out_dim = out_dim if out_dim is not None else query_dim
+
+        self.norm_q = operations.RMSNorm(dim_head, eps=eps, dtype=dtype, device=device)  # q/k norms act per head, over the dim_head axis
+        self.norm_k = operations.RMSNorm(dim_head, eps=eps, dtype=dtype, device=device)
+        self.norm_added_q = operations.RMSNorm(dim_head, eps=eps, dtype=dtype, device=device)  # "added" = text stream
+        self.norm_added_k = operations.RMSNorm(dim_head, eps=eps, dtype=dtype, device=device)
+
+        self.img_qkv = operations.Linear(query_dim, 3 * self.inner_dim, bias=True, dtype=dtype, device=device)  # fused q, k, v for the image stream
+        self.txt_qkv = operations.Linear(added_kv_proj_dim, 3 * self.inner_dim, bias=True, dtype=dtype, device=device)  # fused q, k, v for the text stream
+
+        # ModuleList([Linear, Identity]) for state-dict key compatibility.
+        self.to_out = nn.ModuleList([
+            operations.Linear(self.inner_dim, self.out_dim, bias=True, dtype=dtype, device=device),
+            nn.Identity(),
+        ])
+        self.to_add_out = operations.Linear(self.inner_dim, query_dim, bias=True, dtype=dtype, device=device)  # text-stream output projection
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        freqs_cis: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        transformer_options: Optional[Dict[str, Any]] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        bsz, seq_img, _ = hidden_states.shape
+        seq_txt = encoder_hidden_states.shape[1]
+
+        # image stream: fused projection viewed as [B, S_img, 3, H, D], then split into q/k/v
+        img_qkv = self.img_qkv(hidden_states).view(bsz, seq_img, 3, self.heads, self.dim_head)
+        img_q, img_k, img_v = img_qkv.unbind(dim=2)
+        img_q = self.norm_q(img_q)
+        img_k = self.norm_k(img_k)
+        del img_qkv  # drops the name only; img_v is still a view into this buffer
+
+        # text stream: same layout, with its own projection and q/k norms
+        txt_qkv = self.txt_qkv(encoder_hidden_states).view(bsz, seq_txt, 3, self.heads, self.dim_head)
+        txt_q, txt_k, txt_v = txt_qkv.unbind(dim=2)
+        txt_q = self.norm_added_q(txt_q)
+        txt_k = self.norm_added_k(txt_k)
+
+        # [B, S, H, D] → [B, H, S, D] for attention, dels to avoid VRAM peaks
+        q = torch.cat([img_q, txt_q], dim=1).transpose(1, 2)  # joint sequence order: image tokens first, text tokens after
+        del img_q, txt_q
+        k = torch.cat([img_k, txt_k], dim=1).transpose(1, 2)
+        del img_k, txt_k
+        v = torch.cat([img_v, txt_v], dim=1).transpose(1, 2)
+        del img_v, txt_v
+
+        q, k = apply_rope(q, k, freqs_cis)  # rotary embedding on q and k only; v is left as is
+
+        if attention_mask is not None:
+            expected = (bsz, 1, 1, seq_img + seq_txt)  # one value per joint position; singleton dims are left for broadcasting
+            if attention_mask.shape != expected:
+                raise ValueError(
+                    f"attention_mask must be {expected}, got {tuple(attention_mask.shape)}"
+                )
+            attention_mask = attention_mask.to(q.dtype)  # cast repeated on every call
+
+        out = optimized_attention(
+            q, k, v, self.heads, mask=attention_mask, skip_reshape=True,  # q/k/v are already [B, H, S, D]
+            transformer_options=transformer_options,
+        )
+
+        img_out = self.to_out[1](self.to_out[0](out[:, :seq_img, :]))  # to_out[1] is nn.Identity, so only the Linear does work
+        txt_out = self.to_add_out(out[:, seq_img:, :])  # remaining positions belong to the text stream
+        return img_out, txt_out
+
+
+class LensTransformerBlock(nn.Module):  # dual-stream block: joint attention plus a separate gated MLP for image and text
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        eps: float = 1e-6,
+        rms_norm: bool = True,
+        dtype=None,
+        device=None,
+        operations=None,
+    ) -> None:
+        super().__init__()
+
+        self.attn = LensJointAttention(
+            query_dim=dim,
+            added_kv_proj_dim=dim,
+            dim_head=attention_head_dim,
+            heads=num_attention_heads,
+            out_dim=dim,
+            eps=1e-5,  # fixed for the attention q/k norms; the block-level eps only reaches the four norms below
+            dtype=dtype,
+            device=device,
+            operations=operations,
+        )
+
+        if rms_norm:
+            NormCls = operations.RMSNorm
+            norm_kwargs = {}
+        else:
+            NormCls = operations.LayerNorm
+            norm_kwargs = {"elementwise_affine": False}  # LayerNorm variant is built without learnable affine parameters
+
+        mlp_hidden = int(dim / 3 * 8)  # 8/3 * dim truncated to int, e.g. dim=1536 -> 4096
+
+        # Sequential(SiLU, Linear) so state-dict lands at img_mod.1.{weight,bias}.
+        self.img_mod = nn.Sequential(
+            nn.SiLU(),
+            operations.Linear(dim, 6 * dim, bias=True, dtype=dtype, device=device),  # 6 * dim = (shift, scale, gate) for attention + (shift, scale, gate) for the MLP
+        )
+        self.img_norm1 = NormCls(dim, eps=eps, dtype=dtype, device=device, **norm_kwargs)
+        self.img_norm2 = NormCls(dim, eps=eps, dtype=dtype, device=device, **norm_kwargs)
+        self.img_mlp = GateMLP(dim, mlp_hidden, dtype=dtype, device=device, operations=operations)
+
+        self.txt_mod = nn.Sequential(
+            nn.SiLU(),
+            operations.Linear(dim, 6 * dim, bias=True, dtype=dtype, device=device),
+        )
+        self.txt_norm1 = NormCls(dim, eps=eps, dtype=dtype, device=device, **norm_kwargs)
+        self.txt_norm2 = NormCls(dim, eps=eps, dtype=dtype, device=device, **norm_kwargs)
+        self.txt_mlp = GateMLP(dim, mlp_hidden, dtype=dtype, device=device, operations=operations)
+
+    @staticmethod
+    def _modulate(x: torch.Tensor, mod_params: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:  # returns (modulated x, gate)
+        shift, scale, gate = mod_params.chunk(3, dim=-1)  # chunk order is shift, scale, gate
+        return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1), gate.unsqueeze(1)  # unsqueeze(1) adds the axis that broadcasts over tokens - assumes mod_params is [B, 3 * dim]
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        temb: torch.Tensor,
+        freqs_cis: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        transformer_options: Optional[Dict[str, Any]] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        img_mod1, img_mod2 = self.img_mod(temb).chunk(2, dim=-1)  # first half modulates attention, second half the MLP
+        txt_mod1, txt_mod2 = self.txt_mod(temb).chunk(2, dim=-1)
+
+        img_modulated, img_gate1 = self._modulate(self.img_norm1(hidden_states), img_mod1)
+        txt_modulated, txt_gate1 = self._modulate(self.txt_norm1(encoder_hidden_states), txt_mod1)
+
+        img_attn, txt_attn = self.attn(
+            hidden_states=img_modulated,
+            encoder_hidden_states=txt_modulated,
+            freqs_cis=freqs_cis,
+            attention_mask=attention_mask,
+            transformer_options=transformer_options,
+        )
+
+        hidden_states = hidden_states + img_gate1 * img_attn  # gated residual, image stream
+        encoder_hidden_states = encoder_hidden_states + txt_gate1 * txt_attn  # gated residual, text stream
+
+        img_modulated2, img_gate2 = self._modulate(self.img_norm2(hidden_states), img_mod2)
+        hidden_states = hidden_states + img_gate2 * self.img_mlp(img_modulated2)
+
+        txt_modulated2, txt_gate2 = self._modulate(self.txt_norm2(encoder_hidden_states), txt_mod2)
+        encoder_hidden_states = encoder_hidden_states + txt_gate2 * self.txt_mlp(txt_modulated2)
+
+        return encoder_hidden_states, hidden_states  # note the order: text stream first, image stream second
+
+
+class _AdaLayerNormContinuousNoAffine(nn.Module):
+    """AdaLayerNormContinuous(elementwise_affine=False).
+
+    The reference uses ``scale, shift = chunk(2)`` (scale first) — opposite
+    to Flux's ``LastLayer``.
+    """
+
+    def __init__(self, embedding_dim: int, conditioning_embedding_dim: int, eps: float = 1e-6,
+                 dtype=None, device=None, operations=None) -> None:
+        super().__init__()
+        self.linear = operations.Linear(
+            conditioning_embedding_dim, embedding_dim * 2, bias=True, dtype=dtype, device=device  # 2 * embedding_dim = scale + shift
+        )
+        self.eps = eps
+        self.embedding_dim = embedding_dim
+
+    def forward(self, x: torch.Tensor, conditioning: torch.Tensor) -> torch.Tensor:
+        emb = self.linear(F.silu(conditioning))  # SiLU is applied to the conditioning before the projection
+        scale, shift = torch.chunk(emb, 2, dim=-1)
+        x = F.layer_norm(x, (self.embedding_dim,), None, None, self.eps)  # weight=None, bias=None: normalisation only
+        return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)  # unsqueeze(1) adds the axis that broadcasts against dim 1 of x
+
+
+class LensTransformer2DModel(nn.Module):
+    """Lens dual-stream MMDiT (48 blocks, inner_dim=1536, multi-layer text)."""
+
+    def __init__(
+        self,
+        patch_size: int = 2,
+        in_channels: int = 128,
+        out_channels: Optional[int] = 32,
+        num_layers: int = 48,
+        attention_head_dim: int = 64,
+        num_attention_heads: int = 24,
+        enc_hidden_dim: int = 2880,
+        axes_dims_rope: Tuple[int, int, int] = (8, 28, 28),
+        rms_norm: bool = True,
+        multi_layer_encoder_feature: bool = True,
+        selected_layer_index: Tuple[int, ...] = (5, 11, 17, 23),
+        image_model=None,  # unused; accepted for detection-side configs.
+        dtype=None,
+        device=None,
+        operations=None,
+    ) -> None:
+        super().__init__()
+        self.patch_size = patch_size
+        self.in_channels = in_channels
+        self.out_channels = out_channels if out_channels is not None else in_channels  # None falls back to in_channels
+        self.inner_dim = num_attention_heads * attention_head_dim  # 24 * 64 = 1536 with the defaults
+        self.multi_layer_encoder_feature = multi_layer_encoder_feature
+        self.selected_layer_index = list(selected_layer_index)  # only its length is used inside this class
+        self.dtype = dtype
+
+        self.pos_embed = EmbedND(dim=attention_head_dim, theta=10000, axes_dim=list(axes_dims_rope))  # default axes 8 + 28 + 28 = 64 = attention_head_dim
+        self.time_text_embed = LensTimestepProjEmbeddings(
+            embedding_dim=self.inner_dim, dtype=dtype, device=device, operations=operations
+        )
+
+        if self.multi_layer_encoder_feature:
+            self.txt_norm = nn.ModuleList(
+                [operations.RMSNorm(enc_hidden_dim, eps=1e-5, dtype=dtype, device=device)  # one norm per selected encoder layer
+                 for _ in self.selected_layer_index]
+            )
+            self.txt_in = operations.Linear(
+                enc_hidden_dim * len(self.selected_layer_index),  # input width of the concatenated per-layer features
+                self.inner_dim, bias=True, dtype=dtype, device=device,
+            )
+        else:
+            self.txt_norm = operations.RMSNorm(enc_hidden_dim, eps=1e-5, dtype=dtype, device=device)
+            self.txt_in = operations.Linear(enc_hidden_dim, self.inner_dim, bias=True, dtype=dtype, device=device)
+
+        self.img_in = operations.Linear(in_channels, self.inner_dim, bias=True, dtype=dtype, device=device)
+
+        self.transformer_blocks = nn.ModuleList([
+            LensTransformerBlock(
+                dim=self.inner_dim,
+                num_attention_heads=num_attention_heads,
+                attention_head_dim=attention_head_dim,
+                eps=1e-6,
+                rms_norm=rms_norm,
+                dtype=dtype, device=device, operations=operations,
+            )
+            for _ in range(num_layers)
+        ])
+
+        self.norm_out = _AdaLayerNormContinuousNoAffine(
+            self.inner_dim, self.inner_dim, eps=1e-6,
+            dtype=dtype, device=device, operations=operations,
+        )
+        self.proj_out = operations.Linear(
+            self.inner_dim, patch_size * patch_size * self.out_channels, bias=True,  # 2 * 2 * 32 = 128 output features with the defaults
+            dtype=dtype, device=device,
+        )
+
+    def forward(self, x: torch.Tensor, timestep: torch.Tensor, context: torch.Tensor, attention_mask: Optional[torch.Tensor] = None,
+                transformer_options: Optional[Dict[str, Any]] = None, **kwargs) -> torch.Tensor:  # public entry: routes _forward through any DIFFUSION_MODEL wrappers
+        if transformer_options is None:
+            transformer_options = {}  # normalised before the wrapper lookup below
+        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
+            self._forward, self,
+            comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options),
+        ).execute(x, timestep, context, attention_mask, transformer_options, **kwargs)
+
+    def _forward(
+        self,
+        x: torch.Tensor,
+        timestep: torch.Tensor,
+        context: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        transformer_options: Optional[Dict[str, Any]] = None,
+        control: Optional[Dict[str, Any]] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """ComfyUI bridge: ``(x[B,128,h,w], t[B], context[B,S,L*H], mask[B,S])``."""
+        if transformer_options is None:
+            transformer_options = {}
+        transformer_options = transformer_options.copy()  # shallow copy: the per-block keys written below stay out of the caller's dict
+        patches = transformer_options.get("patches", {})
+        patches_replace = transformer_options.get("patches_replace", {})
+        blocks_replace = patches_replace.get("dit", {})
+
+        B, C, h, w = x.shape
+        hidden_states = x.permute(0, 2, 3, 1).reshape(B, h * w, C)  # one token per spatial position: [B, h * w, C]
+
+        if self.multi_layer_encoder_feature:
+            L = len(self.selected_layer_index)
+            enc_dim = context.shape[-1] // L  # context's last axis is split into L equal per-layer slices
+            encoder_hidden_states = list(
+                context.reshape(B, -1, L, enc_dim).unbind(dim=2)  # -> L tensors of [B, S, enc_dim]
+            )
+            text_seq_len = encoder_hidden_states[0].shape[1]
+        else:
+            encoder_hidden_states = context
+            text_seq_len = context.shape[1]
+
+        if attention_mask is None:
+            attention_mask = torch.ones(  # no mask supplied: every text position is marked True
+                (B, text_seq_len), dtype=torch.bool, device=x.device
+            )
+
+        img_len = h * w
+        joint_mask = self._build_joint_attention_mask(attention_mask, img_len)  # built once, shared by all blocks
+
+        hidden_states = self.img_in(hidden_states)
+        timestep = timestep.to(hidden_states.dtype)
+
+        if self.multi_layer_encoder_feature:
+            normed = [self.txt_norm[i](encoder_hidden_states[i]) for i in range(L)]  # each layer slice gets its own RMSNorm
+            encoder_hidden_states = torch.cat(normed, dim=-1)  # concatenated back along the feature axis for txt_in
+        else:
+            encoder_hidden_states = self.txt_norm(encoder_hidden_states)
+        encoder_hidden_states = self.txt_in(encoder_hidden_states)
+
+        if "post_input" in patches:
+            for p in patches["post_input"]:  # each patch receives and returns the img/txt pair
+                out = p({
+                    "img": hidden_states,
+                    "txt": encoder_hidden_states,
+                    "transformer_options": transformer_options,
+                })
+                hidden_states = out["img"]
+                encoder_hidden_states = out["txt"]
+
+        temb = self.time_text_embed(timestep, hidden_states)
+        ids = _lens_position_ids(1, h, w, text_seq_len, device=hidden_states.device).unsqueeze(0)  # frame=1, scale_rope left at its default; unsqueeze adds the batch dim
+        freqs_cis = self.pos_embed(ids)
+
+        transformer_options["total_blocks"] = len(self.transformer_blocks)
+        transformer_options["block_type"] = "double"
+        for i, block in enumerate(self.transformer_blocks):
+            transformer_options["block_index"] = i
+            if ("double_block", i) in blocks_replace:  # a replacement patch is registered for this block index
+                def block_wrap(args):  # adapter passed to the patch as "original_block": maps dict keys to block kwargs
+                    out = {}
+                    out["txt"], out["img"] = block(  # block returns (text, image) in that order
+                        hidden_states=args["img"],
+                        encoder_hidden_states=args["txt"],
+                        temb=args["vec"],
+                        freqs_cis=args["pe"],
+                        attention_mask=args.get("attn_mask"),
+                        transformer_options=args.get("transformer_options"),
+                    )
+                    return out
+                out = blocks_replace[("double_block", i)](
+                    {
+                        "img": hidden_states,
+                        "txt": encoder_hidden_states,
+                        "vec": temb,
+                        "pe": freqs_cis,
+                        "attn_mask": joint_mask,
+                        "transformer_options": transformer_options,
+                    },
+                    {"original_block": block_wrap},
+                )
+                encoder_hidden_states = out["txt"]
+                hidden_states = out["img"]
+            else:
+                encoder_hidden_states, hidden_states = block(
+                    hidden_states=hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    temb=temb,
+                    freqs_cis=freqs_cis,
+                    attention_mask=joint_mask,
+                    transformer_options=transformer_options,
+                )
+
+            if "double_block" in patches:
+                for p in patches["double_block"]:  # post-block patches also see the original input x and the block index
+                    out = p({
+                        "img": hidden_states,
+                        "txt": encoder_hidden_states,
+                        "x": x,
+                        "block_index": i,
+                        "transformer_options": transformer_options,
+                    })
+                    hidden_states = out["img"]
+                    encoder_hidden_states = out["txt"]
+
+            if control is not None:
+                control_i = control.get("input")
+                if control_i is not None and i < len(control_i):
+                    add = control_i[i]
+                    if add is not None:
+                        hidden_states[:, :add.shape[1]] += add  # in-place add onto the leading add.shape[1] image tokens
+
+        hidden_states = self.norm_out(hidden_states, temb)
+        out = self.proj_out(hidden_states)
+        return out.reshape(B, h, w, C).permute(0, 3, 1, 2).contiguous()  # NOTE(review): reuses the input channel count C; only valid when patch_size**2 * out_channels == C (defaults: 2*2*32 == 128) - confirm
+
+    @staticmethod
+    def _build_joint_attention_mask(text_mask: torch.Tensor, img_len: int) -> torch.Tensor:  # boolean text mask -> additive mask over the joint sequence
+        if text_mask.dtype != torch.bool:
+            text_mask = text_mask.bool()  # any non-zero entry counts as "attend"
+        bsz = text_mask.shape[0]
+        img_ones = torch.ones((bsz, img_len), dtype=torch.bool, device=text_mask.device)  # image positions are never masked
+        joint = torch.cat([img_ones, text_mask], dim=1)  # image positions first, then text
+        additive = torch.zeros_like(joint, dtype=torch.float32)
+        additive.masked_fill_(~joint, torch.finfo(torch.float32).min)  # 0 where attended, float32 min where masked
+        return additive[:, None, None, :]  # [B, 1, 1, img_len + text_len]
--- a/comfy/ldm/lightricks/av_model.py
+++ b/comfy/ldm/lightricks/av_model.py
@ -767,25 +767,25 @@ class LTXAVModel(LTXVModel):

            # Cross-attention timesteps - compress these too
            av_ca_audio_scale_shift_timestep, _ = self.av_ca_audio_scale_shift_adaln_single(
-                timestep.max().expand_as(a_timestep_flat),
+                a_timestep_flat,
                {"resolution": None, "aspect_ratio": None},
                batch_size=batch_size,
                hidden_dtype=hidden_dtype,
            )
            av_ca_video_scale_shift_timestep, _ = self.av_ca_video_scale_shift_adaln_single(
-                a_timestep.max().expand_as(timestep_flat),
+                timestep_flat,
                {"resolution": None, "aspect_ratio": None},
                batch_size=batch_size,
                hidden_dtype=hidden_dtype,
            )
            av_ca_a2v_gate_noise_timestep, _ = self.av_ca_a2v_gate_adaln_single(
-                a_timestep.max().expand_as(timestep_flat) * av_ca_factor,
+                a_timestep_scaled.max().expand_as(timestep_flat) * av_ca_factor,
                {"resolution": None, "aspect_ratio": None},
                batch_size=batch_size,
                hidden_dtype=hidden_dtype,
            )
            av_ca_v2a_gate_noise_timestep, _ = self.av_ca_v2a_gate_adaln_single(
-                timestep.max().expand_as(a_timestep_flat) * av_ca_factor,
+                timestep_scaled.max().expand_as(a_timestep_flat) * av_ca_factor,
                {"resolution": None, "aspect_ratio": None},
                batch_size=batch_size,
                hidden_dtype=hidden_dtype,
--- a/comfy/ldm/lightricks/vae/causal_audio_autoencoder.py
+++ b/comfy/ldm/lightricks/vae/causal_audio_autoencoder.py
@ -1,4 +1,3 @@
-from __future__ import annotations
 import torch
 from torch import nn
 from torch.nn import functional as F
--- a/comfy/ldm/lightricks/vae/causal_video_autoencoder.py
+++ b/comfy/ldm/lightricks/vae/causal_video_autoencoder.py
@ -1,4 +1,3 @@
-from __future__ import annotations
 import threading
 import torch
 from torch import nn
--- a/comfy/ldm/lumina/model.py
+++ b/comfy/ldm/lumina/model.py
@ -1,5 +1,4 @@
 # Code from: https://github.com/Alpha-VLLM/Lumina-Image-2.0/blob/main/models/model.py
-from __future__ import annotations

 from typing import List, Optional, Tuple

--- a/comfy/ldm/modules/attention.py
+++ b/comfy/ldm/modules/attention.py
@ -810,12 +810,12 @@ optimized_attention = attention_basic
 if model_management.sage_attention_enabled():
    logging.info("Using sage attention")
    optimized_attention = attention_sage
-elif model_management.xformers_enabled():
-    logging.info("Using xformers attention")
-    optimized_attention = attention_xformers
 elif model_management.flash_attention_enabled():
    logging.info("Using Flash Attention")
    optimized_attention = attention_flash
+elif model_management.xformers_enabled():
+    logging.info("Using xformers attention")
+    optimized_attention = attention_xformers
 elif model_management.pytorch_attention_enabled():
    logging.info("Using pytorch attention")
    optimized_attention = attention_pytorch
--- a/comfy/ldm/modules/diffusionmodules/mmdit.py
+++ b/comfy/ldm/modules/diffusionmodules/mmdit.py
@ -211,7 +211,7 @@ class TimestepEmbedder(nn.Module):
    Embeds scalar timesteps into vector representations.
    """

-    def __init__(self, hidden_size, frequency_embedding_size=256, output_size=None, dtype=None, device=None, operations=None):
+    def __init__(self, hidden_size, frequency_embedding_size=256, output_size=None, dtype=None, device=None, operations=None, max_period=10000):
        super().__init__()
        if output_size is None:
            output_size = hidden_size
@ -221,9 +221,10 @@ class TimestepEmbedder(nn.Module):
            operations.Linear(hidden_size, output_size, bias=True, dtype=dtype, device=device),
        )
        self.frequency_embedding_size = frequency_embedding_size
+        self.max_period = max_period

    def forward(self, t, dtype, **kwargs):
-        t_freq = timestep_embedding(t, self.frequency_embedding_size).to(dtype)
+        t_freq = timestep_embedding(t, self.frequency_embedding_size, max_period=self.max_period).to(dtype)
        t_emb = self.mlp(t_freq)
        return t_emb

--- a/comfy/ldm/moge/geometry.py
+++ b/comfy/ldm/moge/geometry.py
@ -1,6 +1,5 @@
 """Pure-torch + scipy geometry helpers for MoGe inference and mesh export."""

-from __future__ import annotations

 from typing import Optional, Tuple

--- a/comfy/ldm/moge/model.py
+++ b/comfy/ldm/moge/model.py
@ -4,7 +4,6 @@ V1: DINOv2 backbone + multi-output head (points, mask).
 V2: DINOv2 encoder + neck + per-output heads (points, mask, normal, optional metric-scale MLP).
 """

-from __future__ import annotations

 from numbers import Number
 from typing import Any, Dict, List, Optional, Tuple, Union
--- a/comfy/ldm/moge/modules.py
+++ b/comfy/ldm/moge/modules.py
@ -1,6 +1,5 @@
 """Building blocks for MoGe: residual conv stack, resamplers, MLP, DINOv2 encoder, v1 head."""

-from __future__ import annotations

 from typing import List, Optional, Sequence, Tuple, Union

--- a/comfy/ldm/moge/panorama.py
+++ b/comfy/ldm/moge/panorama.py
@ -6,7 +6,6 @@ equirect distance map via a multi-scale Poisson + gradient sparse solve.
 Image sampling uses F.grid_sample (GPU); the sparse solve uses lsmr (CPU).
 """

-from __future__ import annotations

 from typing import Callable, List, Optional, Tuple

--- a/comfy/ldm/pixeldit/model.py
+++ b/comfy/ldm/pixeldit/model.py
@ -0,0 +1,239 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import comfy.ldm.common_dit
+import comfy.patcher_extension
+from comfy.ldm.flux.math import apply_rope, rope
+from comfy.ldm.hidream.model import FeedForwardSwiGLU
+from comfy.ldm.modules.attention import optimized_attention
+from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder
+
+from .modules import (
+    FinalLayer,
+    PatchTokenEmbedder,
+    PiTBlock,
+    PixelTokenEmbedder,
+    apply_adaln_,
+    precompute_freqs_cis_2d,
+)
+
+
+class MMDiTJointAttention(nn.Module):
+    """Joint attention over an image stream and a text stream (MMDiT style).
+
+    Every stream has its own fused QKV projection, per-head RMSNorm on Q/K and
+    output projection. Rotary embeddings are applied to each stream separately
+    before the two are concatenated, so each one keeps its own positional
+    encoding. The joint sequence is laid out as [text, image] (text first).
+    """
+    def __init__(self, dim, num_heads=8, qkv_bias=False, dtype=None, device=None, operations=None):
+        super().__init__()
+        assert dim % num_heads == 0
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        factory = {"dtype": dtype, "device": device}
+
+        # Fused Q/K/V projections, one per stream (x = image, y = text).
+        self.qkv_x = operations.Linear(dim, dim * 3, bias=qkv_bias, **factory)
+        self.qkv_y = operations.Linear(dim, dim * 3, bias=qkv_bias, **factory)
+
+        # Per-head normalisation of queries and keys.
+        self.q_norm_x = operations.RMSNorm(self.head_dim, eps=1e-6, **factory)
+        self.k_norm_x = operations.RMSNorm(self.head_dim, eps=1e-6, **factory)
+        self.q_norm_y = operations.RMSNorm(self.head_dim, eps=1e-6, **factory)
+        self.k_norm_y = operations.RMSNorm(self.head_dim, eps=1e-6, **factory)
+
+        # Output projections, again one per stream.
+        self.proj_x = operations.Linear(dim, dim, **factory)
+        self.proj_y = operations.Linear(dim, dim, **factory)
+
+    def forward(self, x, y, pos_img, pos_txt=None, attn_mask=None, transformer_options={}):
+        """Attend jointly over text tokens `y` and image tokens `x`.
+
+        Returns a tuple `(image_out, text_out)`, each projected by its own
+        output layer. `pos_txt=None` leaves the text stream without RoPE.
+        """
+        batch, n_img, _ = x.shape
+        _, n_txt, _ = y.shape
+        heads = self.num_heads
+        head_dim = self.head_dim
+
+        # Image stream: [B, N, 3*H*D] -> three tensors of [B, H, N, D].
+        q_img, k_img, v_img = self.qkv_x(x).reshape(batch, n_img, 3, heads, head_dim).permute(2, 0, 3, 1, 4).unbind(0)
+        q_img = self.q_norm_x(q_img)
+        k_img = self.k_norm_x(k_img)
+
+        # Text stream, same layout.
+        q_txt, k_txt, v_txt = self.qkv_y(y).reshape(batch, n_txt, 3, heads, head_dim).permute(2, 0, 3, 1, 4).unbind(0)
+        q_txt = self.q_norm_y(q_txt)
+        k_txt = self.k_norm_y(k_txt)
+
+        # Rotary embedding per stream, before joining.
+        q_img, k_img = apply_rope(q_img, k_img, pos_img[None, None])
+        if pos_txt is not None:
+            q_txt, k_txt = apply_rope(q_txt, k_txt, pos_txt[None, None])
+
+        # Join along the token axis with text first.
+        joint_q = torch.cat([q_txt, q_img], dim=2)
+        joint_k = torch.cat([k_txt, k_img], dim=2)
+        joint_v = torch.cat([v_txt, v_img], dim=2)
+
+        attended = optimized_attention(
+            joint_q, joint_k, joint_v, heads,
+            mask=attn_mask, skip_reshape=True, skip_output_reshape=True,
+            transformer_options=transformer_options,
+        )
+
+        # Split the joint result back into its text / image parts and merge heads.
+        txt_part = attended[:, :, :n_txt, :]
+        img_part = attended[:, :, n_txt:, :]
+        out_y = txt_part.transpose(1, 2).reshape(batch, n_txt, heads * head_dim)
+        out_x = img_part.transpose(1, 2).reshape(batch, n_img, heads * head_dim)
+
+        return self.proj_x(out_x), self.proj_y(out_y)
+
+
+class MMDiTBlockT2I(nn.Module):
+    """Dual-stream MMDiT transformer block (image stream `x`, text stream `y`).
+
+    Both streams share one joint attention but keep separate norms, SwiGLU
+    feed-forwards and adaLN modulation layers driven by the condition `c`.
+    """
+    def __init__(self, hidden_size, groups, mlp_ratio=4.0, dtype=None, device=None, operations=None):
+        super().__init__()
+        factory = {"dtype": dtype, "device": device}
+        self.norm_x1 = operations.RMSNorm(hidden_size, eps=1e-6, **factory)
+        self.norm_y1 = operations.RMSNorm(hidden_size, eps=1e-6, **factory)
+        self.attn = MMDiTJointAttention(hidden_size, num_heads=groups, qkv_bias=False, operations=operations, **factory)
+        self.norm_x2 = operations.RMSNorm(hidden_size, eps=1e-6, **factory)
+        self.norm_y2 = operations.RMSNorm(hidden_size, eps=1e-6, **factory)
+        ff_hidden = int(hidden_size * mlp_ratio)
+        self.mlp_x = FeedForwardSwiGLU(hidden_size, ff_hidden, multiple_of=1, operations=operations, **factory)
+        self.mlp_y = FeedForwardSwiGLU(hidden_size, ff_hidden, multiple_of=1, operations=operations, **factory)
+        # Each modulation layer emits 6 chunks: (shift, scale, gate) for attention and for the MLP.
+        self.adaLN_modulation_img = nn.Sequential(operations.Linear(hidden_size, 6 * hidden_size, bias=True, **factory))
+        self.adaLN_modulation_txt = nn.Sequential(operations.Linear(hidden_size, 6 * hidden_size, bias=True, **factory))
+
+    def forward(self, x, y, c, pos_img, pos_txt=None, attn_mask=None, transformer_options={}):
+        """Run one block; returns the updated `(x, y)` pair."""
+        mod_img = self.adaLN_modulation_img(c).chunk(6, dim=-1)
+        mod_txt = self.adaLN_modulation_txt(c).chunk(6, dim=-1)
+        shift_attn_x, scale_attn_x, gate_attn_x, shift_ff_x, scale_ff_x, gate_ff_x = mod_img
+        shift_attn_y, scale_attn_y, gate_attn_y, shift_ff_y, scale_ff_y, gate_ff_y = mod_txt
+
+        # Attention sub-layer: modulated norm -> joint attention -> gated residual.
+        attn_in_x = apply_adaln_(self.norm_x1(x), shift_attn_x, scale_attn_x)
+        attn_in_y = apply_adaln_(self.norm_y1(y), shift_attn_y, scale_attn_y)
+        attn_x, attn_y = self.attn(attn_in_x, attn_in_y, pos_img, pos_txt, attn_mask, transformer_options=transformer_options)
+        x = torch.addcmul(x, gate_attn_x, attn_x)
+        y = torch.addcmul(y, gate_attn_y, attn_y)
+
+        # Feed-forward sub-layer, same pattern per stream.
+        ff_in_x = apply_adaln_(self.norm_x2(x), shift_ff_x, scale_ff_x)
+        x = torch.addcmul(x, gate_ff_x, self.mlp_x(ff_in_x))
+        ff_in_y = apply_adaln_(self.norm_y2(y), shift_ff_y, scale_ff_y)
+        y = torch.addcmul(y, gate_ff_y, self.mlp_y(ff_in_y))
+        return x, y
+
+
+class PixDiT_T2I(nn.Module):
+    """PixelDiT T2I model. Hardcoded for the released 1024px Stage-3 checkpoint
+    (also runs at 512px when fed the appropriate latent size and flow_shift).
+
+    Architecture as built below: a stack of `patch_depth` dual-stream MMDiT blocks
+    working on patch tokens + text tokens, followed by `pixel_depth` pixel-level
+    PiTBlocks conditioned on the patch-stream output, and a final linear head.
+
+    Forward:
+      x:        [B, 3, H, W] pixel-space input (no VAE)
+      timesteps:[B] in [0, 1000] (ComfyUI flow sampling convention)
+      context:  [B, Ltxt, 2304] Gemma-2-2b-it hidden states (chi_prompt prepended)
+    Returns flow-matching velocity [B, 3, H, W].
+    """
+    def __init__(
+        self,
+        in_channels=3,
+        num_groups=24,
+        hidden_size=1536,
+        pixel_hidden_size=16,
+        pixel_attn_hidden_size=1152,
+        pixel_num_groups=16,
+        patch_depth=14,
+        pixel_depth=2,
+        patch_size=16,
+        txt_embed_dim=2304,
+        txt_max_length=300,
+        use_text_rope=True,
+        text_rope_theta=10000.0,
+        image_model=None,
+        dtype=None,
+        device=None,
+        operations=None,
+        pixel_mlp_chunks=2,
+    ):
+        # NOTE: `image_model` is accepted but not used anywhere in this constructor.
+        super().__init__()
+        self.dtype = dtype
+        self.in_channels = in_channels
+        self.out_channels = in_channels
+        self.hidden_size = hidden_size
+        self.num_groups = num_groups
+        self.patch_depth = patch_depth
+        self.pixel_depth = pixel_depth
+        self.patch_size = patch_size
+        self.pixel_hidden_size = pixel_hidden_size
+        self.pixel_attn_hidden_size = pixel_attn_hidden_size
+        self.pixel_num_groups = pixel_num_groups
+        self.txt_embed_dim = txt_embed_dim
+        self.txt_max_length = txt_max_length
+        self.use_text_rope = use_text_rope
+        self.text_rope_theta = text_rope_theta
+
+        # Embedders: per-pixel tokens, per-patch tokens (flattened C*P*P input), timestep, and text.
+        self.pixel_embedder = PixelTokenEmbedder(self.in_channels, self.pixel_hidden_size, dtype=dtype, device=device, operations=operations)
+        self.s_embedder = PatchTokenEmbedder(self.in_channels * self.patch_size ** 2, self.hidden_size, bias=True, dtype=dtype, device=device, operations=operations)
+        self.t_embedder = TimestepEmbedder(self.hidden_size, dtype=dtype, device=device, operations=operations, max_period=10)
+        self.y_embedder = PatchTokenEmbedder(self.txt_embed_dim, self.hidden_size, bias=True, use_norm=True, dtype=dtype, device=device, operations=operations)
+        # Absolute text position table; allocated uninitialised (torch.empty), so values must come from the checkpoint.
+        self.y_pos_embedding = nn.Parameter(torch.empty(1, self.txt_max_length, self.hidden_size, dtype=dtype, device=device))
+
+        # Patch-level dual-stream blocks.
+        self.patch_blocks = nn.ModuleList([
+            MMDiTBlockT2I(self.hidden_size, self.num_groups,
+                          dtype=dtype, device=device, operations=operations)
+            for _ in range(self.patch_depth)
+        ])
+        # Pixel-level blocks, conditioned on the patch stream.
+        self.pixel_blocks = nn.ModuleList([
+            PiTBlock(
+                self.pixel_hidden_size,
+                self.hidden_size,
+                patch_size=self.patch_size,
+                num_heads=self.num_groups,
+                attn_hidden_size=self.pixel_attn_hidden_size,
+                attn_num_heads=self.pixel_num_groups,
+                dtype=dtype, device=device, operations=operations,
+                mlp_chunks=pixel_mlp_chunks,
+            )
+            for _ in range(self.pixel_depth)
+        ])
+
+        self.final_layer = FinalLayer(self.pixel_hidden_size, self.out_channels, dtype=dtype, device=device, operations=operations)
+
+    def _fetch_patch_pos(self, height, width, device, dtype, **rope_opts):
+        """Build the 2D RoPE table for a `height` x `width` patch grid (per-head dim = hidden_size // num_groups)."""
+        return precompute_freqs_cis_2d(self.hidden_size // self.num_groups, height, width, device=device, dtype=dtype, **rope_opts)
+
+    def _fetch_text_pos(self, length, device, dtype):
+        """Build the 1D RoPE table for text positions 0..length-1 using `text_rope_theta`."""
+        return rope(torch.arange(length, dtype=torch.float32, device=device).reshape(1, -1), self.hidden_size // self.num_groups, self.text_rope_theta).squeeze(0).to(dtype=dtype)
+
+    def forward(self, x, timesteps, context=None, attention_mask=None, transformer_options={}, **kwargs):
+        """Entry point: runs `_forward` through any DIFFUSION_MODEL wrappers found in `transformer_options`."""
+        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
+            self._forward, self, comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options),
+        ).execute(x, timesteps, context, attention_mask, transformer_options, **kwargs)
+
+    def _pre_patch_block(self, s, i, **kwargs):
+        """Hook for subclasses to inject per-block state into the patch stream (e.g. PiD's LQ gate)."""
+        return s
+
+    def _forward(self, x, timesteps, context=None, attention_mask=None, transformer_options={}, **kwargs):
+        """Core forward pass.
+
+        `attention_mask` is accepted but not forwarded: the blocks below are called with a
+        `None` mask. Extra `kwargs` are handed to `_pre_patch_block` before every patch block.
+        Raises ValueError when `context` is missing or not 3-dimensional.
+        """
+        # Remember the caller's size; the input is padded for patching and the output cropped back at the end.
+        H_orig, W_orig = x.shape[2], x.shape[3]
+        x = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size))
+        B, _, H, W = x.shape
+        Hs = H // self.patch_size
+        Ws = W // self.patch_size
+        L = Hs * Ws
+
+        pos_img = self._fetch_patch_pos(Hs, Ws, x.device, x.dtype, **(transformer_options.get("rope_options") or {}))
+        # Non-overlapping patches (kernel == stride), one flattened row per patch.
+        x_patches = F.unfold(x, kernel_size=self.patch_size, stride=self.patch_size).transpose(1, 2)
+
+        t_emb = self.t_embedder(timesteps.view(-1), x.dtype).view(B, -1, self.hidden_size)
+
+        if context is None or context.dim() != 3:
+            raise ValueError("PixDiT_T2I requires context (text embeddings) of shape [B, L, D]")
+        # Text tokens beyond txt_max_length are dropped (the position table has only that many rows).
+        Ltxt = min(context.shape[1], self.txt_max_length)
+        y = context[:, :Ltxt, :]
+        y_emb = self.y_embedder(y).view(B, Ltxt, self.hidden_size)
+        y_emb = y_emb + self.y_pos_embedding[:, :Ltxt, :].to(y_emb) # y_pos_embedding is a raw nn.Parameter
+
+        condition = F.silu(t_emb)
+        pos_txt = self._fetch_text_pos(Ltxt, x.device, x.dtype) if self.use_text_rope else None
+
+        # Patch stream: embed patches, then run the dual-stream blocks (subclass hook first).
+        s = self.s_embedder(x_patches)
+        for i, blk in enumerate(self.patch_blocks):
+            s = self._pre_patch_block(s, i, **kwargs)
+            s, y_emb = blk(s, y_emb, condition, pos_img, pos_txt, None, transformer_options=transformer_options)
+        s = F.silu(t_emb + s)
+
+        # Pixel stream: one conditioning row per patch, pixel tokens grouped per patch.
+        s_cond = s.view(B * L, self.hidden_size)
+        x_pixels = self.pixel_embedder(x, patch_size=self.patch_size)
+        for blk in self.pixel_blocks:
+            x_pixels = blk(x_pixels, s_cond, H, W, self.patch_size, mask=None, transformer_options=transformer_options)
+
+        x_pixels = self.final_layer(x_pixels)
+        C_out = self.out_channels
+        P2 = self.patch_size * self.patch_size
+        # Rearrange to the [B, C*P*P, L] layout F.fold expects, then reassemble the image.
+        x_pixels = x_pixels.view(B, L, P2, C_out).permute(0, 3, 2, 1).reshape(B, C_out * P2, L)
+        out = F.fold(x_pixels, (H, W), kernel_size=self.patch_size, stride=self.patch_size)
+        # Crop away the padding added at the top.
+        return out[:, :, :H_orig, :W_orig]
--- a/comfy/ldm/pixeldit/modules.py
+++ b/comfy/ldm/pixeldit/modules.py
@ -0,0 +1,187 @@
+import torch
+import torch.nn as nn
+
+from comfy.ldm.flux.math import apply_rope, rope
+from comfy.ldm.modules.attention import optimized_attention
+from comfy.ldm.modules.diffusionmodules.mmdit import Mlp, get_1d_sincos_pos_embed_from_grid_torch
+
+
+def apply_adaln_(x, shift, scale):
+    """In-place adaLN modulation: overwrite `x` with `x + x * scale + shift` and return it.
+
+    `x` itself is mutated, so callers must pass a tensor they own.
+    """
+    x.addcmul_(x, scale)
+    x.add_(shift)
+    return x
+
+
+def precompute_freqs_cis_2d(dim, height, width, theta=10000.0, scale=16.0,
+                            ref_grid_h=None, ref_grid_w=None,
+                            scale_x=1.0, scale_y=1.0, shift_x=0.0, shift_y=0.0,
+                            device=None, dtype=torch.float32, **kwargs):
+    """2D RoPE with x/y axis frequencies interleaved at stride 2 across head dim.
+
+    rope_options:
+      scale_x / scale_y multiply the position range (RoPE extrapolation).
+      shift_x / shift_y offset the position origin (tiled / regional inference).
+    With ref_grid_h / ref_grid_w set, also applies NTK-aware per-axis theta scaling
+    (rope_mode='ntk_aware'): theta_axis = theta * (current/ref)^(dim_axis/(dim_axis-2)).
+    Each axis is scaled independently: an axis whose reference is None keeps the
+    plain theta instead of failing on `current / None`.
+    Unrecognised keyword arguments are accepted and ignored.
+    Returns Flux-format rotation matrices of shape [H*W, dim/2, 2, 2].
+    Layout of head-dim pairs: [x_0, y_0, x_1, y_1, ..., x_{dim/4-1}, y_{dim/4-1}].
+    """
+    dim_axis = dim // 2
+
+    def _ntk_factor(current, ref):
+        # No reference for this axis, or too few dims for the exponent to be defined -> plain RoPE.
+        if ref is None or dim_axis <= 2:
+            return 1.0
+        return (current / ref) ** (dim_axis / (dim_axis - 2))
+
+    h_ntk = _ntk_factor(height, ref_grid_h)
+    w_ntk = _ntk_factor(width, ref_grid_w)
+
+    # Positions span [shift, scale * scale_axis + shift] regardless of grid size.
+    x_lin = torch.linspace(shift_x, scale * scale_x + shift_x, width, device=device)
+    y_lin = torch.linspace(shift_y, scale * scale_y + shift_y, height, device=device)
+    y_grid, x_grid = torch.meshgrid(y_lin, x_lin, indexing="ij")
+    x_rope = rope(x_grid.reshape(1, -1), dim_axis, theta * w_ntk).squeeze(0)
+    y_rope = rope(y_grid.reshape(1, -1), dim_axis, theta * h_ntk).squeeze(0)
+    # Stacking on dim 2 interleaves the x and y frequency pairs along the head dimension.
+    out = torch.stack([x_rope, y_rope], dim=2).reshape(height * width, dim // 2, 2, 2)
+    return out.to(dtype=dtype)
+
+
+def get_2d_sincos_pos_embed(embed_dim, height, width, device=None, dtype=torch.float32):
+    """ViT-style absolute 2D sin/cos positional embedding.
+
+    The first `embed_dim // 2` channels encode the column (W) coordinate and the
+    remaining half the row (H) coordinate; one row per grid cell, row-major.
+    """
+    assert embed_dim % 4 == 0
+    rows = torch.arange(height, dtype=torch.float32, device=device)
+    cols = torch.arange(width, dtype=torch.float32, device=device)
+    row_idx, col_idx = torch.meshgrid(rows, cols, indexing="ij")
+    half = embed_dim // 2
+    # Width first, then height, matching the channel layout described above.
+    halves = [
+        get_1d_sincos_pos_embed_from_grid_torch(half, coords.reshape(-1), device=device)
+        for coords in (col_idx, row_idx)
+    ]
+    return torch.cat(halves, dim=1).to(dtype=dtype)
+
+
+class RotaryAttention(nn.Module):
+    """Self-attention over a single token stream with rotary position encoding (used by PiTBlock)."""
+    def __init__(self, dim, num_heads=8, qkv_bias=False, dtype=None, device=None, operations=None):
+        super().__init__()
+        assert dim % num_heads == 0
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        factory = {"dtype": dtype, "device": device}
+        self.qkv = operations.Linear(dim, dim * 3, bias=qkv_bias, **factory)
+        self.q_norm = operations.RMSNorm(self.head_dim, eps=1e-6, **factory)
+        self.k_norm = operations.RMSNorm(self.head_dim, eps=1e-6, **factory)
+        self.proj = operations.Linear(dim, dim, **factory)
+
+    def forward(self, x, pos, mask=None, transformer_options={}):
+        """Apply attention to `x` ([B, N, C]) using the rotary table `pos`; returns the projected result."""
+        batch, tokens, _ = x.shape
+        heads = self.num_heads
+        # [B, N, 3*H*D] -> q, k, v each [B, H, N, D].
+        fused = self.qkv(x).reshape(batch, tokens, 3, heads, self.head_dim)
+        query, key, value = fused.permute(2, 0, 3, 1, 4).unbind(0)
+        query = self.q_norm(query)
+        key = self.k_norm(key)
+        query, key = apply_rope(query, key, pos[None, None])
+        attended = optimized_attention(query, key, value, heads, mask=mask, skip_reshape=True, transformer_options=transformer_options)
+        return self.proj(attended)
+
+
+class FinalLayer(nn.Module):
+    """Output head: RMSNorm over the hidden dim followed by a biased linear map to `out_channels`."""
+    def __init__(self, hidden_size, out_channels, dtype=None, device=None, operations=None):
+        super().__init__()
+        factory = {"dtype": dtype, "device": device}
+        self.norm = operations.RMSNorm(hidden_size, eps=1e-6, **factory)
+        self.linear = operations.Linear(hidden_size, out_channels, bias=True, **factory)
+
+    def forward(self, x):
+        normed = self.norm(x)
+        return self.linear(normed)
+
+
+class PatchTokenEmbedder(nn.Module):
+    """Linear token projection, optionally followed by RMSNorm.
+
+    Shared by the patchified-image stream and the text-feature stream.
+    """
+    def __init__(self, in_chans, embed_dim, use_norm=False, bias=True, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.proj = operations.Linear(in_chans, embed_dim, bias=bias, dtype=dtype, device=device)
+        if use_norm:
+            self.norm = operations.RMSNorm(embed_dim, eps=1e-6, dtype=dtype, device=device)
+        else:
+            # Keep the attribute so forward() needs no branching.
+            self.norm = nn.Identity()
+
+    def forward(self, x):
+        projected = self.proj(x)
+        return self.norm(projected)
+
+
+class PixelTokenEmbedder(nn.Module):
+    """Per-pixel embedder.
+
+    Projects every pixel's channels to `hidden_size_output`, adds a 2D sin/cos
+    position embedding computed over the full image, and regroups the pixels
+    into one sequence of `patch_size**2` tokens per patch.
+    """
+    def __init__(self, in_channels, hidden_size_output, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.in_channels = in_channels
+        self.hidden_size_output = hidden_size_output
+        self.proj = operations.Linear(self.in_channels, self.hidden_size_output, bias=True, dtype=dtype, device=device)
+
+    def forward(self, inputs, patch_size):
+        """Map `inputs` ([B, C, H, W]) to [B * Hs * Ws, patch_size**2, hidden_size_output]."""
+        batch, _, height, width = inputs.shape
+        width_out = self.hidden_size_output
+        patches_h = height // patch_size
+        patches_w = width // patch_size
+
+        # Channels-last so the linear acts on each pixel independently.
+        pixels = self.proj(inputs.permute(0, 2, 3, 1).contiguous())
+        pos_table = get_2d_sincos_pos_embed(width_out, height, width, device=pixels.device, dtype=pixels.dtype)
+        pixels = pixels + pos_table.view(height, width, width_out).unsqueeze(0)
+
+        # [B, Hs, p, Ws, p, D] -> [B, Hs, Ws, p, p, D] -> one row of p*p tokens per patch.
+        pixels = pixels.view(batch, patches_h, patch_size, patches_w, patch_size, width_out)
+        pixels = pixels.permute(0, 1, 3, 2, 4, 5)
+        return pixels.reshape(batch * patches_h * patches_w, patch_size * patch_size, width_out)
+
+
+class PiTBlock(nn.Module):
+    """Pixel-level transformer block.
+
+    Compresses each patch's P^2 pixel tokens → 1 attention token via a linear,
+    runs global self-attention across patches with 2D RoPE, then expands back to P^2 tokens.
+    Conditioning is per-pixel adaLN from the patch-level features.
+
+    The RoPE builder is stored on the instance as `_rope_fn` so it can be swapped
+    after construction (PidNet replaces it with an NTK-aware variant).
+    """
+    def __init__(self, pixel_hidden_size, patch_hidden_size, patch_size, num_heads, mlp_ratio=4.0,
+                 attn_hidden_size=None, attn_num_heads=None, dtype=None, device=None, operations=None, mlp_chunks=1):
+        super().__init__()
+        self.pixel_dim = pixel_hidden_size
+        self.context_dim = patch_hidden_size
+        # Attention width / head count fall back to the patch-stream values when not given.
+        self.attn_dim = attn_hidden_size if attn_hidden_size is not None else patch_hidden_size
+        self.num_heads = attn_num_heads if attn_num_heads is not None else num_heads
+        assert self.attn_dim % self.num_heads == 0
+
+        p2 = patch_size * patch_size
+        # Linear maps between a patch's flattened pixel tokens and a single attention token.
+        self.compress_to_attn = operations.Linear(p2 * self.pixel_dim, self.attn_dim, bias=True, dtype=dtype, device=device)
+        self.expand_from_attn = operations.Linear(self.attn_dim, p2 * self.pixel_dim, bias=True, dtype=dtype, device=device)
+
+        self.norm1 = operations.RMSNorm(self.pixel_dim, eps=1e-6, dtype=dtype, device=device)
+        self.attn = RotaryAttention(self.attn_dim, num_heads=self.num_heads, qkv_bias=False, dtype=dtype, device=device, operations=operations)
+        self.norm2 = operations.RMSNorm(self.pixel_dim, eps=1e-6, dtype=dtype, device=device)
+        self.mlp = Mlp(self.pixel_dim, hidden_features=int(self.pixel_dim * mlp_ratio), dtype=dtype, device=device, operations=operations)
+
+        # Each modulation layer emits (shift, scale, gate) for every pixel token of a patch.
+        self.adaLN_modulation_msa = operations.Linear(self.context_dim, 3 * self.pixel_dim * p2, bias=True, dtype=dtype, device=device)
+        self.adaLN_modulation_mlp = operations.Linear(self.context_dim, 3 * self.pixel_dim * p2, bias=True, dtype=dtype, device=device)
+
+        self._rope_fn = precompute_freqs_cis_2d
+        # Number of slices the MLP pass is split into; clamped to at least 1.
+        self.mlp_chunks = max(1, int(mlp_chunks))
+
+    def _fetch_pos(self, height, width, device, dtype, **rope_opts):
+        """Build the RoPE table for a `height` x `width` patch grid via the current `_rope_fn`."""
+        return self._rope_fn(self.attn_dim // self.num_heads, height, width, device=device, dtype=dtype, **rope_opts)
+
+    def forward(self, x, s_cond, image_height, image_width, patch_size, mask=None, transformer_options={}):
+        """Run the block on pixel tokens `x` ([B*L, P^2, pixel_dim]) conditioned on `s_cond`.
+
+        `s_cond` is viewed as one row per patch (B*L rows); `image_height` / `image_width`
+        are in pixels and are only used to derive the patch grid. Returns the updated tokens
+        with the same shape as `x`.
+        """
+        BL, P2, _ = x.shape
+        Hs, Ws = image_height // patch_size, image_width // patch_size
+        L = Hs * Ws
+        B = BL // L
+
+        # Attention path uses only msa params; compute, use, free before mlp params allocate.
+        msa_params = self.adaLN_modulation_msa(s_cond).view(BL, P2, 3 * self.pixel_dim)
+        shift_msa, scale_msa, gate_msa = msa_params.chunk(3, dim=-1)
+
+        x_norm = apply_adaln_(self.norm1(x), shift_msa, scale_msa)
+        x_flat = x_norm.view(BL, P2 * self.pixel_dim)
+
+        # One attention token per patch, attended across the L patches of each image.
+        x_comp = self.compress_to_attn(x_flat).view(B, L, self.attn_dim)
+        pos_comp = self._fetch_pos(Hs, Ws, x.device, x.dtype, **(transformer_options.get("rope_options") or {}))
+        attn_out = self.attn(x_comp, pos_comp, mask=mask, transformer_options=transformer_options)
+        attn_flat = self.expand_from_attn(attn_out.view(B * L, self.attn_dim))
+        attn_exp = attn_flat.view(BL, P2, self.pixel_dim)
+        # Out-of-place gated residual: `x` becomes a new tensor, so the in-place updates below do not touch the caller's input.
+        x = torch.addcmul(x, gate_msa, attn_exp)
+        del msa_params, shift_msa, scale_msa, gate_msa
+
+        mlp_params = self.adaLN_modulation_mlp(s_cond).view(BL, P2, 3 * self.pixel_dim)
+        shift_mlp, scale_mlp, gate_mlp = mlp_params.chunk(3, dim=-1)
+        gate_mlp = gate_mlp.contiguous()  # detach from mlp_params so the del below frees shift+scale storage before the MLP
+        mlp_input = apply_adaln_(self.norm2(x), shift_mlp, scale_mlp)
+        del mlp_params, shift_mlp, scale_mlp
+
+        # MLP in chunks since the peak memory usage is huge here
+        # Ceil-divide the rows so at most `mlp_chunks` slices are processed.
+        chunk_size = (BL + self.mlp_chunks - 1) // self.mlp_chunks
+        for s in range(0, BL, chunk_size):
+            e = min(s + chunk_size, BL)
+            x[s:e].addcmul_(gate_mlp[s:e], self.mlp(mlp_input[s:e]))
+        return x
--- a/comfy/ldm/pixeldit/pid.py
+++ b/comfy/ldm/pixeldit/pid.py
@ -0,0 +1,227 @@
+"""PiD — Pixel Diffusion Decoder. Decodes a Flux/SD3/Flux2/Z-Image latent
+directly to a 4x-upscaled image in 4 distilled flow-matching steps. PixDiT_T2I
+body + LQ projection branch injected before each MMDiT patch block.
+"""
+
+from typing import List
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .model import PixDiT_T2I
+from .modules import precompute_freqs_cis_2d
+
+
+class SigmaAwareGatePerTokenPerDim(nn.Module):
+    """Per-token, per-channel gate that fades the LQ feature out as sigma grows.
+
+    gate = sigmoid(content_proj(cat[x, lq]) - exp(log_alpha) * sigma); out = x + gate * lq.
+
+    Trained init gives ~0.88 gate at sigma=0, ~0.05 at sigma=1.
+    """
+
+    def __init__(self, dim: int, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.content_proj = operations.Linear(dim * 2, dim, dtype=dtype, device=device)
+        # Scalar parameter, allocated uninitialised; its value is expected to come from the checkpoint.
+        self.log_alpha = nn.Parameter(torch.empty((), dtype=dtype, device=device))
+
+    def forward(self, x: torch.Tensor, lq: torch.Tensor, sigma: torch.Tensor) -> torch.Tensor:
+        # log_alpha is a raw nn.Parameter -> doesn't auto-cast under dynamic VRAM.
+        alpha = self.log_alpha.to(device=x.device, dtype=torch.float32).exp()
+        # One sigma per sample, broadcast over tokens and channels.
+        sigma_per_sample = sigma.float().view(-1, 1, 1)
+        logits = self.content_proj(torch.cat([x, lq], dim=-1)) + (-alpha * sigma_per_sample)
+        blend = torch.sigmoid(logits)
+        gated_lq = (blend * lq).to(x.dtype)
+        return x + gated_lq
+
+
+class ResBlock(nn.Module):
+    """Pre-activation residual block: two rounds of GroupNorm -> SiLU -> 3x3 Conv, plus a skip connection."""
+
+    def __init__(self, channels: int, num_groups: int = 4, dtype=None, device=None, operations=None):
+        super().__init__()
+        factory = {"dtype": dtype, "device": device}
+        stages = []
+        # Two identical norm/activation/conv stages; channel count is preserved throughout.
+        for _ in range(2):
+            stages.append(operations.GroupNorm(num_groups, channels, **factory))
+            stages.append(nn.SiLU())
+            stages.append(operations.Conv2d(channels, channels, kernel_size=3, padding=1, **factory))
+        self.block = nn.Sequential(*stages)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        residual = self.block(x)
+        return x + residual
+
+
+class LQProjection2D(nn.Module):
+    """Turns the LQ latent into patch-aligned feature tokens, one set per injection point.
+
+    The latent is first brought onto the backbone's patch grid (nearest resize and,
+    when one patch covers several latent cells, a space-to-depth fold), run through a
+    small conv/ResBlock trunk, and finally projected by `num_outputs` independent heads.
+    """
+
+    def __init__(
+        self,
+        latent_channels: int,
+        hidden_dim: int = 512,
+        out_dim: int = 1536,
+        patch_size: int = 16,
+        sr_scale: int = 4,
+        latent_spatial_down_factor: int = 8,
+        num_res_blocks: int = 4,
+        num_outputs: int = 7,
+        interval: int = 2,
+        dtype=None, device=None, operations=None,
+    ):
+        super().__init__()
+        self.latent_channels = latent_channels
+        self.hidden_dim = hidden_dim
+        self.out_dim = out_dim
+        self.patch_size = patch_size
+        self.sr_scale = sr_scale
+        self.latent_spatial_down_factor = latent_spatial_down_factor
+        self.num_outputs = num_outputs
+        self.interval = interval
+
+        # Output pixels covered by one latent cell, measured in patches.
+        ratio = (sr_scale * latent_spatial_down_factor) / patch_size
+        self.z_to_patch_ratio = ratio
+        if ratio < 1:
+            # Several latent cells per patch: fold an f x f neighbourhood into channels.
+            fold = int(1 / ratio)
+            assert fold * ratio == 1.0
+            self.latent_fold_factor = fold
+            trunk_in_ch = latent_channels * fold * fold
+        else:
+            self.latent_fold_factor = 0
+            trunk_in_ch = latent_channels
+
+        factory = {"dtype": dtype, "device": device}
+        trunk = [
+            operations.Conv2d(trunk_in_ch, hidden_dim, kernel_size=3, padding=1, **factory),
+            nn.SiLU(),
+            operations.Conv2d(hidden_dim, hidden_dim, kernel_size=3, padding=1, **factory),
+        ]
+        trunk.extend(ResBlock(hidden_dim, operations=operations, **factory) for _ in range(num_res_blocks))
+        self.latent_proj = nn.Sequential(*trunk)
+
+        heads = [operations.Linear(hidden_dim, out_dim, **factory) for _ in range(num_outputs)]
+        self.output_heads = nn.ModuleList(heads)
+        gates = [SigmaAwareGatePerTokenPerDim(out_dim, operations=operations, **factory) for _ in range(num_outputs)]
+        self.gate_modules = nn.ModuleList(gates)
+
+    def is_gate_active(self, block_idx: int) -> bool:
+        """True for every `interval`-th block, starting at block 0."""
+        return block_idx % self.interval == 0
+
+    def output_index(self, block_idx: int) -> int:
+        """Index of the head / gate that belongs to `block_idx`."""
+        return block_idx // self.interval
+
+    def gate(self, x: torch.Tensor, lq_feature: torch.Tensor, sigma: torch.Tensor, out_idx: int) -> torch.Tensor:
+        """Blend `lq_feature` into `x` with the `out_idx`-th sigma-aware gate."""
+        gate_module = self.gate_modules[out_idx]
+        return gate_module(x, lq_feature, sigma)
+
+    def _align_latent_to_patch_grid(self, lq_latent: torch.Tensor, pH: int, pW: int) -> torch.Tensor:
+        """Resize / fold the latent onto a pH x pW grid and run it through the conv trunk."""
+        batch, channels = lq_latent.shape[:2]
+
+        if self.z_to_patch_ratio >= 1:
+            # One latent cell per patch (or coarser): a nearest resize is all that is needed.
+            grid = lq_latent
+            if lq_latent.shape[2] != pH or lq_latent.shape[3] != pW:
+                grid = F.interpolate(lq_latent, size=(pH, pW), mode="nearest")
+            return self.latent_proj(grid)
+
+        fold = self.latent_fold_factor
+        want_h, want_w = pH * fold, pW * fold
+        if lq_latent.shape[2] != want_h or lq_latent.shape[3] != want_w:
+            lq_latent = F.interpolate(lq_latent, size=(want_h, want_w), mode="nearest")
+        # Space-to-depth: [B, C, pH*f, pW*f] -> [B, C*f*f, pH, pW].
+        folded = lq_latent.reshape(batch, channels, pH, fold, pW, fold).permute(0, 1, 3, 5, 2, 4)
+        folded = folded.reshape(batch, channels * fold * fold, pH, pW)
+        return self.latent_proj(folded)
+
+    def forward(self, lq_latent: torch.Tensor, target_pH: int, target_pW: int) -> List[torch.Tensor]:
+        """Return one [B, pH*pW, out_dim] token tensor per output head."""
+        feat = self._align_latent_to_patch_grid(lq_latent, target_pH, target_pW)
+        batch, feat_ch, rows, cols = feat.shape
+        # Channels-last, then flatten the grid row-major into a token axis.
+        tokens = feat.permute(0, 2, 3, 1).contiguous().view(batch, rows * cols, feat_ch)
+        features = []
+        for head in self.output_heads:
+            features.append(head(tokens))
+        return features
+
+
+class PidNet(PixDiT_T2I):
+    """PixDiT_T2I + LQ injection (one sigma-gated feature inserted before each patch block).
+
+    On top of the parent model this class
+      * builds an `LQProjection2D` branch that turns the LQ latent into per-block features,
+      * switches both the patch-level and the pixel-level RoPE to the NTK-aware variant,
+        using a reference grid derived from `rope_ref_h` / `rope_ref_w`,
+      * overrides `_pre_patch_block` to gate the LQ feature into the patch stream.
+    """
+
+    def __init__(
+        self,
+        lq_latent_channels: int = 16,
+        lq_hidden_dim: int = 512,
+        lq_num_res_blocks: int = 4,
+        lq_interval: int = 2,
+        sr_scale: int = 4,
+        latent_spatial_down_factor: int = 8,
+        rope_ref_h: int = 1024, # NTK ref resolution in PIXEL units: 1024px / patch=16 -> grid_ref=64.
+        rope_ref_w: int = 1024,
+        image_model=None,
+        dtype=None, device=None, operations=None,
+        **pixdit_kwargs,
+    ):
+        # NOTE: `image_model` is accepted here and not forwarded to the parent.
+        super().__init__(dtype=dtype, device=device, operations=operations, **pixdit_kwargs)
+
+        # Reference resolution expressed in patches rather than pixels.
+        self.rope_ref_grid_h = rope_ref_h // self.patch_size
+        self.rope_ref_grid_w = rope_ref_w // self.patch_size
+
+        # Parent's PiTBlocks were built with plain RoPE — swap in NTK-aware.
+        def _pit_rope_fn(head_dim, h, w, device=None, dtype=torch.float32, **rope_opts):
+            return precompute_freqs_cis_2d(head_dim, h, w, ref_grid_h=self.rope_ref_grid_h, ref_grid_w=self.rope_ref_grid_w, device=device, dtype=dtype, **rope_opts)
+        for blk in self.pixel_blocks:
+            blk._rope_fn = _pit_rope_fn
+
+        # One LQ output per gated block (ceil division: blocks 0, interval, 2*interval, ...).
+        num_lq_outputs = (self.patch_depth + lq_interval - 1) // lq_interval
+        self.lq_proj = LQProjection2D(
+            latent_channels=lq_latent_channels,
+            hidden_dim=lq_hidden_dim,
+            out_dim=self.hidden_size,
+            patch_size=self.patch_size,
+            sr_scale=sr_scale,
+            latent_spatial_down_factor=latent_spatial_down_factor,
+            num_res_blocks=lq_num_res_blocks,
+            num_outputs=num_lq_outputs,
+            interval=lq_interval,
+            dtype=dtype,
+            device=device,
+            operations=operations,
+        )
+
+    def _fetch_patch_pos(self, height, width, device, dtype, **rope_opts):
+        """NTK-aware replacement for the parent's patch-grid RoPE table."""
+        return precompute_freqs_cis_2d(
+            self.hidden_size // self.num_groups,
+            height, width,
+            ref_grid_h=self.rope_ref_grid_h, ref_grid_w=self.rope_ref_grid_w,
+            device=device, dtype=dtype, **rope_opts,
+        )
+
+    def _pre_patch_block(self, s, i, pid_lq_features, pid_degrade_sigma, **kwargs):
+        """Gate the matching LQ feature into the patch stream before block `i`.
+
+        Blocks that are not on the injection interval, or that have no matching
+        feature, pass `s` through unchanged.
+        """
+        if not self.lq_proj.is_gate_active(i):
+            return s
+        out_idx = self.lq_proj.output_index(i)
+        if out_idx >= len(pid_lq_features):
+            return s
+        return self.lq_proj.gate(s, pid_lq_features[out_idx], pid_degrade_sigma, out_idx)
+
+    def _forward(self, x, timesteps, context=None, attention_mask=None, transformer_options={}, lq_latent=None, degrade_sigma=None, **kwargs):
+        """Validate the PiD conditioning, precompute the LQ features, then run the parent forward.
+
+        Raises:
+            ValueError: if `lq_latent` or `degrade_sigma` is missing, or if the latent's
+                channel count does not match this model variant.
+        """
+        if lq_latent is None:
+            raise ValueError("PidNet requires lq_latent — attach via PiDConditioning")
+        # Fail with a clear message instead of an AttributeError on `None.to(...)` below.
+        if degrade_sigma is None:
+            raise ValueError("PidNet requires degrade_sigma alongside lq_latent")
+        expected_c = self.lq_proj.latent_channels
+        if lq_latent.shape[1] != expected_c:
+            raise ValueError(
+                f"Input latent has {lq_latent.shape[1]} channels, this model variant expects {expected_c}. "
+                f"Flux1/SD3 = 16 channels, Flux2 = 128 channels."
+            )
+        B = x.shape[0]
+        # Match the backbone's pad_to_patch_size (round up) so the LQ grid lines up with the patch stream.
+        Hs = -(-x.shape[2] // self.patch_size)
+        Ws = -(-x.shape[3] // self.patch_size)
+
+        # Flatten to one float32 sigma per sample; a single value is repeated across the batch.
+        degrade_sigma = degrade_sigma.to(device=x.device, dtype=torch.float32).reshape(-1)
+        if degrade_sigma.numel() == 1 and B > 1:
+            degrade_sigma = degrade_sigma.expand(B).contiguous()
+
+        lq_features = self.lq_proj(lq_latent=lq_latent.to(x), target_pH=Hs, target_pW=Ws)
+
+        # The parent forwards these extra keywords to `_pre_patch_block` for every patch block.
+        return super()._forward(
+            x, timesteps,
+            context=context, attention_mask=attention_mask,
+            transformer_options=transformer_options,
+            pid_lq_features=lq_features,
+            pid_degrade_sigma=degrade_sigma,
+            **kwargs,
+        )
--- a/Show More
+++ b/Show More