mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-05-30 19:07:25 +08:00
ComfyUI 0.4.0 with transformers 5 support
This commit is contained in:
commit
5a1dda8bd0
@ -0,0 +1,3 @@
|
||||
..\python_embeded\python.exe -s ..\ComfyUI\main.py --windows-standalone-build --disable-api-nodes
|
||||
echo If you see this and ComfyUI did not start try updating your Nvidia Drivers to the latest.
|
||||
pause
|
||||
8
.github/ISSUE_TEMPLATE/bug-report.yml
vendored
8
.github/ISSUE_TEMPLATE/bug-report.yml
vendored
@ -8,13 +8,15 @@ body:
|
||||
Before submitting a **Bug Report**, please ensure the following:
|
||||
|
||||
- **1:** You are running the latest version of ComfyUI.
|
||||
- **2:** You have looked at the existing bug reports and made sure this isn't already reported.
|
||||
- **2:** You have your ComfyUI logs and relevant workflow on hand and will post them in this bug report.
|
||||
- **3:** You confirmed that the bug is not caused by a custom node. You can disable all custom nodes by passing
|
||||
`--disable-all-custom-nodes` command line argument.
|
||||
`--disable-all-custom-nodes` command line argument. If you have custom node try updating them to the latest version.
|
||||
- **4:** This is an actual bug in ComfyUI, not just a support question. A bug is when you can specify exact
|
||||
steps to replicate what went wrong and others will be able to repeat your steps and see the same issue happen.
|
||||
|
||||
If unsure, ask on the [ComfyUI Matrix Space](https://app.element.io/#/room/%23comfyui_space%3Amatrix.org) or the [Comfy Org Discord](https://discord.gg/comfyorg) first.
|
||||
## Very Important
|
||||
|
||||
Please make sure that you post ALL your ComfyUI logs in the bug report. A bug report without logs will likely be ignored.
|
||||
- type: checkboxes
|
||||
id: custom-nodes-test
|
||||
attributes:
|
||||
|
||||
21
.github/PULL_REQUEST_TEMPLATE/api-node.md
vendored
Normal file
21
.github/PULL_REQUEST_TEMPLATE/api-node.md
vendored
Normal file
@ -0,0 +1,21 @@
|
||||
<!-- API_NODE_PR_CHECKLIST: do not remove -->
|
||||
|
||||
## API Node PR Checklist
|
||||
|
||||
### Scope
|
||||
- [ ] **Is API Node Change**
|
||||
|
||||
### Pricing & Billing
|
||||
- [ ] **Need pricing update**
|
||||
- [ ] **No pricing update**
|
||||
|
||||
If **Need pricing update**:
|
||||
- [ ] Metronome rate cards updated
|
||||
- [ ] Auto‑billing tests updated and passing
|
||||
|
||||
### QA
|
||||
- [ ] **QA done**
|
||||
- [ ] **QA not required**
|
||||
|
||||
### Comms
|
||||
- [ ] Informed **Kosinkadink**
|
||||
58
.github/workflows/api-node-template.yml
vendored
Normal file
58
.github/workflows/api-node-template.yml
vendored
Normal file
@ -0,0 +1,58 @@
|
||||
name: Append API Node PR template
|
||||
|
||||
on:
|
||||
pull_request_target:
|
||||
types: [opened, reopened, synchronize, ready_for_review]
|
||||
paths:
|
||||
- 'comfy_api_nodes/**' # only run if these files changed
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: write
|
||||
|
||||
jobs:
|
||||
inject:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Ensure template exists and append to PR body
|
||||
uses: actions/github-script@v7
|
||||
with:
|
||||
script: |
|
||||
const { owner, repo } = context.repo;
|
||||
const number = context.payload.pull_request.number;
|
||||
const templatePath = '.github/PULL_REQUEST_TEMPLATE/api-node.md';
|
||||
const marker = '<!-- API_NODE_PR_CHECKLIST: do not remove -->';
|
||||
|
||||
const { data: pr } = await github.rest.pulls.get({ owner, repo, pull_number: number });
|
||||
|
||||
let templateText;
|
||||
try {
|
||||
const res = await github.rest.repos.getContent({
|
||||
owner,
|
||||
repo,
|
||||
path: templatePath,
|
||||
ref: pr.base.ref
|
||||
});
|
||||
const buf = Buffer.from(res.data.content, res.data.encoding || 'base64');
|
||||
templateText = buf.toString('utf8');
|
||||
} catch (e) {
|
||||
core.setFailed(`Required PR template not found at "${templatePath}" on ${pr.base.ref}. Please add it to the repo.`);
|
||||
return;
|
||||
}
|
||||
|
||||
// Enforce the presence of the marker inside the template (for idempotence)
|
||||
if (!templateText.includes(marker)) {
|
||||
core.setFailed(`Template at "${templatePath}" does not contain the required marker:\n${marker}\nAdd it so we can detect duplicates safely.`);
|
||||
return;
|
||||
}
|
||||
|
||||
// If the PR already contains the marker, do not append again.
|
||||
const body = pr.body || '';
|
||||
if (body.includes(marker)) {
|
||||
core.info('Template already present in PR body; nothing to inject.');
|
||||
return;
|
||||
}
|
||||
|
||||
const newBody = (body ? body + '\n\n' : '') + templateText + '\n';
|
||||
await github.rest.pulls.update({ owner, repo, pull_number: number, body: newBody });
|
||||
core.notice('API Node template appended to PR description.');
|
||||
4
.github/workflows/test.yml
vendored
4
.github/workflows/test.yml
vendored
@ -62,7 +62,7 @@ jobs:
|
||||
uv pip install --torch-backend=auto ".[dev]" --inexact
|
||||
- name: Lint for errors
|
||||
run: |
|
||||
pylint --rcfile=.pylintrc comfy/ comfy_extras/ comfy_api/ comfy_api_nodes/
|
||||
pylint comfy/ comfy_extras/ comfy_api/ comfy_api_nodes/
|
||||
- name: Run unit tests
|
||||
run: |
|
||||
pytest -v tests/unit
|
||||
@ -108,7 +108,7 @@ jobs:
|
||||
- name: Lint for errors
|
||||
run: |
|
||||
cd ${GITHUB_WORKSPACE}/src
|
||||
pylint --rcfile=.pylintrc comfy/ comfy_extras/ comfy_api/ comfy_api_nodes/
|
||||
pylint comfy/ comfy_extras/ comfy_api/ comfy_api_nodes/
|
||||
- name: Run unit tests
|
||||
run: |
|
||||
# only use dGPU
|
||||
|
||||
889
.pylintrc
889
.pylintrc
@ -1,889 +0,0 @@
|
||||
[MAIN]
|
||||
|
||||
# Analyse import fallback blocks. This can be used to support both Python 2 and
|
||||
# 3 compatible code, which means that the block might have code that exists
|
||||
# only in one or another interpreter, leading to false positives when analysed.
|
||||
analyse-fallback-blocks=no
|
||||
|
||||
# Clear in-memory caches upon conclusion of linting. Useful if running pylint
|
||||
# in a server-like mode.
|
||||
clear-cache-post-run=no
|
||||
|
||||
# Load and enable all available extensions. Use --list-extensions to see a list
|
||||
# all available extensions.
|
||||
#enable-all-extensions=
|
||||
|
||||
# In error mode, messages with a category besides ERROR or FATAL are
|
||||
# suppressed, and no reports are done by default. Error mode is compatible with
|
||||
# disabling specific errors.
|
||||
#errors-only=
|
||||
|
||||
# Always return a 0 (non-error) status code, even if lint errors are found.
|
||||
# This is primarily useful in continuous integration scripts.
|
||||
#exit-zero=
|
||||
|
||||
# A comma-separated list of package or module names from where C extensions may
|
||||
# be loaded. Extensions are loading into the active Python interpreter and may
|
||||
# run arbitrary code.
|
||||
extension-pkg-allow-list=
|
||||
|
||||
# A comma-separated list of package or module names from where C extensions may
|
||||
# be loaded. Extensions are loading into the active Python interpreter and may
|
||||
# run arbitrary code. (This is an alternative name to extension-pkg-allow-list
|
||||
# for backward compatibility.)
|
||||
extension-pkg-whitelist=cv2
|
||||
|
||||
# Return non-zero exit code if any of these messages/categories are detected,
|
||||
# even if score is above --fail-under value. Syntax same as enable. Messages
|
||||
# specified are enabled, while categories only check already-enabled messages.
|
||||
fail-on=
|
||||
|
||||
# Specify a score threshold under which the program will exit with error.
|
||||
fail-under=10
|
||||
|
||||
# Interpret the stdin as a python script, whose filename needs to be passed as
|
||||
# the module_or_package argument.
|
||||
#from-stdin=
|
||||
|
||||
# Files or directories to be skipped. They should be base names, not paths.
|
||||
ignore=CVS
|
||||
|
||||
# Add files or directories matching the regular expressions patterns to the
|
||||
# ignore-list. The regex matches against paths and can be in Posix or Windows
|
||||
# format. Because '\\' represents the directory delimiter on Windows systems,
|
||||
# it can't be used as an escape character.
|
||||
ignore-paths=^comfy/api/.*$
|
||||
|
||||
# Files or directories matching the regular expression patterns are skipped.
|
||||
# The regex matches against base names, not paths. The default value ignores
|
||||
# Emacs file locks
|
||||
ignore-patterns=^\.#
|
||||
|
||||
# List of module names for which member attributes should not be checked and
|
||||
# will not be imported (useful for modules/projects where namespaces are
|
||||
# manipulated during runtime and thus existing member attributes cannot be
|
||||
# deduced by static analysis). It supports qualified module names, as well as
|
||||
# Unix pattern matching.
|
||||
ignored-modules=sentencepiece.*,comfy.api,comfy.cmd.folder_paths
|
||||
|
||||
# Python code to execute, usually for sys.path manipulation such as
|
||||
# pygtk.require().
|
||||
init-hook='import sys; sys.path.insert(0, ".")'
|
||||
|
||||
# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
|
||||
# number of processors available to use, and will cap the count on Windows to
|
||||
# avoid hangs.
|
||||
jobs=1
|
||||
|
||||
# Control the amount of potential inferred values when inferring a single
|
||||
# object. This can help the performance when dealing with large functions or
|
||||
# complex, nested conditions.
|
||||
limit-inference-results=100
|
||||
|
||||
# List of plugins (as comma separated values of python module names) to load,
|
||||
# usually to register additional checkers.
|
||||
load-plugins=tests.absolute_import_checker,tests.main_pre_import_checker
|
||||
|
||||
# Pickle collected data for later comparisons.
|
||||
persistent=yes
|
||||
|
||||
# Resolve imports to .pyi stubs if available. May reduce no-member messages and
|
||||
# increase not-an-iterable messages.
|
||||
prefer-stubs=no
|
||||
|
||||
# Minimum Python version to use for version dependent checks. Will default to
|
||||
# the version used to run pylint.
|
||||
py-version=3.10
|
||||
|
||||
# Discover python modules and packages in the file system subtree.
|
||||
recursive=no
|
||||
|
||||
# Add paths to the list of the source roots. Supports globbing patterns. The
|
||||
# source root is an absolute path or a path relative to the current working
|
||||
# directory used to determine a package namespace for modules located under the
|
||||
# source root.
|
||||
source-roots=
|
||||
|
||||
# When enabled, pylint would attempt to guess common misconfiguration and emit
|
||||
# user-friendly hints instead of false-positive error messages.
|
||||
# suggestion-mode=yes
|
||||
|
||||
# Allow loading of arbitrary C extensions. Extensions are imported into the
|
||||
# active Python interpreter and may run arbitrary code.
|
||||
unsafe-load-any-extension=no
|
||||
|
||||
# In verbose mode, extra non-checker-related info will be displayed.
|
||||
# verbose=
|
||||
|
||||
|
||||
[BASIC]
|
||||
|
||||
# Naming style matching correct argument names.
|
||||
argument-naming-style=snake_case
|
||||
|
||||
# Regular expression matching correct argument names. Overrides argument-
|
||||
# naming-style. If left empty, argument names will be checked with the set
|
||||
# naming style.
|
||||
#argument-rgx=
|
||||
|
||||
# Naming style matching correct attribute names.
|
||||
attr-naming-style=snake_case
|
||||
|
||||
# Regular expression matching correct attribute names. Overrides attr-naming-
|
||||
# style. If left empty, attribute names will be checked with the set naming
|
||||
# style.
|
||||
#attr-rgx=
|
||||
|
||||
# Bad variable names which should always be refused, separated by a comma.
|
||||
bad-names=foo,
|
||||
bar,
|
||||
baz,
|
||||
toto,
|
||||
tutu,
|
||||
tata
|
||||
|
||||
# Bad variable names regexes, separated by a comma. If names match any regex,
|
||||
# they will always be refused
|
||||
bad-names-rgxs=
|
||||
|
||||
# Naming style matching correct class attribute names.
|
||||
class-attribute-naming-style=any
|
||||
|
||||
# Regular expression matching correct class attribute names. Overrides class-
|
||||
# attribute-naming-style. If left empty, class attribute names will be checked
|
||||
# with the set naming style.
|
||||
#class-attribute-rgx=
|
||||
|
||||
# Naming style matching correct class constant names.
|
||||
class-const-naming-style=UPPER_CASE
|
||||
|
||||
# Regular expression matching correct class constant names. Overrides class-
|
||||
# const-naming-style. If left empty, class constant names will be checked with
|
||||
# the set naming style.
|
||||
#class-const-rgx=
|
||||
|
||||
# Naming style matching correct class names.
|
||||
class-naming-style=PascalCase
|
||||
|
||||
# Regular expression matching correct class names. Overrides class-naming-
|
||||
# style. If left empty, class names will be checked with the set naming style.
|
||||
#class-rgx=
|
||||
|
||||
# Naming style matching correct constant names.
|
||||
const-naming-style=UPPER_CASE
|
||||
|
||||
# Regular expression matching correct constant names. Overrides const-naming-
|
||||
# style. If left empty, constant names will be checked with the set naming
|
||||
# style.
|
||||
#const-rgx=
|
||||
|
||||
# Minimum line length for functions/classes that require docstrings, shorter
|
||||
# ones are exempt.
|
||||
docstring-min-length=-1
|
||||
|
||||
# Naming style matching correct function names.
|
||||
function-naming-style=snake_case
|
||||
|
||||
# Regular expression matching correct function names. Overrides function-
|
||||
# naming-style. If left empty, function names will be checked with the set
|
||||
# naming style.
|
||||
#function-rgx=
|
||||
|
||||
# Good variable names which should always be accepted, separated by a comma.
|
||||
good-names=i,
|
||||
j,
|
||||
k,
|
||||
ex,
|
||||
Run,
|
||||
_
|
||||
|
||||
# Good variable names regexes, separated by a comma. If names match any regex,
|
||||
# they will always be accepted
|
||||
good-names-rgxs=
|
||||
|
||||
# Include a hint for the correct naming format with invalid-name.
|
||||
include-naming-hint=no
|
||||
|
||||
# Naming style matching correct inline iteration names.
|
||||
inlinevar-naming-style=any
|
||||
|
||||
# Regular expression matching correct inline iteration names. Overrides
|
||||
# inlinevar-naming-style. If left empty, inline iteration names will be checked
|
||||
# with the set naming style.
|
||||
#inlinevar-rgx=
|
||||
|
||||
# Naming style matching correct method names.
|
||||
method-naming-style=snake_case
|
||||
|
||||
# Regular expression matching correct method names. Overrides method-naming-
|
||||
# style. If left empty, method names will be checked with the set naming style.
|
||||
#method-rgx=
|
||||
|
||||
# Naming style matching correct module names.
|
||||
module-naming-style=snake_case
|
||||
|
||||
# Regular expression matching correct module names. Overrides module-naming-
|
||||
# style. If left empty, module names will be checked with the set naming style.
|
||||
#module-rgx=
|
||||
|
||||
# Colon-delimited sets of names that determine each other's naming style when
|
||||
# the name regexes allow several styles.
|
||||
name-group=
|
||||
|
||||
# Regular expression which should only match function or class names that do
|
||||
# not require a docstring.
|
||||
no-docstring-rgx=^_
|
||||
|
||||
# List of decorators that produce properties, such as abc.abstractproperty. Add
|
||||
# to this list to register other decorators that produce valid properties.
|
||||
# These decorators are taken in consideration only for invalid-name.
|
||||
property-classes=abc.abstractproperty
|
||||
|
||||
# Regular expression matching correct type alias names. If left empty, type
|
||||
# alias names will be checked with the set naming style.
|
||||
#typealias-rgx=
|
||||
|
||||
# Regular expression matching correct type variable names. If left empty, type
|
||||
# variable names will be checked with the set naming style.
|
||||
#typevar-rgx=
|
||||
|
||||
# Naming style matching correct variable names.
|
||||
variable-naming-style=snake_case
|
||||
|
||||
# Regular expression matching correct variable names. Overrides variable-
|
||||
# naming-style. If left empty, variable names will be checked with the set
|
||||
# naming style.
|
||||
#variable-rgx=
|
||||
|
||||
|
||||
[CLASSES]
|
||||
|
||||
# Warn about protected attribute access inside special methods
|
||||
check-protected-access-in-special-methods=no
|
||||
|
||||
# List of method names used to declare (i.e. assign) instance attributes.
|
||||
defining-attr-methods=__init__,
|
||||
__new__,
|
||||
setUp,
|
||||
asyncSetUp,
|
||||
__post_init__
|
||||
|
||||
# List of member names, which should be excluded from the protected access
|
||||
# warning.
|
||||
exclude-protected=_asdict,_fields,_replace,_source,_make,os._exit
|
||||
|
||||
# List of valid names for the first argument in a class method.
|
||||
valid-classmethod-first-arg=cls
|
||||
|
||||
# List of valid names for the first argument in a metaclass class method.
|
||||
valid-metaclass-classmethod-first-arg=mcs
|
||||
|
||||
|
||||
[DESIGN]
|
||||
|
||||
# List of regular expressions of class ancestor names to ignore when counting
|
||||
# public methods (see R0903)
|
||||
exclude-too-few-public-methods=
|
||||
|
||||
# List of qualified class names to ignore when counting class parents (see
|
||||
# R0901)
|
||||
ignored-parents=
|
||||
|
||||
# Maximum number of arguments for function / method.
|
||||
max-args=5
|
||||
|
||||
# Maximum number of attributes for a class (see R0902).
|
||||
max-attributes=7
|
||||
|
||||
# Maximum number of boolean expressions in an if statement (see R0916).
|
||||
max-bool-expr=5
|
||||
|
||||
# Maximum number of branch for function / method body.
|
||||
max-branches=12
|
||||
|
||||
# Maximum number of locals for function / method body.
|
||||
max-locals=15
|
||||
|
||||
# Maximum number of parents for a class (see R0901).
|
||||
max-parents=7
|
||||
|
||||
# Maximum number of public methods for a class (see R0904).
|
||||
max-public-methods=20
|
||||
|
||||
# Maximum number of return / yield for function / method body.
|
||||
max-returns=6
|
||||
|
||||
# Maximum number of statements in function / method body.
|
||||
max-statements=50
|
||||
|
||||
# Minimum number of public methods for a class (see R0903).
|
||||
min-public-methods=2
|
||||
|
||||
|
||||
[EXCEPTIONS]
|
||||
|
||||
# Exceptions that will emit a warning when caught.
|
||||
overgeneral-exceptions=builtins.BaseException,builtins.Exception
|
||||
|
||||
|
||||
[FORMAT]
|
||||
|
||||
# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
|
||||
expected-line-ending-format=
|
||||
|
||||
# Regexp for a line that is allowed to be longer than the limit.
|
||||
ignore-long-lines=^\s*(# )?<?https?://\S+>?$
|
||||
|
||||
# Number of spaces of indent required inside a hanging or continued line.
|
||||
indent-after-paren=4
|
||||
|
||||
# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
|
||||
# tab).
|
||||
indent-string=' '
|
||||
|
||||
# Maximum number of characters on a single line.
|
||||
max-line-length=100
|
||||
|
||||
# Maximum number of lines in a module.
|
||||
max-module-lines=1000
|
||||
|
||||
# Allow the body of a class to be on the same line as the declaration if body
|
||||
# contains single statement.
|
||||
single-line-class-stmt=no
|
||||
|
||||
# Allow the body of an if to be on the same line as the test if there is no
|
||||
# else.
|
||||
single-line-if-stmt=no
|
||||
|
||||
|
||||
[IMPORTS]
|
||||
|
||||
# List of modules that can be imported at any level, not just the top level
|
||||
# one.
|
||||
allow-any-import-level=
|
||||
|
||||
# Allow explicit reexports by alias from a package __init__.
|
||||
allow-reexport-from-package=no
|
||||
|
||||
# Allow wildcard imports from modules that define __all__.
|
||||
allow-wildcard-with-all=no
|
||||
|
||||
# Deprecated modules which should not be used, separated by a comma.
|
||||
deprecated-modules=
|
||||
|
||||
# Output a graph (.gv or any supported image format) of external dependencies
|
||||
# to the given file (report RP0402 must not be disabled).
|
||||
ext-import-graph=
|
||||
|
||||
# Output a graph (.gv or any supported image format) of all (i.e. internal and
|
||||
# external) dependencies to the given file (report RP0402 must not be
|
||||
# disabled).
|
||||
import-graph=
|
||||
|
||||
# Output a graph (.gv or any supported image format) of internal dependencies
|
||||
# to the given file (report RP0402 must not be disabled).
|
||||
int-import-graph=
|
||||
|
||||
# Force import order to recognize a module as part of the standard
|
||||
# compatibility libraries.
|
||||
known-standard-library=
|
||||
|
||||
# Force import order to recognize a module as part of a third party library.
|
||||
known-third-party=enchant
|
||||
|
||||
# Couples of modules and preferred modules, separated by a comma.
|
||||
preferred-modules=
|
||||
|
||||
|
||||
[LOGGING]
|
||||
|
||||
# The type of string formatting that logging methods do. `old` means using %
|
||||
# formatting, `new` is for `{}` formatting.
|
||||
logging-format-style=old
|
||||
|
||||
# Logging modules to check that the string format arguments are in logging
|
||||
# function parameter format.
|
||||
logging-modules=logging
|
||||
|
||||
|
||||
[MESSAGES CONTROL]
|
||||
|
||||
# Only show warnings with the listed confidence levels. Leave empty to show
|
||||
# all. Valid levels: HIGH, CONTROL_FLOW, INFERENCE, INFERENCE_FAILURE,
|
||||
# UNDEFINED.
|
||||
confidence=HIGH,
|
||||
CONTROL_FLOW,
|
||||
INFERENCE,
|
||||
INFERENCE_FAILURE,
|
||||
UNDEFINED
|
||||
|
||||
# Disable the message, report, category or checker with the given id(s). You
|
||||
# can either give multiple identifiers separated by comma (,) or put this
|
||||
# option multiple times (only on the command line, not in the configuration
|
||||
# file where it should appear only once). You can also use "--disable=all" to
|
||||
# disable everything first and then re-enable specific checks. For example, if
|
||||
# you want to run only the similarities checker, you can use "--disable=all
|
||||
# --enable=similarities". If you want to run only the classes checker, but have
|
||||
# no Warning level messages displayed, use "--disable=all --enable=classes
|
||||
# --disable=W".
|
||||
disable=raw-checker-failed,
|
||||
bad-inline-option,
|
||||
locally-disabled,
|
||||
file-ignored,
|
||||
suppressed-message,
|
||||
useless-suppression,
|
||||
deprecated-pragma,
|
||||
use-symbolic-message-instead,
|
||||
use-implicit-booleaness-not-comparison-to-string,
|
||||
use-implicit-booleaness-not-comparison-to-zero,
|
||||
useless-option-value,
|
||||
no-classmethod-decorator,
|
||||
no-staticmethod-decorator,
|
||||
useless-object-inheritance,
|
||||
property-with-parameters,
|
||||
cyclic-import,
|
||||
consider-using-from-import,
|
||||
consider-merging-isinstance,
|
||||
too-many-nested-blocks,
|
||||
simplifiable-if-statement,
|
||||
redefined-argument-from-local,
|
||||
no-else-return,
|
||||
consider-using-ternary,
|
||||
trailing-comma-tuple,
|
||||
stop-iteration-return,
|
||||
simplify-boolean-expression,
|
||||
inconsistent-return-statements,
|
||||
useless-return,
|
||||
consider-swap-variables,
|
||||
consider-using-join,
|
||||
consider-using-in,
|
||||
consider-using-get,
|
||||
chained-comparison,
|
||||
consider-using-dict-comprehension,
|
||||
consider-using-set-comprehension,
|
||||
simplifiable-if-expression,
|
||||
no-else-raise,
|
||||
unnecessary-comprehension,
|
||||
consider-using-sys-exit,
|
||||
no-else-break,
|
||||
no-else-continue,
|
||||
super-with-arguments,
|
||||
simplifiable-condition,
|
||||
condition-evals-to-constant,
|
||||
consider-using-generator,
|
||||
use-a-generator,
|
||||
consider-using-min-builtin,
|
||||
consider-using-max-builtin,
|
||||
consider-using-with,
|
||||
unnecessary-dict-index-lookup,
|
||||
use-list-literal,
|
||||
use-dict-literal,
|
||||
unnecessary-list-index-lookup,
|
||||
use-yield-from,
|
||||
duplicate-code,
|
||||
too-many-ancestors,
|
||||
too-many-instance-attributes,
|
||||
too-few-public-methods,
|
||||
too-many-public-methods,
|
||||
too-many-return-statements,
|
||||
too-many-branches,
|
||||
too-many-arguments,
|
||||
too-many-positional-arguments,
|
||||
too-many-locals,
|
||||
too-many-statements,
|
||||
too-many-boolean-expressions,
|
||||
too-many-positional,
|
||||
literal-comparison,
|
||||
comparison-with-itself,
|
||||
comparison-of-constants,
|
||||
wrong-spelling-in-comment,
|
||||
wrong-spelling-in-docstring,
|
||||
invalid-characters-in-docstring,
|
||||
unnecessary-dunder-call,
|
||||
bad-file-encoding,
|
||||
bad-classmethod-argument,
|
||||
bad-mcs-method-argument,
|
||||
bad-mcs-classmethod-argument,
|
||||
single-string-used-for-slots,
|
||||
unnecessary-lambda-assignment,
|
||||
unnecessary-direct-lambda-call,
|
||||
non-ascii-name,
|
||||
non-ascii-module-import,
|
||||
line-too-long,
|
||||
too-many-lines,
|
||||
trailing-whitespace,
|
||||
missing-final-newline,
|
||||
trailing-newlines,
|
||||
multiple-statements,
|
||||
superfluous-parens,
|
||||
mixed-line-endings,
|
||||
unexpected-line-ending-format,
|
||||
multiple-imports,
|
||||
wrong-import-order,
|
||||
ungrouped-imports,
|
||||
wrong-import-position,
|
||||
useless-import-alias,
|
||||
import-outside-toplevel,
|
||||
unnecessary-negation,
|
||||
consider-using-enumerate,
|
||||
consider-iterating-dictionary,
|
||||
consider-using-dict-items,
|
||||
use-maxsplit-arg,
|
||||
use-sequence-for-iteration,
|
||||
consider-using-f-string,
|
||||
use-implicit-booleaness-not-len,
|
||||
use-implicit-booleaness-not-comparison,
|
||||
invalid-name,
|
||||
disallowed-name,
|
||||
typevar-name-incorrect-variance,
|
||||
typevar-double-variance,
|
||||
typevar-name-mismatch,
|
||||
empty-docstring,
|
||||
missing-module-docstring,
|
||||
missing-class-docstring,
|
||||
missing-function-docstring,
|
||||
singleton-comparison,
|
||||
unidiomatic-typecheck,
|
||||
unknown-option-value,
|
||||
logging-not-lazy,
|
||||
logging-format-interpolation,
|
||||
logging-fstring-interpolation,
|
||||
fixme,
|
||||
keyword-arg-before-vararg,
|
||||
arguments-out-of-order,
|
||||
non-str-assignment-to-dunder-name,
|
||||
isinstance-second-argument-not-valid-type,
|
||||
kwarg-superseded-by-positional-arg,
|
||||
modified-iterating-list,
|
||||
attribute-defined-outside-init,
|
||||
bad-staticmethod-argument,
|
||||
protected-access,
|
||||
implicit-flag-alias,
|
||||
arguments-differ,
|
||||
signature-differs,
|
||||
abstract-method,
|
||||
super-init-not-called,
|
||||
non-parent-init-called,
|
||||
invalid-overridden-method,
|
||||
arguments-renamed,
|
||||
unused-private-member,
|
||||
overridden-final-method,
|
||||
subclassed-final-class,
|
||||
redefined-slots-in-subclass,
|
||||
super-without-brackets,
|
||||
useless-parent-delegation,
|
||||
global-variable-undefined,
|
||||
global-variable-not-assigned,
|
||||
global-statement,
|
||||
global-at-module-level,
|
||||
unused-import,
|
||||
unused-variable,
|
||||
unused-argument,
|
||||
unused-wildcard-import,
|
||||
redefined-outer-name,
|
||||
redefined-builtin,
|
||||
undefined-loop-variable,
|
||||
unbalanced-tuple-unpacking,
|
||||
cell-var-from-loop,
|
||||
possibly-unused-variable,
|
||||
self-cls-assignment,
|
||||
unbalanced-dict-unpacking,
|
||||
using-f-string-in-unsupported-version,
|
||||
using-final-decorator-in-unsupported-version,
|
||||
unnecessary-ellipsis,
|
||||
non-ascii-file-name,
|
||||
unnecessary-semicolon,
|
||||
bad-indentation,
|
||||
wildcard-import,
|
||||
reimported,
|
||||
import-self,
|
||||
preferred-module,
|
||||
misplaced-future,
|
||||
shadowed-import,
|
||||
missing-timeout,
|
||||
useless-with-lock,
|
||||
bare-except,
|
||||
duplicate-except,
|
||||
try-except-raise,
|
||||
raise-missing-from,
|
||||
binary-op-exception,
|
||||
raising-format-tuple,
|
||||
wrong-exception-operation,
|
||||
broad-exception-caught,
|
||||
broad-exception-raised,
|
||||
bad-open-mode,
|
||||
boolean-datetime,
|
||||
redundant-unittest-assert,
|
||||
bad-thread-instantiation,
|
||||
shallow-copy-environ,
|
||||
invalid-envvar-default,
|
||||
subprocess-popen-preexec-fn,
|
||||
subprocess-run-check,
|
||||
unspecified-encoding,
|
||||
forgotten-debug-statement,
|
||||
method-cache-max-size-none,
|
||||
bad-format-string-key,
|
||||
unused-format-string-key,
|
||||
bad-format-string,
|
||||
missing-format-argument-key,
|
||||
unused-format-string-argument,
|
||||
format-combined-specification,
|
||||
missing-format-attribute,
|
||||
invalid-format-index,
|
||||
duplicate-string-formatting-argument,
|
||||
f-string-without-interpolation,
|
||||
format-string-without-interpolation,
|
||||
anomalous-backslash-in-string,
|
||||
anomalous-unicode-escape-in-string,
|
||||
implicit-str-concat,
|
||||
inconsistent-quotes,
|
||||
redundant-u-string-prefix,
|
||||
useless-else-on-loop,
|
||||
unreachable,
|
||||
dangerous-default-value,
|
||||
pointless-statement,
|
||||
pointless-string-statement,
|
||||
expression-not-assigned,
|
||||
unnecessary-lambda,
|
||||
duplicate-key,
|
||||
exec-used,
|
||||
eval-used,
|
||||
confusing-with-statement,
|
||||
using-constant-test,
|
||||
missing-parentheses-for-call-in-test,
|
||||
self-assigning-variable,
|
||||
redeclared-assigned-name,
|
||||
assert-on-string-literal,
|
||||
duplicate-value,
|
||||
named-expr-without-context,
|
||||
pointless-exception-statement,
|
||||
return-in-finally,
|
||||
lost-exception,
|
||||
assert-on-tuple,
|
||||
unnecessary-pass,
|
||||
comparison-with-callable,
|
||||
nan-comparison,
|
||||
contextmanager-generator-missing-cleanup,
|
||||
nested-min-max,
|
||||
bad-chained-comparison,
|
||||
not-callable
|
||||
|
||||
# Enable the message, report, category or checker with the given id(s). You can
|
||||
# either give multiple identifier separated by comma (,) or put this option
|
||||
# multiple time (only on the command line, not in the configuration file where
|
||||
# it should appear only once). See also the "--disable" option for examples.
|
||||
enable=deprecated-module,
|
||||
deprecated-method,
|
||||
deprecated-argument,
|
||||
deprecated-class,
|
||||
deprecated-decorator,
|
||||
deprecated-attribute
|
||||
|
||||
[METHOD_ARGS]
|
||||
|
||||
# List of qualified names (i.e., library.method) which require a timeout
|
||||
# parameter e.g. 'requests.api.get,requests.api.post'
|
||||
timeout-methods=requests.api.delete,requests.api.get,requests.api.head,requests.api.options,requests.api.patch,requests.api.post,requests.api.put,requests.api.request
|
||||
|
||||
|
||||
[MISCELLANEOUS]
|
||||
|
||||
# List of note tags to take in consideration, separated by a comma.
|
||||
notes=FIXME,
|
||||
XXX,
|
||||
TODO
|
||||
|
||||
# Regular expression of note tags to take in consideration.
|
||||
notes-rgx=
|
||||
|
||||
|
||||
[REFACTORING]
|
||||
|
||||
# Maximum number of nested blocks for function / method body
|
||||
max-nested-blocks=5
|
||||
|
||||
# Complete name of functions that never returns. When checking for
|
||||
# inconsistent-return-statements if a never returning function is called then
|
||||
# it will be considered as an explicit return statement and no message will be
|
||||
# printed.
|
||||
never-returning-functions=sys.exit,argparse.parse_error
|
||||
|
||||
# Let 'consider-using-join' be raised when the separator to join on would be
|
||||
# non-empty (resulting in expected fixes of the type: ``"- " + " -
|
||||
# ".join(items)``)
|
||||
suggest-join-with-non-empty-separator=yes
|
||||
|
||||
|
||||
[REPORTS]
|
||||
|
||||
# Python expression which should return a score less than or equal to 10. You
|
||||
# have access to the variables 'fatal', 'error', 'warning', 'refactor',
|
||||
# 'convention', and 'info' which contain the number of messages in each
|
||||
# category, as well as 'statement' which is the total number of statements
|
||||
# analyzed. This score is used by the global evaluation report (RP0004).
|
||||
evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10))
|
||||
|
||||
# Template used to display messages. This is a python new-style format string
|
||||
# used to format the message information. See doc for all details.
|
||||
msg-template=
|
||||
|
||||
# Set the output format. Available formats are: text, parseable, colorized,
|
||||
# json2 (improved json format), json (old json format) and msvs (visual
|
||||
# studio). You can also give a reporter class, e.g.
|
||||
# mypackage.mymodule.MyReporterClass.
|
||||
#output-format=
|
||||
|
||||
# Tells whether to display a full report or only the messages.
|
||||
reports=no
|
||||
|
||||
# Activate the evaluation score.
|
||||
score=yes
|
||||
|
||||
|
||||
[SIMILARITIES]
|
||||
|
||||
# Comments are removed from the similarity computation
|
||||
ignore-comments=yes
|
||||
|
||||
# Docstrings are removed from the similarity computation
|
||||
ignore-docstrings=yes
|
||||
|
||||
# Imports are removed from the similarity computation
|
||||
ignore-imports=yes
|
||||
|
||||
# Signatures are removed from the similarity computation
|
||||
ignore-signatures=yes
|
||||
|
||||
# Minimum lines number of a similarity.
|
||||
min-similarity-lines=4
|
||||
|
||||
|
||||
[SPELLING]
|
||||
|
||||
# Limits count of emitted suggestions for spelling mistakes.
|
||||
max-spelling-suggestions=4
|
||||
|
||||
# Spelling dictionary name. No available dictionaries : You need to install
|
||||
# both the python package and the system dependency for enchant to work.
|
||||
spelling-dict=
|
||||
|
||||
# List of comma separated words that should be considered directives if they
|
||||
# appear at the beginning of a comment and should not be checked.
|
||||
spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy:
|
||||
|
||||
# List of comma separated words that should not be checked.
|
||||
spelling-ignore-words=
|
||||
|
||||
# A path to a file that contains the private dictionary; one word per line.
|
||||
spelling-private-dict-file=
|
||||
|
||||
# Tells whether to store unknown words to the private dictionary (see the
|
||||
# --spelling-private-dict-file option) instead of raising a message.
|
||||
spelling-store-unknown-words=no
|
||||
|
||||
|
||||
[STRING]
|
||||
|
||||
# This flag controls whether inconsistent-quotes generates a warning when the
|
||||
# character used as a quote delimiter is used inconsistently within a module.
|
||||
check-quote-consistency=no
|
||||
|
||||
# This flag controls whether the implicit-str-concat should generate a warning
|
||||
# on implicit string concatenation in sequences defined over several lines.
|
||||
check-str-concat-over-line-jumps=no
|
||||
|
||||
|
||||
[TYPECHECK]
|
||||
|
||||
# List of decorators that produce context managers, such as
|
||||
# contextlib.contextmanager. Add to this list to register other decorators that
|
||||
# produce valid context managers.
|
||||
contextmanager-decorators=contextlib.contextmanager
|
||||
|
||||
# List of members which are set dynamically and missed by pylint inference
|
||||
# system, and so shouldn't trigger E1101 when accessed. Python regular
|
||||
# expressions are accepted.
|
||||
generated-members=cv2.*,sentencepiece.*
|
||||
|
||||
# Tells whether to warn about missing members when the owner of the attribute
|
||||
# is inferred to be None.
|
||||
ignore-none=yes
|
||||
|
||||
# This flag controls whether pylint should warn about no-member and similar
|
||||
# checks whenever an opaque object is returned when inferring. The inference
|
||||
# can return multiple potential results while evaluating a Python object, but
|
||||
# some branches might not be evaluated, which results in partial inference. In
|
||||
# that case, it might be useful to still emit no-member and other checks for
|
||||
# the rest of the inferred objects.
|
||||
ignore-on-opaque-inference=yes
|
||||
|
||||
# List of symbolic message names to ignore for Mixin members.
|
||||
ignored-checks-for-mixins=no-member,
|
||||
not-async-context-manager,
|
||||
not-context-manager,
|
||||
attribute-defined-outside-init
|
||||
|
||||
# List of class names for which member attributes should not be checked (useful
|
||||
# for classes with dynamically set attributes). This supports the use of
|
||||
# qualified names.
|
||||
ignored-classes=optparse.Values,thread._local,_thread._local,argparse.Namespace
|
||||
|
||||
# Show a hint with possible names when a member name was not found. The aspect
|
||||
# of finding the hint is based on edit distance.
|
||||
missing-member-hint=yes
|
||||
|
||||
# The minimum edit distance a name should have in order to be considered a
|
||||
# similar match for a missing member name.
|
||||
missing-member-hint-distance=1
|
||||
|
||||
# The total number of similar names that should be taken in consideration when
|
||||
# showing a hint for a missing member.
|
||||
missing-member-max-choices=1
|
||||
|
||||
# Regex pattern to define which classes are considered mixins.
|
||||
mixin-class-rgx=.*[Mm]ixin
|
||||
|
||||
# List of decorators that change the signature of a decorated function.
|
||||
signature-mutators=
|
||||
|
||||
|
||||
[VARIABLES]
|
||||
|
||||
# List of additional names supposed to be defined in builtins. Remember that
|
||||
# you should avoid defining new builtins when possible.
|
||||
additional-builtins=
|
||||
|
||||
# Tells whether unused global variables should be treated as a violation.
|
||||
allow-global-unused-variables=yes
|
||||
|
||||
# List of names allowed to shadow builtins
|
||||
allowed-redefined-builtins=
|
||||
|
||||
# List of strings which can identify a callback function by name. A callback
|
||||
# name must start or end with one of those strings.
|
||||
callbacks=cb_,
|
||||
_cb
|
||||
|
||||
# A regular expression matching the name of dummy variables (i.e. expected to
|
||||
# not be used).
|
||||
dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
|
||||
|
||||
# Argument names that match this expression will be ignored.
|
||||
ignored-argument-names=_.*|^ignored_|^unused_
|
||||
|
||||
# Tells whether we should check for unused import in __init__ files.
|
||||
init-import=no
|
||||
|
||||
# List of qualified module names which can have objects that can redefine
|
||||
# builtins.
|
||||
redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
|
||||
|
||||
# Disable specific messages for specific files
|
||||
[file:paths/view/get/query_parameters.py]
|
||||
disable=duplicate-bases
|
||||
|
||||
[file:paths/view/get/parameters/parameter_1/schema.py]
|
||||
disable=no-self-argument
|
||||
|
||||
[file:schemas/schema.py]
|
||||
disable=no-self-argument,bad-super-call
|
||||
@ -1,6 +1,2 @@
|
||||
# Admins
|
||||
# From upstream
|
||||
* @comfyanonymous
|
||||
* @kosinkadink
|
||||
# For the fork
|
||||
* @doctorpangloss
|
||||
* @comfyanonymous @kosinkadink @guill @doctorpangloss
|
||||
|
||||
168
QUANTIZATION.md
Normal file
168
QUANTIZATION.md
Normal file
@ -0,0 +1,168 @@
|
||||
# The Comfy guide to Quantization
|
||||
|
||||
|
||||
## How does quantization work?
|
||||
|
||||
Quantization aims to map a high-precision value x_f to a lower precision format with minimal loss in accuracy. These smaller formats then serve to reduce the models memory footprint and increase throughput by using specialized hardware.
|
||||
|
||||
When simply converting a value from FP16 to FP8 using the round-nearest method we might hit two issues:
|
||||
- The dynamic range of FP16 (-65,504, 65,504) far exceeds FP8 formats like E4M3 (-448, 448) or E5M2 (-57,344, 57,344), potentially resulting in clipped values
|
||||
- The original values are concentrated in a small range (e.g. -1,1) leaving many FP8-bits "unused"
|
||||
|
||||
By using a scaling factor, we aim to map these values into the quantized-dtype range, making use of the full spectrum. One of the easiest approaches, and common, is using per-tensor absolute-maximum scaling.
|
||||
|
||||
```
|
||||
absmax = max(abs(tensor))
|
||||
scale = amax / max_dynamic_range_low_precision
|
||||
|
||||
# Quantization
|
||||
tensor_q = (tensor / scale).to(low_precision_dtype)
|
||||
|
||||
# De-Quantization
|
||||
tensor_dq = tensor_q.to(fp16) * scale
|
||||
|
||||
tensor_dq ~ tensor
|
||||
```
|
||||
|
||||
Given that additional information (scaling factor) is needed to "interpret" the quantized values, we describe those as derived datatypes.
|
||||
|
||||
|
||||
## Quantization in Comfy
|
||||
|
||||
```
|
||||
QuantizedTensor (torch.Tensor subclass)
|
||||
↓ __torch_dispatch__
|
||||
Two-Level Registry (generic + layout handlers)
|
||||
↓
|
||||
MixedPrecisionOps + Metadata Detection
|
||||
```
|
||||
|
||||
### Representation
|
||||
|
||||
To represent these derived datatypes, ComfyUI uses a subclass of torch.Tensor to implements these using the `QuantizedTensor` class found in `comfy/quant_ops.py`
|
||||
|
||||
A `Layout` class defines how a specific quantization format behaves:
|
||||
- Required parameters
|
||||
- Quantize method
|
||||
- De-Quantize method
|
||||
|
||||
```python
|
||||
from comfy.quant_ops import QuantizedLayout
|
||||
|
||||
class MyLayout(QuantizedLayout):
|
||||
@classmethod
|
||||
def quantize(cls, tensor, **kwargs):
|
||||
# Convert to quantized format
|
||||
qdata = ...
|
||||
params = {'scale': ..., 'orig_dtype': tensor.dtype}
|
||||
return qdata, params
|
||||
|
||||
@staticmethod
|
||||
def dequantize(qdata, scale, orig_dtype, **kwargs):
|
||||
return qdata.to(orig_dtype) * scale
|
||||
```
|
||||
|
||||
To then run operations using these QuantizedTensors we use two registry systems to define supported operations.
|
||||
The first is a **generic registry** that handles operations common to all quantized formats (e.g., `.to()`, `.clone()`, `.reshape()`).
|
||||
|
||||
The second registry is layout-specific and allows to implement fast-paths like nn.Linear.
|
||||
```python
|
||||
from comfy.quant_ops import register_layout_op
|
||||
|
||||
@register_layout_op(torch.ops.aten.linear.default, MyLayout)
|
||||
def my_linear(func, args, kwargs):
|
||||
# Extract tensors, call optimized kernel
|
||||
...
|
||||
```
|
||||
When `torch.nn.functional.linear()` is called with QuantizedTensor arguments, `__torch_dispatch__` automatically routes to the registered implementation.
|
||||
For any unsupported operation, QuantizedTensor will fallback to call `dequantize` and dispatch using the high-precision implementation.
|
||||
|
||||
|
||||
### Mixed Precision
|
||||
|
||||
The `MixedPrecisionOps` class (lines 542-648 in `comfy/ops.py`) enables per-layer quantization decisions, allowing different layers in a model to use different precisions. This is activated when a model config contains a `layer_quant_config` dictionary that specifies which layers should be quantized and how.
|
||||
|
||||
**Architecture:**
|
||||
|
||||
```python
|
||||
class MixedPrecisionOps(disable_weight_init):
|
||||
_layer_quant_config = {} # Maps layer names to quantization configs
|
||||
_compute_dtype = torch.bfloat16 # Default compute / dequantize precision
|
||||
```
|
||||
|
||||
**Key mechanism:**
|
||||
|
||||
The custom `Linear._load_from_state_dict()` method inspects each layer during model loading:
|
||||
- If the layer name is **not** in `_layer_quant_config`: load weight as regular tensor in `_compute_dtype`
|
||||
- If the layer name **is** in `_layer_quant_config`:
|
||||
- Load weight as `QuantizedTensor` with the specified layout (e.g., `TensorCoreFP8Layout`)
|
||||
- Load associated quantization parameters (scales, block_size, etc.)
|
||||
|
||||
**Why it's needed:**
|
||||
|
||||
Not all layers tolerate quantization equally. Sensitive operations like final projections can be kept in higher precision, while compute-heavy matmuls are quantized. This provides most of the performance benefits while maintaining quality.
|
||||
|
||||
The system is selected in `pick_operations()` when `model_config.layer_quant_config` is present, making it the highest-priority operation mode.
|
||||
|
||||
|
||||
## Checkpoint Format
|
||||
|
||||
Quantized checkpoints are stored as standard safetensors files with quantized weight tensors and associated scaling parameters, plus a `_quantization_metadata` JSON entry describing the quantization scheme.
|
||||
|
||||
The quantized checkpoint will contain the same layers as the original checkpoint but:
|
||||
- The weights are stored as quantized values, sometimes using a different storage datatype. E.g. uint8 container for fp8.
|
||||
- For each quantized weight a number of additional scaling parameters are stored alongside depending on the recipe.
|
||||
- We store a metadata.json in the metadata of the final safetensor containing the `_quantization_metadata` describing which layers are quantized and what layout has been used.
|
||||
|
||||
### Scaling Parameters details
|
||||
We define 4 possible scaling parameters that should cover most recipes in the near-future:
|
||||
- **weight_scale**: quantization scalers for the weights
|
||||
- **weight_scale_2**: global scalers in the context of double scaling
|
||||
- **pre_quant_scale**: scalers used for smoothing salient weights
|
||||
- **input_scale**: quantization scalers for the activations
|
||||
|
||||
| Format | Storage dtype | weight_scale | weight_scale_2 | pre_quant_scale | input_scale |
|
||||
|--------|---------------|--------------|----------------|-----------------|-------------|
|
||||
| float8_e4m3fn | float32 | float32 (scalar) | - | - | float32 (scalar) |
|
||||
|
||||
You can find the defined formats in `comfy/quant_ops.py` (QUANT_ALGOS).
|
||||
|
||||
### Quantization Metadata
|
||||
|
||||
The metadata stored alongside the checkpoint contains:
|
||||
- **format_version**: String to define a version of the standard
|
||||
- **layers**: A dictionary mapping layer names to their quantization format. The format string maps to the definitions found in `QUANT_ALGOS`.
|
||||
|
||||
Example:
|
||||
```json
|
||||
{
|
||||
"_quantization_metadata": {
|
||||
"format_version": "1.0",
|
||||
"layers": {
|
||||
"model.layers.0.mlp.up_proj": "float8_e4m3fn",
|
||||
"model.layers.0.mlp.down_proj": "float8_e4m3fn",
|
||||
"model.layers.1.mlp.up_proj": "float8_e4m3fn"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
## Creating Quantized Checkpoints
|
||||
|
||||
To create compatible checkpoints, use any quantization tool provided the output follows the checkpoint format described above and uses a layout defined in `QUANT_ALGOS`.
|
||||
|
||||
### Weight Quantization
|
||||
|
||||
Weight quantization is straightforward - compute the scaling factor directly from the weight tensor using the absolute maximum method described earlier. Each layer's weights are quantized independently and stored with their corresponding `weight_scale` parameter.
|
||||
|
||||
### Calibration (for Activation Quantization)
|
||||
|
||||
Activation quantization (e.g., for FP8 Tensor Core operations) requires `input_scale` parameters that cannot be determined from static weights alone. Since activation values depend on actual inputs, we use **post-training calibration (PTQ)**:
|
||||
|
||||
1. **Collect statistics**: Run inference on N representative samples
|
||||
2. **Track activations**: Record the absolute maximum (`amax`) of inputs to each quantized layer
|
||||
3. **Compute scales**: Derive `input_scale` from collected statistics
|
||||
4. **Store in checkpoint**: Save `input_scale` parameters alongside weights
|
||||
|
||||
The calibration dataset should be representative of your target use case. For diffusion models, this typically means a diverse set of prompts and generation parameters.
|
||||
72
README.md
72
README.md
@ -306,10 +306,28 @@ choco install -y vcredist2010 vcredist2013 vcredist140
|
||||
|
||||
Then, visit [NVIDIA.com's CUDA Toolkit Download Page](https://developer.nvidia.com/cuda-12-6-0-download-archive?target_os=Windows&target_arch=x86_64&target_version=Server2022&target_type=exe_network) and download and install the CUDA Toolkit. Verify it is correctly installed by running `nvcc --version`.
|
||||
|
||||
You are now ready to install Sage Attention 2 and Triton:
|
||||
You are now ready to install Sage Attention 2.
|
||||
|
||||
### Linux
|
||||
|
||||
```shell
|
||||
uv pip install --torch-backend=auto "comfyui[attention]@git+https://github.com/hiddenswitch/ComfyUI.git"
|
||||
uv pip install --no-build-isolation "sageattention@git+https://github.com/thu-ml/SageAttention.git"
|
||||
```
|
||||
|
||||
### Windows
|
||||
|
||||
Run this PowerShell script to install the correct version of Sage Attention for your installed PyTorch version:
|
||||
|
||||
```powershell
|
||||
$torch_version = (uv pip freeze | Select-String "torch==").ToString().Trim()
|
||||
$cuda_version = $torch_version -replace ".*(cu\d+).*", "`$1"
|
||||
if ($torch_version -match "\+cu") {
|
||||
$v = $torch_version -replace "torch==", ""
|
||||
$package_specifier = "sageattention==2.2.0+$($cuda_version)torch$v"
|
||||
uv pip install --find-links https://raw.githubusercontent.com/hiddenswitch/ComfyUI/main/pypi/sageattention_index.html $package_specifier
|
||||
} else {
|
||||
Write-Host "Could not determine CUDA version from torch version: $torch_version"
|
||||
}
|
||||
```
|
||||
|
||||
To start ComfyUI with it:
|
||||
@ -324,6 +342,26 @@ uv run comfyui --use-sage-attention
|
||||

|
||||
**With PyTorch Attention**
|
||||
|
||||
## Flash Attention
|
||||
|
||||
Flash Attention 2 is supported on Linux only.
|
||||
|
||||
```shell
|
||||
uv pip install --no-build-isolation flash_attn
|
||||
```
|
||||
|
||||
To start ComfyUI with it:
|
||||
|
||||
```shell
|
||||
uv run comfyui --use-flash-attention
|
||||
```
|
||||
|
||||

|
||||
**With SageAttention**
|
||||
|
||||

|
||||
**With PyTorch Attention**
|
||||
|
||||
## Cosmos Prompt Upsampling
|
||||
|
||||
The Cosmos prompt "upsampler," a fine tune of Mistral-Nemo-12b, correctly rewrites Cosmos prompts in the narrative style that NVIDIA's captioner used for the training data of Cosmos, improving generation results significantly.
|
||||
@ -359,6 +397,8 @@ There are two kinds of custom nodes: vanilla custom nodes, which generally expec
|
||||
|
||||
ComfyUI-Manager is a popular extension to help you install and manage other custom nodes. To install it, you will need `git` on your system.
|
||||
|
||||
#### Manual Install
|
||||
|
||||
The installation process for ComfyUI-Manager requires two steps: installing its Python dependencies, and then cloning its code into the `custom_nodes` directory.
|
||||
|
||||
1. **Install dependencies.**
|
||||
@ -381,6 +421,34 @@ The installation process for ComfyUI-Manager requires two steps: installing its
|
||||
3. **Restart ComfyUI.**
|
||||
After the cloning is complete, restart ComfyUI. You should now see a "Manager" button in the menu.
|
||||
|
||||
### PyPi Install
|
||||
|
||||
[ComfyUI-Manager](https://github.com/Comfy-Org/ComfyUI-Manager/tree/manager-v4)
|
||||
|
||||
**ComfyUI-Manager** is an extension that allows you to easily install, update, and manage custom nodes for ComfyUI.
|
||||
|
||||
### Setup
|
||||
|
||||
1. Install the manager dependencies:
|
||||
```bash
|
||||
pip install -r manager_requirements.txt
|
||||
```
|
||||
|
||||
2. Enable the manager with the `--enable-manager` flag when running ComfyUI:
|
||||
```bash
|
||||
python main.py --enable-manager
|
||||
```
|
||||
|
||||
### Command Line Options
|
||||
|
||||
| Flag | Description |
|
||||
|------|-------------|
|
||||
| `--enable-manager` | Enable ComfyUI-Manager |
|
||||
| `--enable-manager-legacy-ui` | Use the legacy manager UI instead of the new UI (requires `--enable-manager`) |
|
||||
| `--disable-manager-ui` | Disable the manager UI and endpoints while keeping background features like security checks and scheduled installation completion (requires `--enable-manager`) |
|
||||
|
||||
|
||||
|
||||
### Vanilla Custom Nodes
|
||||
|
||||
Clone the repository containing the custom nodes into `custom_nodes/` in your working directory and install its requirements, or use the manager.
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# This file is automatically generated by the build process when version is
|
||||
# updated in pyproject.toml.
|
||||
__version__ = "0.3.66"
|
||||
__version__ = "0.4.0"
|
||||
|
||||
# This deals with workspace issues
|
||||
from comfy_compatibility.workspace import auto_patch_workspace_and_restart
|
||||
|
||||
@ -62,8 +62,13 @@ class InternalRoutes:
|
||||
return web.json_response({"error": "Invalid directory type"}, status=400)
|
||||
|
||||
directory = get_directory_by_type(directory_type)
|
||||
|
||||
def is_visible_file(entry: os.DirEntry) -> bool:
|
||||
"""Filter out hidden files (e.g., .DS_Store on macOS)."""
|
||||
return entry.is_file() and not entry.name.startswith('.')
|
||||
|
||||
sorted_files = sorted(
|
||||
(entry for entry in os.scandir(directory) if entry.is_file()),
|
||||
(entry for entry in os.scandir(directory) if is_visible_file(entry)),
|
||||
key=lambda entry: -entry.stat().st_mtime
|
||||
)
|
||||
return web.json_response([entry.name for entry in sorted_files], status=200)
|
||||
|
||||
@ -1,23 +1,25 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import importlib.metadata
|
||||
import importlib.resources
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
import zipfile
|
||||
import importlib.metadata
|
||||
from dataclasses import dataclass
|
||||
from functools import cached_property
|
||||
from pathlib import Path
|
||||
from typing import TypedDict, Optional
|
||||
from typing import Dict, TypedDict, Optional
|
||||
|
||||
import requests
|
||||
from aiohttp import web
|
||||
from typing_extensions import NotRequired
|
||||
|
||||
from ..cli_args import DEFAULT_VERSION_STRING
|
||||
from ..cmd.folder_paths import add_model_folder_path # pylint: disable=import-error
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
REQUEST_TIMEOUT = 10 # seconds
|
||||
|
||||
@ -145,7 +147,7 @@ class FrontendManager:
|
||||
# this isn't used the way it says
|
||||
return importlib.metadata.version("comfyui_frontend_package")
|
||||
except Exception as exc_info:
|
||||
return "1.23.4"
|
||||
return "1.33.10"
|
||||
|
||||
@classmethod
|
||||
def get_installed_templates_version(cls) -> str:
|
||||
@ -154,12 +156,12 @@ class FrontendManager:
|
||||
templates_version_str = importlib.metadata.version("comfyui-workflow-templates")
|
||||
return templates_version_str
|
||||
except Exception:
|
||||
return None
|
||||
return ""
|
||||
|
||||
@classmethod
|
||||
def get_required_templates_version(cls) -> str:
|
||||
# returns a stub, since this isn't a helpful check in this environment
|
||||
return "0.1.95"
|
||||
return "0.7.51"
|
||||
|
||||
@classmethod
|
||||
def default_frontend_path(cls) -> str:
|
||||
@ -172,7 +174,45 @@ class FrontendManager:
|
||||
return ""
|
||||
|
||||
@classmethod
|
||||
def templates_path(cls) -> str:
|
||||
def template_asset_map(cls) -> Optional[Dict[str, str]]:
|
||||
"""Return a mapping of template asset names to their absolute paths."""
|
||||
try:
|
||||
from comfyui_workflow_templates import (
|
||||
get_asset_path,
|
||||
iter_templates,
|
||||
)
|
||||
except ImportError:
|
||||
logger.error(
|
||||
f"comfyui-workflow-templates is not installed. {frontend_install_warning_message()}"
|
||||
)
|
||||
return None
|
||||
|
||||
try:
|
||||
template_entries = list(iter_templates())
|
||||
except Exception as exc:
|
||||
logger.error(f"Failed to enumerate workflow templates: {exc}")
|
||||
return None
|
||||
|
||||
asset_map: Dict[str, str] = {}
|
||||
try:
|
||||
for entry in template_entries:
|
||||
for asset in entry.assets:
|
||||
asset_map[asset.filename] = get_asset_path(
|
||||
entry.template_id, asset.filename
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.error(f"Failed to resolve template asset paths: {exc}")
|
||||
return None
|
||||
|
||||
if not asset_map:
|
||||
logger.error("No workflow template assets found. Did the packages install correctly?")
|
||||
return None
|
||||
|
||||
return asset_map
|
||||
|
||||
@classmethod
|
||||
def legacy_templates_path(cls) -> Optional[str]:
|
||||
"""Return the legacy templates directory shipped inside the meta package."""
|
||||
try:
|
||||
import comfyui_workflow_templates
|
||||
|
||||
@ -299,3 +339,18 @@ class FrontendManager:
|
||||
logger.info("Falling back to the default frontend.")
|
||||
check_frontend_version()
|
||||
return cls.default_frontend_path()
|
||||
|
||||
@classmethod
|
||||
def template_asset_handler(cls):
|
||||
assets = cls.template_asset_map()
|
||||
if not assets:
|
||||
return None
|
||||
|
||||
async def serve_template(request: web.Request) -> web.StreamResponse:
|
||||
rel_path = request.match_info.get("path", "")
|
||||
target = assets.get(rel_path)
|
||||
if target is None:
|
||||
raise web.HTTPNotFound()
|
||||
return web.FileResponse(target)
|
||||
|
||||
return serve_template
|
||||
|
||||
112
comfy/app/subgraph_manager.py
Normal file
112
comfy/app/subgraph_manager.py
Normal file
@ -0,0 +1,112 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TypedDict
|
||||
import os
|
||||
from ..cmd import folder_paths
|
||||
import glob
|
||||
from aiohttp import web
|
||||
import hashlib
|
||||
|
||||
|
||||
class Source:
|
||||
custom_node = "custom_node"
|
||||
|
||||
class SubgraphEntry(TypedDict):
|
||||
source: str
|
||||
"""
|
||||
Source of subgraph - custom_nodes vs templates.
|
||||
"""
|
||||
path: str
|
||||
"""
|
||||
Relative path of the subgraph file.
|
||||
For custom nodes, will be the relative directory like <custom_node_dir>/subgraphs/<name>.json
|
||||
"""
|
||||
name: str
|
||||
"""
|
||||
Name of subgraph file.
|
||||
"""
|
||||
info: CustomNodeSubgraphEntryInfo
|
||||
"""
|
||||
Additional info about subgraph; in the case of custom_nodes, will contain nodepack name
|
||||
"""
|
||||
data: str
|
||||
|
||||
class CustomNodeSubgraphEntryInfo(TypedDict):
|
||||
node_pack: str
|
||||
"""Node pack name."""
|
||||
|
||||
class SubgraphManager:
|
||||
def __init__(self):
|
||||
self.cached_custom_node_subgraphs: dict[str, SubgraphEntry] | None = None
|
||||
|
||||
async def load_entry_data(self, entry: SubgraphEntry):
|
||||
with open(entry['path'], 'r') as f:
|
||||
entry['data'] = f.read()
|
||||
return entry
|
||||
|
||||
async def sanitize_entry(self, entry: SubgraphEntry | None, remove_data=False) -> SubgraphEntry | None:
|
||||
if entry is None:
|
||||
return None
|
||||
entry = entry.copy()
|
||||
entry.pop('path', None)
|
||||
if remove_data:
|
||||
entry.pop('data', None)
|
||||
return entry
|
||||
|
||||
async def sanitize_entries(self, entries: dict[str, SubgraphEntry], remove_data=False) -> dict[str, SubgraphEntry]:
|
||||
entries = entries.copy()
|
||||
for key in list(entries.keys()):
|
||||
entries[key] = await self.sanitize_entry(entries[key], remove_data)
|
||||
return entries
|
||||
|
||||
async def get_custom_node_subgraphs(self, loadedModules, force_reload=False):
|
||||
# if not forced to reload and cached, return cache
|
||||
if not force_reload and self.cached_custom_node_subgraphs is not None:
|
||||
return self.cached_custom_node_subgraphs
|
||||
# Load subgraphs from custom nodes
|
||||
subfolder = "subgraphs"
|
||||
subgraphs_dict: dict[str, SubgraphEntry] = {}
|
||||
|
||||
for folder in folder_paths.get_folder_paths("custom_nodes"):
|
||||
pattern = os.path.join(folder, f"*/{subfolder}/*.json")
|
||||
matched_files = glob.glob(pattern)
|
||||
for file in matched_files:
|
||||
# replace backslashes with forward slashes
|
||||
file = file.replace('\\', '/')
|
||||
info: CustomNodeSubgraphEntryInfo = {
|
||||
"node_pack": "custom_nodes." + file.split('/')[-3]
|
||||
}
|
||||
source = Source.custom_node
|
||||
# hash source + path to make sure id will be as unique as possible, but
|
||||
# reproducible across backend reloads
|
||||
id = hashlib.sha256(f"{source}{file}".encode()).hexdigest()
|
||||
entry: SubgraphEntry = {
|
||||
"source": Source.custom_node,
|
||||
"name": os.path.splitext(os.path.basename(file))[0],
|
||||
"path": file,
|
||||
"info": info,
|
||||
}
|
||||
subgraphs_dict[id] = entry
|
||||
self.cached_custom_node_subgraphs = subgraphs_dict
|
||||
return subgraphs_dict
|
||||
|
||||
async def get_custom_node_subgraph(self, id: str, loadedModules):
|
||||
subgraphs = await self.get_custom_node_subgraphs(loadedModules)
|
||||
entry: SubgraphEntry = subgraphs.get(id, None)
|
||||
if entry is not None and entry.get('data', None) is None:
|
||||
await self.load_entry_data(entry)
|
||||
return entry
|
||||
|
||||
def add_routes(self, routes, loadedModules):
|
||||
@routes.get("/global_subgraphs")
|
||||
async def get_global_subgraphs(request):
|
||||
subgraphs_dict = await self.get_custom_node_subgraphs(loadedModules)
|
||||
# NOTE: we may want to include other sources of global subgraphs such as templates in the future;
|
||||
# that's the reasoning for the current implementation
|
||||
return web.json_response(await self.sanitize_entries(subgraphs_dict, remove_data=True))
|
||||
|
||||
@routes.get("/global_subgraphs/{id}")
|
||||
async def get_global_subgraph(request):
|
||||
id = request.match_info.get("id", None)
|
||||
subgraph = await self.get_custom_node_subgraph(id, loadedModules)
|
||||
return web.json_response(await self.sanitize_entry(subgraph))
|
||||
@ -60,6 +60,9 @@ class UserManager():
|
||||
user = "default"
|
||||
if args.multi_user and "comfy-user" in request.headers:
|
||||
user = request.headers["comfy-user"]
|
||||
# Block System Users (use same error message to prevent probing)
|
||||
if user.startswith(folder_paths.SYSTEM_USER_PREFIX):
|
||||
raise KeyError("Unknown user: " + user)
|
||||
|
||||
if user not in self.users:
|
||||
raise KeyError("Unknown user: " + user)
|
||||
@ -67,15 +70,16 @@ class UserManager():
|
||||
return user
|
||||
|
||||
def get_request_user_filepath(self, request, file, type="userdata", create_dir=True):
|
||||
user_directory = folder_paths.get_user_directory()
|
||||
|
||||
if type == "userdata":
|
||||
root_dir = user_directory
|
||||
root_dir = folder_paths.get_user_directory()
|
||||
else:
|
||||
raise KeyError("Unknown filepath type:" + type)
|
||||
|
||||
user = self.get_request_user_id(request)
|
||||
path = user_root = os.path.abspath(os.path.join(root_dir, user))
|
||||
user_root = folder_paths.get_public_user_directory(user)
|
||||
if user_root is None:
|
||||
return None
|
||||
path = user_root
|
||||
|
||||
# prevent leaving /{type}
|
||||
if os.path.commonpath((root_dir, user_root)) != root_dir:
|
||||
@ -102,7 +106,11 @@ class UserManager():
|
||||
name = name.strip()
|
||||
if not name:
|
||||
raise ValueError("username not provided")
|
||||
if name.startswith(folder_paths.SYSTEM_USER_PREFIX):
|
||||
raise ValueError("System User prefix not allowed")
|
||||
user_id = re.sub("[^a-zA-Z0-9-_]+", '-', name)
|
||||
if user_id.startswith(folder_paths.SYSTEM_USER_PREFIX):
|
||||
raise ValueError("System User prefix not allowed")
|
||||
user_id = user_id + "_" + str(uuid.uuid4())
|
||||
|
||||
self.users[user_id] = name
|
||||
@ -133,7 +141,10 @@ class UserManager():
|
||||
if username in self.users.values():
|
||||
return web.json_response({"error": "Duplicate username."}, status=400)
|
||||
|
||||
user_id = self.add_user(username)
|
||||
try:
|
||||
user_id = self.add_user(username)
|
||||
except ValueError as e:
|
||||
return web.json_response({"error": str(e)}, status=400)
|
||||
return web.json_response(user_id)
|
||||
|
||||
@routes.get("/userdata")
|
||||
@ -425,7 +436,7 @@ class UserManager():
|
||||
return source
|
||||
|
||||
dest = get_user_data_path(request, check_exists=False, param="dest")
|
||||
if not isinstance(source, str):
|
||||
if not isinstance(dest, str):
|
||||
return dest
|
||||
|
||||
overwrite = request.query.get("overwrite", 'true') != "false"
|
||||
|
||||
@ -415,7 +415,8 @@ class ControlNet(nn.Module):
|
||||
out_middle = []
|
||||
|
||||
if self.num_classes is not None:
|
||||
assert y.shape[0] == x.shape[0], "There may be a mismatch between the ControlNet and Diffusion models being used"
|
||||
if y is None:
|
||||
raise ValueError("y is None, did you try using a controlnet for SDXL on SD1?")
|
||||
emb = emb + self.label_emb(y)
|
||||
|
||||
h = x
|
||||
|
||||
@ -105,6 +105,7 @@ def _create_parser() -> EnhancedConfigArgParser:
|
||||
cache_group.add_argument("--cache-classic", action="store_true", help="WARNING: Unused. Use the old style (aggressive) caching.")
|
||||
cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.")
|
||||
cache_group.add_argument("--cache-none", action="store_true", help="Reduced RAM/VRAM usage at the expense of executing every node for each run.")
|
||||
cache_group.add_argument("--cache-ram", nargs='?', const=4.0, type=float, default=0, help="Use RAM pressure caching with the specified headroom threshold. If available RAM drops below the threhold the cache remove large items to free RAM. Default 4GB")
|
||||
attn_group = parser.add_mutually_exclusive_group()
|
||||
attn_group.add_argument("--use-split-cross-attention", action="store_true",
|
||||
help="Use the split cross attention optimization. Ignored when xformers is used.")
|
||||
@ -120,6 +121,10 @@ def _create_parser() -> EnhancedConfigArgParser:
|
||||
upcast = parser.add_mutually_exclusive_group()
|
||||
upcast.add_argument("--force-upcast-attention", action="store_true", help="Force enable attention upcasting, please report if it fixes black images.")
|
||||
upcast.add_argument("--dont-upcast-attention", action="store_true", help="Disable all upcasting of attention. Should be unnecessary except for debugging.")
|
||||
parser.add_argument("--enable-manager", action="store_true", help="Enable the ComfyUI-Manager feature.")
|
||||
manager_group = parser.add_mutually_exclusive_group()
|
||||
manager_group.add_argument("--disable-manager-ui", action="store_true", help="Disables only the ComfyUI-Manager UI and endpoints. Scheduled installations and similar background tasks will still operate.")
|
||||
manager_group.add_argument("--enable-manager-legacy-ui", action="store_true", help="Enables the legacy UI of ComfyUI-Manager")
|
||||
vram_group = parser.add_mutually_exclusive_group()
|
||||
vram_group.add_argument("--gpu-only", action="store_true",
|
||||
help="Store and run everything (text encoders/CLIP models, etc... on the GPU).")
|
||||
@ -131,8 +136,9 @@ def _create_parser() -> EnhancedConfigArgParser:
|
||||
vram_group.add_argument("--novram", action="store_true", help="When lowvram isn't enough.")
|
||||
vram_group.add_argument("--cpu", action="store_true", help="To use the CPU for everything (slow).")
|
||||
|
||||
parser.add_argument("--reserve-vram", type=float, default=None, help="Set the amount of vram in GB you want to reserve for use by your OS/other software. By default some amount is reserved depending on your OS.")
|
||||
parser.add_argument("--async-offload", action="store_true", help="Use async weight offloading.")
|
||||
parser.add_argument("--reserve-vram", type=float, default=0, help="Set the amount of vram in GB you want to reserve for use by your OS/other software. Defaults to 0.0, since this isn't conceptually robust anyway.")
|
||||
parser.add_argument("--async-offload", nargs='?', const=2, type=int, default=None, metavar="NUM_STREAMS", help="Use async weight offloading. An optional argument controls the amount of offload streams. Default is 2. Enabled by default on Nvidia.")
|
||||
parser.add_argument("--disable-async-offload", action="store_true", help="Disable async weight offloading.")
|
||||
parser.add_argument("--force-non-blocking", action="store_true", help="Force ComfyUI to use non-blocking operations for all applicable tensors. This may improve performance on some non-Nvidia systems but can cause issues with some workflows.")
|
||||
parser.add_argument("--default-hashing-function", type=str, choices=['md5', 'sha1', 'sha256', 'sha512'], default='sha256', help="Allows you to choose the hash function to use for duplicate filename / contents comparison. Default is sha256.")
|
||||
parser.add_argument("--disable-smart-memory", action="store_true",
|
||||
@ -141,6 +147,7 @@ def _create_parser() -> EnhancedConfigArgParser:
|
||||
help="Make pytorch use slower deterministic algorithms when it can. Note that this might not make images deterministic in all cases.")
|
||||
|
||||
parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help=f"Enable some untested and potentially quality deteriorating optimizations. Pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: {' '.join([f.value for f in PerformanceFeature])}", default=set())
|
||||
parser.add_argument("--disable-pinned-memory", action="store_true", help="Disable pinned memory use.")
|
||||
|
||||
parser.add_argument("--mmap-torch-files", action="store_true", help="Use mmap when loading ckpt/pt files.")
|
||||
parser.add_argument("--disable-mmap", action="store_true", help="Don't use mmap when loading safetensors.")
|
||||
@ -155,7 +162,7 @@ def _create_parser() -> EnhancedConfigArgParser:
|
||||
parser.add_argument("--disable-all-custom-nodes", action="store_true", help="Disable loading all custom nodes.")
|
||||
parser.add_argument("--whitelist-custom-nodes", type=str, action=FlattenAndAppendAction, nargs='+', default=[], help="Specify custom node folders to load even when --disable-all-custom-nodes is enabled.")
|
||||
parser.add_argument("--blacklist-custom-nodes", type=str, action=FlattenAndAppendAction, nargs='+', default=[], help="Specify custom node folders to never load. Accepts shell-style globs.")
|
||||
parser.add_argument("--disable-api-nodes", action="store_true", help="Disable loading all api nodes.")
|
||||
parser.add_argument("--disable-api-nodes", action="store_true", help="Disable loading all api nodes. Also prevents the frontend from communicating with the internet.")
|
||||
parser.add_argument("--enable-eval", action="store_true", help="Enable nodes that can evaluate Python code in workflows.")
|
||||
|
||||
parser.add_argument("--multi-user", action="store_true", help="Enables per-user storage.")
|
||||
@ -196,6 +203,7 @@ def _create_parser() -> EnhancedConfigArgParser:
|
||||
parser.add_argument("--otel-exporter-otlp-endpoint", type=str, default=None, env_var="OTEL_EXPORTER_OTLP_ENDPOINT", help="A base endpoint URL for any signal type, with an optionally-specified port number. Helpful for when you're sending more than one signal to the same endpoint and want one environment variable to control the endpoint.")
|
||||
parser.add_argument("--force-channels-last", action="store_true", help="Force channels last format when inferencing the models.")
|
||||
parser.add_argument("--force-hf-local-dir-mode", action="store_true", help="Download repos from huggingface.co to the models/huggingface directory with the \"local_dir\" argument instead of models/huggingface_cache with the \"cache_dir\" argument, recreating the traditional file structure.")
|
||||
parser.add_argument("--enable-video-to-image-fallback", action="store_true", help="Enable fallback to convert video frames to images for models that do not natively support video inputs.")
|
||||
|
||||
parser.add_argument(
|
||||
"--front-end-version",
|
||||
@ -298,6 +306,7 @@ def _create_parser() -> EnhancedConfigArgParser:
|
||||
except Exception as exc:
|
||||
logger.error("Failed to load custom config plugin", exc_info=exc)
|
||||
|
||||
parser.add_argument("--disable-requests-caching", action="store_true", help="Disable requests caching (useful for testing)")
|
||||
return parser
|
||||
|
||||
|
||||
|
||||
@ -73,6 +73,7 @@ class Configuration(dict):
|
||||
temp_directory (Optional[str]): Temporary directory for processing.
|
||||
input_directory (Optional[str]): Directory for input files. When this is a relative path, it will be looked up relative to the cwd (current working directory) and all of the base_paths.
|
||||
auto_launch (bool): Auto-launch UI in the default browser. Defaults to False.
|
||||
disable_auto_launch (bool): Disable auto launching the browser.
|
||||
cuda_device (Optional[int]): CUDA device ID. None means default device.
|
||||
cuda_malloc (bool): Enable cudaMallocAsync. Defaults to True in applicable setups.
|
||||
disable_cuda_malloc (bool): Disable cudaMallocAsync.
|
||||
@ -100,6 +101,7 @@ class Configuration(dict):
|
||||
disable_ipex_optimize (bool): Disable IPEX optimization for Intel GPUs.
|
||||
preview_method (LatentPreviewMethod): Method for generating previews. Defaults to "auto".
|
||||
cache_lru (int): Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.
|
||||
cache_ram (float): Use RAM pressure caching with the specified headroom threshold.
|
||||
use_split_cross_attention (bool): Use split cross-attention optimization.
|
||||
use_quad_cross_attention (bool): Use sub-quadratic cross-attention optimization.
|
||||
use_pytorch_cross_attention (bool): Use PyTorch's cross-attention function.
|
||||
@ -147,14 +149,19 @@ class Configuration(dict):
|
||||
user_directory (Optional[str]): Set the ComfyUI user directory with an absolute path.
|
||||
log_stdout (bool): Send normal process output to stdout instead of stderr (default)
|
||||
panic_when (list[str]): List of fully qualified exception class names to panic (sys.exit(1)) when a workflow raises it.
|
||||
enable_manager (bool): Enable the ComfyUI-Manager feature.
|
||||
disable_manager_ui (bool): Disables only the ComfyUI-Manager UI.
|
||||
enable_manager_legacy_ui (bool): Enables the legacy UI of ComfyUI-Manager.
|
||||
enable_compress_response_body (bool): Enable compressing response body.
|
||||
workflows (list[str]): Execute the API workflow(s) specified in the provided files. For each workflow, its outputs will be printed to a line to standard out. Application logging will be redirected to standard error. Use `-` to signify standard in.
|
||||
disable_pinned_memory (bool): Disable pinned memory use.
|
||||
fp8_e8m0fnu_unet (bool): Store unet weights in fp8_e8m0fnu.
|
||||
bf16_text_enc (bool): Store text encoder weights in bf16.
|
||||
supports_fp8_compute (bool): ComfyUI will act like if the device supports fp8 compute.
|
||||
cache_classic (bool): WARNING: Unused. Use the old style (aggressive) caching.
|
||||
cache_none (bool): Reduced RAM/VRAM usage at the expense of executing every node for each run.
|
||||
async_offload (bool): Use async weight offloading.
|
||||
async_offload (Optional[int]): Use async weight offloading. An optional argument controls the amount of offload streams.
|
||||
disable_async_offload (bool): Disable async weight offloading.
|
||||
force_non_blocking (bool): Force ComfyUI to use non-blocking operations for all applicable tensors. This may improve performance on some non-Nvidia systems but can cause issues with some workflows.
|
||||
default_hashing_function (str): Allows you to choose the hash function to use for duplicate filename / contents comparison. Default is sha256.
|
||||
mmap_torch_files (bool): Use mmap when loading ckpt/pt files.
|
||||
@ -189,6 +196,7 @@ class Configuration(dict):
|
||||
self.temp_directory: Optional[str] = None
|
||||
self.input_directory: Optional[str] = None
|
||||
self.auto_launch: bool = False
|
||||
self.disable_auto_launch: bool = False
|
||||
self.cuda_device: Optional[int] = None
|
||||
self.cuda_malloc: bool = True
|
||||
self.disable_cuda_malloc: bool = True
|
||||
@ -227,7 +235,8 @@ class Configuration(dict):
|
||||
self.novram: bool = False
|
||||
self.cpu: bool = False
|
||||
self.fast: set[PerformanceFeature] = set()
|
||||
self.reserve_vram: Optional[float] = None
|
||||
# reserve 0, because this has been exceptionally buggy
|
||||
self.reserve_vram: float = 0.0
|
||||
self.disable_smart_memory: bool = False
|
||||
self.deterministic: bool = False
|
||||
self.dont_print_server: bool = False
|
||||
@ -250,6 +259,7 @@ class Configuration(dict):
|
||||
self.external_address: Optional[str] = None
|
||||
self.disable_known_models: bool = False
|
||||
self.max_queue_size: int = 65536
|
||||
self.disable_requests_caching: bool = False
|
||||
self.force_channels_last: bool = False
|
||||
self.force_hf_local_dir_mode = False
|
||||
self.preview_size: int = 512
|
||||
@ -271,13 +281,19 @@ class Configuration(dict):
|
||||
self.user_directory: Optional[str] = None
|
||||
self.panic_when: list[str] = []
|
||||
self.workflows: list[str] = []
|
||||
self.enable_manager: bool = False
|
||||
self.disable_manager_ui: bool = False
|
||||
self.enable_manager_legacy_ui: bool = False
|
||||
self.disable_pinned_memory: bool = False
|
||||
|
||||
self.fp8_e8m0fnu_unet: bool = False
|
||||
self.bf16_text_enc: bool = False
|
||||
self.supports_fp8_compute: bool = False
|
||||
self.cache_classic: bool = False
|
||||
self.cache_none: bool = False
|
||||
self.async_offload: bool = False
|
||||
self.cache_ram: float = 0.0
|
||||
self.async_offload: Optional[int] = None
|
||||
self.disable_async_offload: bool = False
|
||||
self.force_non_blocking: bool = False
|
||||
self.default_hashing_function: str = 'sha256'
|
||||
self.mmap_torch_files: bool = False
|
||||
@ -288,8 +304,9 @@ class Configuration(dict):
|
||||
self.comfy_api_base: str = "https://api.comfy.org"
|
||||
self.database_url: str = db_config()
|
||||
self.default_device: Optional[int] = None
|
||||
self.block_runtime_package_installation = None
|
||||
self.block_runtime_package_installation: bool = False
|
||||
self.enable_eval: Optional[bool] = False
|
||||
self.enable_video_to_image_fallback: bool = False
|
||||
|
||||
for key, value in kwargs.items():
|
||||
self[key] = value
|
||||
|
||||
@ -4,6 +4,7 @@ from ..cmd.main_pre import tracer
|
||||
|
||||
import asyncio
|
||||
import concurrent.futures
|
||||
import contextlib
|
||||
import copy
|
||||
import gc
|
||||
import json
|
||||
@ -12,24 +13,26 @@ import threading
|
||||
import uuid
|
||||
from asyncio import get_event_loop
|
||||
from multiprocessing import RLock
|
||||
from typing import Optional, Generator
|
||||
from typing import Optional, Literal
|
||||
|
||||
from opentelemetry import context, propagate
|
||||
from opentelemetry.context import Context, attach, detach
|
||||
from opentelemetry.trace import Status, StatusCode
|
||||
from .async_progress_iterable import _ProgressHandler, QueuePromptWithProgress
|
||||
|
||||
from .async_progress_iterable import QueuePromptWithProgress
|
||||
from .client_types import V1QueuePromptResponse
|
||||
from ..api.components.schema.prompt import PromptDict
|
||||
from ..cli_args_types import Configuration
|
||||
from ..cli_args import default_configuration
|
||||
from ..cmd.folder_paths import init_default_paths # pylint: disable=import-error
|
||||
from ..component_model.executor_types import ExecutorToClientProgress
|
||||
from ..component_model.make_mutable import make_mutable
|
||||
from ..component_model.queue_types import QueueItem, ExecutionStatus, TaskInvocation
|
||||
from ..component_model.queue_types import QueueItem, ExecutionStatus, TaskInvocation, QueueTuple, ExtraData
|
||||
from ..distributed.executors import ContextVarExecutor
|
||||
from ..distributed.history import History
|
||||
from ..distributed.process_pool_executor import ProcessPoolExecutor
|
||||
from ..distributed.server_stub import ServerStub
|
||||
from ..execution_context import current_execution_context, context_configuration
|
||||
from ..component_model.configuration import MODEL_MANAGEMENT_ARGS, requires_process_pool_executor
|
||||
|
||||
_prompt_executor = threading.local()
|
||||
|
||||
@ -45,6 +48,7 @@ def _execute_prompt(
|
||||
configuration: Configuration | None,
|
||||
partial_execution_targets: Optional[list[str]] = None) -> dict:
|
||||
configuration = copy.deepcopy(configuration) if configuration is not None else None
|
||||
from ..execution_context import current_execution_context
|
||||
execution_context = current_execution_context()
|
||||
if len(execution_context.folder_names_and_paths) == 0 or configuration is not None:
|
||||
init_default_paths(execution_context.folder_names_and_paths, configuration, replace_existing=True)
|
||||
@ -66,6 +70,7 @@ async def __execute_prompt(
|
||||
progress_handler: ExecutorToClientProgress | None,
|
||||
configuration: Configuration | None,
|
||||
partial_execution_targets: list[str] | None) -> dict:
|
||||
from ..execution_context import context_configuration
|
||||
with context_configuration(configuration):
|
||||
return await ___execute_prompt(prompt, prompt_id, client_id, span_context, progress_handler, partial_execution_targets)
|
||||
|
||||
@ -143,45 +148,187 @@ def _cleanup(invalidate_nodes=True):
|
||||
|
||||
class Comfy:
|
||||
"""
|
||||
This manages a single-threaded executor to run long-running or blocking workflows
|
||||
asynchronously without blocking the asyncio event loop. It initializes a PromptExecutor
|
||||
in a dedicated thread for executing prompts and handling server-stub communications.
|
||||
Example usage:
|
||||
A client for running ComfyUI workflows within a Python application.
|
||||
|
||||
Asynchronous (non-blocking) usage with async-await:
|
||||
```
|
||||
# Write a workflow, or enable Dev Mode in the UI settings, then Save (API Format) to get the workflow in your
|
||||
# workspace.
|
||||
This client allows you to execute ComfyUI workflows (in API JSON format) programmatically.
|
||||
It manages the execution environment, including model loading and resource cleanup.
|
||||
|
||||
### Configuration and Executors
|
||||
|
||||
ComfyUI relies on global state for model management (e.g., loaded models in VRAM). To handle this safely, `Comfy`
|
||||
executes workflows using one of two strategies based on your `configuration`:
|
||||
|
||||
1. **ContextVarExecutor (Default)**: Runs in a thread pool within the current process.
|
||||
- **Pros**: Efficient, low overhead.
|
||||
- **Cons**: Modifies global state in the current process.
|
||||
- **Use Case**: Standard workflows where you are happy with the default ComfyUI settings or sharing state.
|
||||
|
||||
2. **ProcessPoolExecutor**: Runs in a separate process.
|
||||
- **Pros**: Complete isolation. Configuration changes (like `lowvram`) do not affect the main process.
|
||||
- **Cons**: Higher overhead (process startup).
|
||||
- **Use Case**: Required when `configuration` overrides arguments that affect global model management state.
|
||||
These arguments include: `lowvram`, `highvram`, `cpu`, `gpu_only`, `deterministic`, `directml`,
|
||||
various `fp8`/`fp16`/`bf16` settings, and attention optimizations (e.g., `use_flash_attention`).
|
||||
|
||||
The client automatically selects `ProcessPoolExecutor` if you provide a `configuration` that modifies any of these
|
||||
global settings, unless you explicitly pass an `executor`.
|
||||
|
||||
### Parameters
|
||||
|
||||
- **configuration** (`Optional[Configuration]`): A dictionary of arguments to override defaults.
|
||||
See `comfy.cli_args_types.Configuration`.
|
||||
Example: `{"lowvram": True}` or `{"gpu_only": True}`.
|
||||
- **progress_handler** (`Optional[ExecutorToClientProgress]`): callback handler for progress updates and previews.
|
||||
- **max_workers** (`int`): Maximum number of concurrent workflows (default: 1).
|
||||
- **executor** (`Optional[Union[Executor, str]]`): Explicitly define the executor to use.
|
||||
- Pass an instance of `ProcessPoolExecutor` or `ContextVarExecutor`.
|
||||
- Pass the string `"ProcessPoolExecutor"` or `"ContextVarExecutor"` to force initialization of that type.
|
||||
- If `None` (default), the best executor is chosen based on `configuration`.
|
||||
|
||||
### Examples
|
||||
|
||||
#### 1. Running a Workflow (Basic)
|
||||
|
||||
This example executes a simple workflow and prints the path of the saved image.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from comfy.client.embedded_comfy_client import Comfy
|
||||
|
||||
# A simple API format workflow (simplified for brevity)
|
||||
prompt_dict = {
|
||||
"1": {"class_type": "KSamplerAdvanced", ...}
|
||||
...
|
||||
"3": {
|
||||
"class_type": "KSampler",
|
||||
"inputs": {
|
||||
"seed": 8566257, "steps": 20, "cfg": 8, "sampler_name": "euler",
|
||||
"scheduler": "normal", "denoise": 1,
|
||||
"model": ["4", 0], "positive": ["6", 0], "negative": ["7", 0],
|
||||
"latent_image": ["5", 0]
|
||||
}
|
||||
},
|
||||
"4": {"class_type": "CheckpointLoaderSimple", "inputs": {"ckpt_name": "v1-5-pruned-emaonly.safetensors"}},
|
||||
"5": {"class_type": "EmptyLatentImage", "inputs": {"width": 512, "height": 512, "batch_size": 1}},
|
||||
"6": {"class_type": "CLIPTextEncode", "inputs": {"text": "masterpiece best quality girl", "clip": ["4", 1]}},
|
||||
"7": {"class_type": "CLIPTextEncode", "inputs": {"text": "bad hands", "clip": ["4", 1]}},
|
||||
"8": {"class_type": "VAEDecode", "inputs": {"samples": ["3", 0], "vae": ["4", 2]}},
|
||||
"9": {"class_type": "SaveImage", "inputs": {"filename_prefix": "ComfyUI_API", "images": ["8", 0]}}
|
||||
}
|
||||
# Validate your workflow (the prompt)
|
||||
from comfy.api.components.schema.prompt import Prompt
|
||||
prompt = Prompt.validate(prompt_dict)
|
||||
# Then use the client to run your workflow. This will start, then stop, a local ComfyUI workflow executor.
|
||||
# It does not connect to a remote server.
|
||||
|
||||
async def main():
|
||||
async with EmbeddedComfyClient() as client:
|
||||
outputs = await client.queue_prompt(prompt)
|
||||
print(outputs)
|
||||
print("Now that we've exited the with statement, all your VRAM has been cleared from ComfyUI")
|
||||
if __name__ == "__main__"
|
||||
asyncio.run(main())
|
||||
# Using default configuration (runs in-process)
|
||||
async with Comfy() as client:
|
||||
# Queue the prompt and await the result
|
||||
outputs = await client.queue_prompt(prompt_dict)
|
||||
|
||||
# Retrieve the output path from the SaveImage node (Node ID "9")
|
||||
image_path = outputs["9"]["images"][0]["abs_path"]
|
||||
print(f"Image saved to: {image_path}")
|
||||
|
||||
# asyncio.run(main())
|
||||
```
|
||||
|
||||
In order to use this in blocking methods, learn more about asyncio online.
|
||||
#### 2. Using Custom Configuration (Isolated Process)
|
||||
|
||||
To run with specific settings like `lowvram`, pass the configuration. This implies `ProcessPoolExecutor`.
|
||||
|
||||
```python
|
||||
async def run_lowvram():
|
||||
# This will spawn a new process with lowvram enabled
|
||||
async with Comfy(configuration={"lowvram": True}) as client:
|
||||
outputs = await client.queue_prompt(prompt_dict)
|
||||
print("Finished lowvram generation")
|
||||
```
|
||||
|
||||
#### 3. Programmatically Building Workflows
|
||||
|
||||
You can use `GraphBuilder` constructing workflows with a more pythonic API.
|
||||
|
||||
```python
|
||||
from comfy_execution.graph_utils import GraphBuilder
|
||||
|
||||
def build_graph():
|
||||
builder = GraphBuilder()
|
||||
checkpoint = builder.node("CheckpointLoaderSimple", ckpt_name="v1-5-pruned-emaonly.safetensors")
|
||||
latent = builder.node("EmptyLatentImage", width=512, height=512, batch_size=1)
|
||||
pos = builder.node("CLIPTextEncode", text="masterpiece", clip=checkpoint.out(1))
|
||||
neg = builder.node("CLIPTextEncode", text="bad quality", clip=checkpoint.out(1))
|
||||
|
||||
sampler = builder.node("KSampler",
|
||||
seed=42, steps=20, cfg=8, sampler_name="euler", scheduler="normal", denoise=1,
|
||||
model=checkpoint.out(0), positive=pos.out(0), negative=neg.out(0), latent_image=latent.out(0)
|
||||
)
|
||||
vae = builder.node("VAEDecode", samples=sampler.out(0), vae=checkpoint.out(2))
|
||||
builder.node("SaveImage", filename_prefix="Generated", images=vae.out(0))
|
||||
return builder.finalize()
|
||||
|
||||
async def run_builder():
|
||||
prompt = build_graph()
|
||||
async with Comfy() as client:
|
||||
await client.queue_prompt(prompt)
|
||||
```
|
||||
|
||||
#### 4. Streaming Progress and Previews
|
||||
|
||||
To receive real-time progress updates and preview images (e.g., step-by-step decoding).
|
||||
|
||||
```python
|
||||
from comfy.component_model.queue_types import BinaryEventTypes
|
||||
|
||||
async def run_streaming():
|
||||
async with Comfy() as client:
|
||||
# Get a task that supports progress iteration
|
||||
task = client.queue_with_progress(prompt_dict)
|
||||
|
||||
async for notification in task.progress():
|
||||
if notification.event == BinaryEventTypes.PREVIEW_IMAGE_WITH_METADATA:
|
||||
# 'data' contains the PIL Image and metadata
|
||||
image, metadata = notification.data
|
||||
print(f"Received preview: {image.size}")
|
||||
elif notification.event == "progress":
|
||||
print(f"Step: {notification.data['value']}/{notification.data['max']}")
|
||||
|
||||
# Await final result
|
||||
result = await task.get()
|
||||
```
|
||||
"""
|
||||
|
||||
def __init__(self, configuration: Optional[Configuration] = None, progress_handler: Optional[ExecutorToClientProgress] = None, max_workers: int = 1, executor: ProcessPoolExecutor | ContextVarExecutor = None):
|
||||
def __init__(self, configuration: Optional[Configuration] = None, progress_handler: Optional[ExecutorToClientProgress] = None, max_workers: int = 1, executor: ProcessPoolExecutor | ContextVarExecutor | Literal["ProcessPoolExecutor", "ContextVarExecutor"] = None):
|
||||
self._progress_handler = progress_handler or ServerStub()
|
||||
self._executor = executor or ContextVarExecutor(max_workers=max_workers)
|
||||
self._default_configuration = default_configuration()
|
||||
self._configuration = configuration
|
||||
|
||||
need_process_pool = requires_process_pool_executor(configuration)
|
||||
|
||||
if executor is None:
|
||||
if need_process_pool:
|
||||
self._executor = ProcessPoolExecutor(max_workers=max_workers)
|
||||
self._owns_executor = True
|
||||
else:
|
||||
self._executor = ContextVarExecutor(max_workers=max_workers)
|
||||
self._owns_executor = True
|
||||
elif isinstance(executor, str):
|
||||
self._owns_executor = True
|
||||
if executor == "ProcessPoolExecutor":
|
||||
self._executor = ProcessPoolExecutor(max_workers=max_workers)
|
||||
elif executor == "ContextVarExecutor":
|
||||
if need_process_pool:
|
||||
raise ValueError(f"Configuration requires ProcessPoolExecutor but ContextVarExecutor was requested. Configuration keys causing this: {[k for k in MODEL_MANAGEMENT_ARGS if configuration.get(k) != self._default_configuration.get(k)]}")
|
||||
self._executor = ContextVarExecutor(max_workers=max_workers)
|
||||
else:
|
||||
raise ValueError(f"Unknown executor type string: {executor}")
|
||||
else:
|
||||
# Executor instance passed
|
||||
self._owns_executor = False
|
||||
self._executor = executor
|
||||
if need_process_pool and not isinstance(executor, ProcessPoolExecutor):
|
||||
raise ValueError(f"Configuration requires ProcessPoolExecutor but {type(executor).__name__} was passed. Configuration keys causing this: {[k for k in MODEL_MANAGEMENT_ARGS if configuration.get(k) != self._default_configuration.get(k)]}")
|
||||
|
||||
self._is_running = False
|
||||
self._task_count_lock = RLock()
|
||||
self._task_count = 0
|
||||
self._history = History()
|
||||
self._context_stack = []
|
||||
self._exit_stack = None
|
||||
self._async_exit_stack = None
|
||||
|
||||
@property
|
||||
def is_running(self) -> bool:
|
||||
@ -192,10 +339,13 @@ class Comfy:
|
||||
return self._task_count
|
||||
|
||||
def __enter__(self):
|
||||
self._exit_stack = contextlib.ExitStack()
|
||||
self._is_running = True
|
||||
from ..execution_context import context_configuration
|
||||
cm = context_configuration(self._configuration)
|
||||
cm.__enter__()
|
||||
self._context_stack.append(cm)
|
||||
self._exit_stack.enter_context(cm)
|
||||
if self._owns_executor:
|
||||
self._exit_stack.enter_context(self._executor)
|
||||
return self
|
||||
|
||||
@property
|
||||
@ -207,15 +357,17 @@ class Comfy:
|
||||
|
||||
def __exit__(self, *args):
|
||||
get_event_loop().run_in_executor(self._executor, _cleanup)
|
||||
self._executor.shutdown(wait=True)
|
||||
self._is_running = False
|
||||
self._context_stack.pop().__exit__(*args)
|
||||
self._exit_stack.__exit__(*args)
|
||||
|
||||
async def __aenter__(self):
|
||||
self._async_exit_stack = contextlib.AsyncExitStack()
|
||||
self._is_running = True
|
||||
from ..execution_context import context_configuration
|
||||
cm = context_configuration(self._configuration)
|
||||
cm.__enter__()
|
||||
self._context_stack.append(cm)
|
||||
self._async_exit_stack.enter_context(cm)
|
||||
if self._owns_executor:
|
||||
self._async_exit_stack.enter_context(self._executor)
|
||||
return self
|
||||
|
||||
async def __aexit__(self, *args):
|
||||
@ -225,9 +377,8 @@ class Comfy:
|
||||
|
||||
await get_event_loop().run_in_executor(self._executor, _cleanup)
|
||||
|
||||
self._executor.shutdown(wait=True)
|
||||
self._is_running = False
|
||||
self._context_stack.pop().__exit__(*args)
|
||||
await self._async_exit_stack.__aexit__(*args)
|
||||
|
||||
async def queue_prompt_api(self,
|
||||
prompt: PromptDict | str | dict,
|
||||
@ -304,16 +455,20 @@ class Comfy:
|
||||
|
||||
fut = concurrent.futures.Future()
|
||||
fut.set_result(TaskInvocation(prompt_id, copy.deepcopy(outputs), ExecutionStatus('success', True, [])))
|
||||
self._history.put(QueueItem(queue_tuple=(float(self._task_count), prompt_id, prompt, {}, []), completed=fut), outputs, ExecutionStatus('success', True, []))
|
||||
self._history.put(QueueItem(queue_tuple=QueueTuple(float(self._task_count), prompt_id, prompt, ExtraData(), [], {}), completed=fut), outputs, ExecutionStatus('success', True, []))
|
||||
return outputs
|
||||
except Exception as exc_info:
|
||||
fut = concurrent.futures.Future()
|
||||
fut.set_exception(exc_info)
|
||||
self._history.put(QueueItem(queue_tuple=(float(self._task_count), prompt_id, prompt, {}, []), completed=fut), {}, ExecutionStatus('error', False, [str(exc_info)]))
|
||||
self._history.put(QueueItem(queue_tuple=QueueTuple(float(self._task_count), prompt_id, prompt, ExtraData(), [], {}), completed=fut), {}, ExecutionStatus('error', False, [str(exc_info)]))
|
||||
raise exc_info
|
||||
finally:
|
||||
with self._task_count_lock:
|
||||
self._task_count -= 1
|
||||
|
||||
def __str__(self):
|
||||
diff = {k: v for k, v in (self._configuration or {}).items() if v != self._default_configuration.get(k)}
|
||||
return f"<Comfy task_count={self.task_count} configuration={diff} executor={self._executor}>"
|
||||
|
||||
|
||||
EmbeddedComfyClient = Comfy
|
||||
|
||||
@ -71,18 +71,23 @@ def cuda_malloc_supported():
|
||||
return True
|
||||
|
||||
|
||||
# todo: is this really how we want to get the torch version?
|
||||
version = ""
|
||||
|
||||
try:
|
||||
torch_spec = importlib.util.find_spec("torch")
|
||||
for folder in torch_spec.submodule_search_locations:
|
||||
ver_file = os.path.join(folder, "version.py")
|
||||
if os.path.isfile(ver_file):
|
||||
spec = importlib.util.spec_from_file_location("torch_version_import", ver_file)
|
||||
module = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(module)
|
||||
version = module.__version__
|
||||
except:
|
||||
pass
|
||||
|
||||
if not args.cuda_malloc:
|
||||
try:
|
||||
version = ""
|
||||
torch_spec = importlib.util.find_spec("torch")
|
||||
for folder in torch_spec.submodule_search_locations:
|
||||
ver_file = os.path.join(folder, "version.py")
|
||||
if os.path.isfile(ver_file):
|
||||
spec = importlib.util.spec_from_file_location("torch_version_import", ver_file)
|
||||
module = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(module)
|
||||
version = module.__version__
|
||||
|
||||
if int(version[0]) >= 2 and "+cu" in version: # enable by default for torch version 2.0 and up only on cuda torch
|
||||
if PerformanceFeature.AutoTune not in args.fast: # Autotune has issues with cuda malloc
|
||||
args.cuda_malloc = cuda_malloc_supported()
|
||||
@ -97,3 +102,6 @@ if args.cuda_malloc and not args.disable_cuda_malloc:
|
||||
env_var += ",backend:cudaMallocAsync"
|
||||
|
||||
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = env_var
|
||||
|
||||
def get_torch_version_noimport():
|
||||
return str(version)
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from .main_pre import tracer
|
||||
|
||||
from typing_extensions import NotRequired, TypedDict, NamedTuple
|
||||
import asyncio
|
||||
import copy
|
||||
import heapq
|
||||
@ -21,14 +21,24 @@ from typing import List, Optional, Tuple, Literal
|
||||
import torch
|
||||
from opentelemetry.trace import get_current_span, StatusCode, Status
|
||||
|
||||
from comfy_api.internal import _ComfyNodeInternal, _NodeOutputInternal, first_real_override, is_class, \
|
||||
make_locked_method_func
|
||||
from comfy_api.latest import io
|
||||
from comfy_api.internal import _ComfyNodeInternal, _NodeOutputInternal, first_real_override, is_class, make_locked_method_func
|
||||
from comfy_api.latest import io, _io
|
||||
from comfy_compatibility.vanilla import vanilla_environment_node_execution_hooks
|
||||
from comfy_execution.caching import HierarchicalCache, LRUCache, CacheKeySetInputSignature, CacheKeySetID, \
|
||||
DependencyAwareCache, \
|
||||
BasicCache
|
||||
from comfy_execution.graph import get_input_info, ExecutionList, DynamicPrompt, ExecutionBlocker
|
||||
from comfy_execution.caching import (
|
||||
BasicCache,
|
||||
CacheKeySetID,
|
||||
CacheKeySetInputSignature,
|
||||
NullCache,
|
||||
HierarchicalCache,
|
||||
LRUCache,
|
||||
RAMPressureCache,
|
||||
)
|
||||
from comfy_execution.graph import (
|
||||
DynamicPrompt,
|
||||
ExecutionBlocker,
|
||||
ExecutionList,
|
||||
get_input_info,
|
||||
)
|
||||
from comfy_execution.graph_types import FrozenTopologicalSort
|
||||
from comfy_execution.graph_utils import is_link, GraphBuilder
|
||||
from comfy_execution.progress import get_progress_state, reset_progress_state, add_progress_handler, \
|
||||
@ -41,12 +51,11 @@ from .. import model_management
|
||||
from ..component_model.abstract_prompt_queue import AbstractPromptQueue
|
||||
from ..component_model.executor_types import ExecutorToClientProgress, ValidationTuple, ValidateInputsTuple, \
|
||||
ValidationErrorDict, NodeErrorsDictValue, ValidationErrorExtraInfoDict, FormattedValue, RecursiveExecutionTuple, \
|
||||
RecursiveExecutionErrorDetails, RecursiveExecutionErrorDetailsInterrupted, ExecutionResult, DuplicateNodeError, \
|
||||
HistoryResultDict, ExecutionErrorMessage, ExecutionInterruptedMessage, ComboOptions
|
||||
RecursiveExecutionErrorDetails, RecursiveExecutionErrorDetailsInterrupted, ExecutionResult, HistoryResultDict, ExecutionErrorMessage, ExecutionInterruptedMessage, ComboOptions
|
||||
from ..component_model.files import canonicalize_path
|
||||
from ..component_model.module_property import create_module_properties
|
||||
from ..component_model.queue_types import QueueTuple, HistoryEntry, QueueItem, MAXIMUM_HISTORY_SIZE, ExecutionStatus, \
|
||||
ExecutionStatusAsDict
|
||||
ExecutionStatusAsDict, AbstractPromptQueueGetCurrentQueueItems
|
||||
from ..execution_context import context_execute_node, context_execute_prompt
|
||||
from ..execution_context import current_execution_context, context_set_execution_list_and_inputs
|
||||
from ..execution_ext import should_panic_on_exception
|
||||
@ -94,7 +103,7 @@ class IsChangedCache:
|
||||
return self.is_changed[node_id]
|
||||
|
||||
# Intentionally do not use cached outputs here. We only want constants in IS_CHANGED
|
||||
input_data_all, _, hidden_inputs = get_input_data(node["inputs"], class_def, node_id, None)
|
||||
input_data_all, _, v3_data = get_input_data(node["inputs"], class_def, node_id, None)
|
||||
try:
|
||||
is_changed = await _async_map_node_over_list(self.prompt_id, node_id, class_def, input_data_all, is_changed_name)
|
||||
is_changed = await resolve_map_node_over_list_results(is_changed)
|
||||
@ -106,61 +115,70 @@ class IsChangedCache:
|
||||
return self.is_changed[node_id]
|
||||
|
||||
|
||||
class CacheEntry(NamedTuple):
|
||||
ui: dict
|
||||
outputs: list
|
||||
|
||||
|
||||
class CacheType(Enum):
|
||||
CLASSIC = 0
|
||||
LRU = 1
|
||||
DEPENDENCY_AWARE = 2
|
||||
NONE = 2
|
||||
RAM_PRESSURE = 3
|
||||
|
||||
|
||||
class CacheSet:
|
||||
def __init__(self, cache_type=None, cache_size=None):
|
||||
if cache_type == CacheType.DEPENDENCY_AWARE:
|
||||
self.init_dependency_aware_cache()
|
||||
def __init__(self, cache_type=None, cache_args: Optional[CacheArgs] = None):
|
||||
if cache_args is None:
|
||||
cache_args = {}
|
||||
if cache_type == CacheType.NONE:
|
||||
self.init_null_cache()
|
||||
logger.info("Disabling intermediate node cache.")
|
||||
elif cache_type == CacheType.RAM_PRESSURE:
|
||||
cache_ram = cache_args.get("ram", 16.0)
|
||||
self.init_ram_cache(cache_ram)
|
||||
logger.info("Using RAM pressure cache.")
|
||||
elif cache_type == CacheType.LRU:
|
||||
if cache_size is None:
|
||||
cache_size = 0
|
||||
cache_size = cache_args.get("lru", 0)
|
||||
self.init_lru_cache(cache_size)
|
||||
logger.info("Using LRU cache")
|
||||
else:
|
||||
self.init_classic_cache()
|
||||
|
||||
self.all = [self.outputs, self.ui, self.objects]
|
||||
self.all = [self.outputs, self.objects]
|
||||
|
||||
# Performs like the old cache -- dump data ASAP
|
||||
def init_classic_cache(self):
|
||||
self.outputs = HierarchicalCache(CacheKeySetInputSignature)
|
||||
self.ui = HierarchicalCache(CacheKeySetInputSignature)
|
||||
self.objects = HierarchicalCache(CacheKeySetID)
|
||||
|
||||
def init_lru_cache(self, cache_size):
|
||||
self.outputs = LRUCache(CacheKeySetInputSignature, max_size=cache_size)
|
||||
self.ui = LRUCache(CacheKeySetInputSignature, max_size=cache_size)
|
||||
self.objects = HierarchicalCache(CacheKeySetID)
|
||||
|
||||
# only hold cached items while the decendents have not executed
|
||||
def init_dependency_aware_cache(self):
|
||||
self.outputs = DependencyAwareCache(CacheKeySetInputSignature)
|
||||
self.ui = DependencyAwareCache(CacheKeySetInputSignature)
|
||||
self.objects = DependencyAwareCache(CacheKeySetID)
|
||||
def init_ram_cache(self, min_headroom):
|
||||
self.outputs = RAMPressureCache(CacheKeySetInputSignature)
|
||||
self.objects = HierarchicalCache(CacheKeySetID)
|
||||
|
||||
def init_null_cache(self):
|
||||
self.outputs = NullCache()
|
||||
self.objects = NullCache()
|
||||
|
||||
def recursive_debug_dump(self):
|
||||
result = {
|
||||
"outputs": self.outputs.recursive_debug_dump(),
|
||||
"ui": self.ui.recursive_debug_dump(),
|
||||
}
|
||||
return result
|
||||
|
||||
|
||||
SENSITIVE_EXTRA_DATA_KEYS = ("auth_token_comfy_org", "api_key_comfy_org")
|
||||
|
||||
|
||||
def get_input_data(inputs, class_def, unique_id, outputs=None, dynprompt=None, extra_data=None):
|
||||
def get_input_data(inputs, class_def, unique_id, execution_list=None, dynprompt=None, extra_data=None):
|
||||
if extra_data is None:
|
||||
extra_data = {}
|
||||
is_v3 = issubclass(class_def, _ComfyNodeInternal)
|
||||
v3_data: io.V3Data = {}
|
||||
schema = None
|
||||
if is_v3:
|
||||
valid_inputs, schema = class_def.INPUT_TYPES(include_hidden=False, return_schema=True)
|
||||
valid_inputs, schema, v3_data = class_def.INPUT_TYPES(include_hidden=False, return_schema=True, live_inputs=inputs)
|
||||
else:
|
||||
valid_inputs = class_def.INPUT_TYPES()
|
||||
input_data_all = {}
|
||||
@ -177,17 +195,17 @@ def get_input_data(inputs, class_def, unique_id, outputs=None, dynprompt=None, e
|
||||
if is_link(input_data) and (not input_info or not input_info.get("rawLink", False)):
|
||||
input_unique_id = input_data[0]
|
||||
output_index = input_data[1]
|
||||
if outputs is None:
|
||||
if execution_list is None:
|
||||
mark_missing()
|
||||
continue # This might be a lazily-evaluated input
|
||||
cached_output = outputs.get(input_unique_id)
|
||||
if cached_output is None:
|
||||
cached = execution_list.get_cache(input_unique_id, unique_id)
|
||||
if cached is None or cached.outputs is None:
|
||||
mark_missing()
|
||||
continue
|
||||
if output_index >= len(cached_output):
|
||||
if output_index >= len(cached.outputs):
|
||||
mark_missing()
|
||||
continue
|
||||
obj = cached_output[output_index]
|
||||
obj = cached.outputs[output_index]
|
||||
input_data_all[x] = obj
|
||||
elif input_category is not None:
|
||||
input_data_all[x] = [input_data]
|
||||
@ -223,7 +241,8 @@ def get_input_data(inputs, class_def, unique_id, outputs=None, dynprompt=None, e
|
||||
input_data_all[x] = [extra_data.get("auth_token_comfy_org", None)]
|
||||
if h[x] == "API_KEY_COMFY_ORG":
|
||||
input_data_all[x] = [extra_data.get("api_key_comfy_org", None)]
|
||||
return input_data_all, missing_keys, hidden_inputs_v3
|
||||
v3_data["hidden_inputs"] = hidden_inputs_v3
|
||||
return input_data_all, missing_keys, v3_data
|
||||
|
||||
|
||||
def map_node_over_list(obj, input_data_all: typing.Dict[str, typing.Any], func: str, allow_interrupt=False, execution_block_cb=None, pre_execute_cb=None):
|
||||
@ -244,12 +263,12 @@ async def resolve_map_node_over_list_results(results):
|
||||
|
||||
|
||||
@tracer.start_as_current_span("Execute Node")
|
||||
async def _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, func, allow_interrupt=False, execution_block_cb=None, pre_execute_cb=None, hidden_inputs=None, execution_list=None, executed=None):
|
||||
async def _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, func, allow_interrupt=False, execution_block_cb=None, pre_execute_cb=None, v3_data=None, execution_list=None, executed=None):
|
||||
with context_set_execution_list_and_inputs(FrozenTopologicalSort.from_topological_sort(execution_list) if execution_list is not None else None, frozenset(executed) if executed is not None else None):
|
||||
return await __async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, func, allow_interrupt, execution_block_cb, pre_execute_cb, hidden_inputs)
|
||||
return await __async_map_node_over_list(prompt_id=prompt_id, unique_id=unique_id, obj=obj, input_data_all=input_data_all, func=func, allow_interrupt=allow_interrupt, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb, v3_data=v3_data)
|
||||
|
||||
|
||||
async def __async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, func, allow_interrupt=False, execution_block_cb=None, pre_execute_cb=None, hidden_inputs=None):
|
||||
async def __async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, func, allow_interrupt=False, execution_block_cb=None, pre_execute_cb=None, v3_data=None):
|
||||
span = get_current_span()
|
||||
class_type = obj.__class__.__name__
|
||||
span.set_attribute("class_type", class_type)
|
||||
@ -312,13 +331,16 @@ async def __async_map_node_over_list(prompt_id, unique_id, obj, input_data_all,
|
||||
if is_class(obj):
|
||||
type_obj = obj
|
||||
obj.VALIDATE_CLASS()
|
||||
class_clone = obj.PREPARE_CLASS_CLONE(hidden_inputs)
|
||||
class_clone = obj.PREPARE_CLASS_CLONE(v3_data)
|
||||
# otherwise, use class instance to populate/reuse some fields
|
||||
else:
|
||||
type_obj = type(obj)
|
||||
type_obj.VALIDATE_CLASS()
|
||||
class_clone = type_obj.PREPARE_CLASS_CLONE(hidden_inputs)
|
||||
class_clone = type_obj.PREPARE_CLASS_CLONE(v3_data)
|
||||
f = make_locked_method_func(type_obj, func, class_clone)
|
||||
# in case of dynamic inputs, restructure inputs to expected nested dict
|
||||
if v3_data is not None:
|
||||
inputs = _io.build_nested_inputs(inputs, v3_data)
|
||||
# V1
|
||||
else:
|
||||
f = getattr(obj, func)
|
||||
@ -376,8 +398,8 @@ def merge_result_data(results, obj):
|
||||
return output
|
||||
|
||||
|
||||
async def get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=None, pre_execute_cb=None, hidden_inputs=None, inputs=None, execution_list=None, executed=None):
|
||||
return_values = await _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, obj.FUNCTION, allow_interrupt=True, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb, hidden_inputs=hidden_inputs, execution_list=execution_list, executed=executed)
|
||||
async def get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=None, pre_execute_cb=None, v3_data=None, execution_list=None, executed=None):
|
||||
return_values = await _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, obj.FUNCTION, allow_interrupt=True, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb, v3_data=v3_data, execution_list=execution_list, executed=executed)
|
||||
has_pending_task = any(isinstance(r, asyncio.Task) and not r.done() for r in return_values)
|
||||
if has_pending_task:
|
||||
return return_values, {}, False, has_pending_task
|
||||
@ -462,7 +484,7 @@ def format_value(x) -> FormattedValue:
|
||||
return str(x.__class__)
|
||||
|
||||
|
||||
async def execute(server: ExecutorToClientProgress, dynprompt: DynamicPrompt, caches, node_id: str, extra_data: dict, executed, prompt_id, execution_list, pending_subgraph_results, pending_async_nodes) -> RecursiveExecutionTuple:
|
||||
async def execute(server: ExecutorToClientProgress, dynprompt: DynamicPrompt, caches, node_id: str, extra_data: dict, executed, prompt_id, execution_list, pending_subgraph_results, pending_async_nodes, ui_outputs) -> RecursiveExecutionTuple:
|
||||
"""
|
||||
Executes a prompt
|
||||
:param server:
|
||||
@ -481,10 +503,10 @@ async def execute(server: ExecutorToClientProgress, dynprompt: DynamicPrompt, ca
|
||||
vanilla_environment_node_execution_hooks(),
|
||||
use_requests_caching(),
|
||||
):
|
||||
return await _execute(server, dynprompt, caches, node_id, extra_data, executed, prompt_id, execution_list, pending_subgraph_results, pending_async_nodes)
|
||||
return await _execute(server, dynprompt, caches, node_id, extra_data, executed, prompt_id, execution_list, pending_subgraph_results, pending_async_nodes, ui_outputs)
|
||||
|
||||
|
||||
async def _execute(server, dynprompt: DynamicPrompt, caches: CacheSet, current_item: str, extra_data, executed, prompt_id, execution_list: ExecutionList, pending_subgraph_results, pending_async_nodes) -> RecursiveExecutionTuple:
|
||||
async def _execute(server, dynprompt: DynamicPrompt, caches: CacheSet, current_item: str, extra_data, executed, prompt_id, execution_list: ExecutionList, pending_subgraph_results, pending_async_nodes, ui_outputs) -> RecursiveExecutionTuple:
|
||||
unique_id = current_item
|
||||
real_node_id = dynprompt.get_real_node_id(unique_id)
|
||||
display_node_id = dynprompt.get_display_node_id(unique_id)
|
||||
@ -492,11 +514,15 @@ async def _execute(server, dynprompt: DynamicPrompt, caches: CacheSet, current_i
|
||||
inputs = dynprompt.get_node(unique_id)['inputs']
|
||||
class_type = dynprompt.get_node(unique_id)['class_type']
|
||||
class_def = get_nodes().NODE_CLASS_MAPPINGS[class_type]
|
||||
if caches.outputs.get(unique_id) is not None:
|
||||
cached = caches.outputs.get(unique_id)
|
||||
if cached is not None:
|
||||
if server.client_id is not None:
|
||||
cached_output = caches.ui.get(unique_id) or {}
|
||||
server.send_sync("executed", {"node": unique_id, "display_node": display_node_id, "output": cached_output.get("output", None), "prompt_id": prompt_id}, server.client_id)
|
||||
cached_ui = cached.ui or {}
|
||||
server.send_sync("executed", {"node": unique_id, "display_node": display_node_id, "output": cached_ui.get("output", None), "prompt_id": prompt_id}, server.client_id)
|
||||
if cached.ui is not None:
|
||||
ui_outputs[unique_id] = cached.ui
|
||||
get_progress_state().finish_progress(unique_id)
|
||||
execution_list.cache_update(unique_id, cached)
|
||||
return RecursiveExecutionTuple(ExecutionResult.SUCCESS, None, None)
|
||||
|
||||
input_data_all = None
|
||||
@ -526,8 +552,8 @@ async def _execute(server, dynprompt: DynamicPrompt, caches: CacheSet, current_i
|
||||
for r in result:
|
||||
if is_link(r):
|
||||
source_node, source_output = r[0], r[1]
|
||||
node_output = caches.outputs.get(source_node)[source_output]
|
||||
for o in node_output:
|
||||
node_cached = execution_list.get_cache(source_node, unique_id)
|
||||
for o in node_cached.outputs[source_output]:
|
||||
resolved_output.append(o)
|
||||
|
||||
else:
|
||||
@ -535,10 +561,11 @@ async def _execute(server, dynprompt: DynamicPrompt, caches: CacheSet, current_i
|
||||
resolved_outputs.append(tuple(resolved_output))
|
||||
output_data = merge_result_data(resolved_outputs, class_def)
|
||||
output_ui = []
|
||||
del pending_subgraph_results[unique_id]
|
||||
has_subgraph = False
|
||||
else:
|
||||
get_progress_state().start_progress(unique_id)
|
||||
input_data_all, missing_keys, hidden_inputs = get_input_data(inputs, class_def, unique_id, caches.outputs, dynprompt, extra_data)
|
||||
input_data_all, missing_keys, v3_data = get_input_data(inputs, class_def, unique_id, execution_list, dynprompt, extra_data)
|
||||
if server.client_id is not None:
|
||||
server.last_node_id = display_node_id
|
||||
server.send_sync("executing", {"node": unique_id, "display_node": display_node_id, "prompt_id": prompt_id}, server.client_id)
|
||||
@ -553,7 +580,7 @@ async def _execute(server, dynprompt: DynamicPrompt, caches: CacheSet, current_i
|
||||
else:
|
||||
lazy_status_present = getattr(obj, "check_lazy_status", None) is not None
|
||||
if lazy_status_present:
|
||||
required_inputs = await _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, "check_lazy_status", allow_interrupt=True, hidden_inputs=hidden_inputs, execution_list=execution_list, executed=executed)
|
||||
required_inputs = await _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, "check_lazy_status", allow_interrupt=True, execution_list=execution_list, executed=executed, v3_data=v3_data)
|
||||
required_inputs = await resolve_map_node_over_list_results(required_inputs)
|
||||
required_inputs = set(sum([r for r in required_inputs if isinstance(r, list)], []))
|
||||
required_inputs = [x for x in required_inputs if isinstance(x, str) and (
|
||||
@ -587,7 +614,7 @@ async def _execute(server, dynprompt: DynamicPrompt, caches: CacheSet, current_i
|
||||
# TODO - How to handle this with async functions without contextvars (which requires Python 3.12)?
|
||||
GraphBuilder.set_default_prefix(unique_id, call_index, 0)
|
||||
|
||||
output_data, output_ui, has_subgraph, has_pending_tasks = await get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb, hidden_inputs=hidden_inputs, inputs=inputs, execution_list=execution_list, executed=executed)
|
||||
output_data, output_ui, has_subgraph, has_pending_tasks = await get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb, execution_list=execution_list, executed=executed, v3_data=v3_data)
|
||||
if has_pending_tasks:
|
||||
pending_async_nodes[unique_id] = output_data
|
||||
unblock = execution_list.add_external_block(unique_id)
|
||||
@ -600,7 +627,7 @@ async def _execute(server, dynprompt: DynamicPrompt, caches: CacheSet, current_i
|
||||
asyncio.create_task(await_completion())
|
||||
return RecursiveExecutionTuple(ExecutionResult.PENDING, None, None)
|
||||
if len(output_ui) > 0:
|
||||
caches.ui.set(unique_id, {
|
||||
ui_outputs[unique_id] = {
|
||||
"meta": {
|
||||
"node_id": unique_id,
|
||||
"display_node": display_node_id,
|
||||
@ -608,7 +635,7 @@ async def _execute(server, dynprompt: DynamicPrompt, caches: CacheSet, current_i
|
||||
"real_node_id": real_node_id,
|
||||
},
|
||||
"output": output_ui
|
||||
})
|
||||
}
|
||||
if server.client_id is not None:
|
||||
server.send_sync("executed", {"node": unique_id, "display_node": display_node_id, "output": output_ui, "prompt_id": prompt_id},
|
||||
server.client_id)
|
||||
@ -622,10 +649,6 @@ async def _execute(server, dynprompt: DynamicPrompt, caches: CacheSet, current_i
|
||||
if new_graph is None:
|
||||
cached_outputs.append((False, node_outputs))
|
||||
else:
|
||||
# Check for conflicts
|
||||
for node_id in new_graph.keys():
|
||||
if dynprompt.has_node(node_id):
|
||||
raise DuplicateNodeError(f"Attempt to add duplicate node {node_id}. Ensure node ids are unique and deterministic or use graph_utils.GraphBuilder.")
|
||||
for node_id, node_info in new_graph.items():
|
||||
new_node_ids.append(node_id)
|
||||
display_id = node_info.get("override_display_id", unique_id)
|
||||
@ -646,11 +669,16 @@ async def _execute(server, dynprompt: DynamicPrompt, caches: CacheSet, current_i
|
||||
subcache.clean_unused()
|
||||
for node_id in new_output_ids:
|
||||
execution_list.add_node(node_id)
|
||||
execution_list.cache_link(node_id, unique_id)
|
||||
for link in new_output_links:
|
||||
execution_list.add_strong_link(link[0], link[1], unique_id)
|
||||
pending_subgraph_results[unique_id] = cached_outputs
|
||||
return RecursiveExecutionTuple(ExecutionResult.PENDING, None, None)
|
||||
caches.outputs.set(unique_id, output_data)
|
||||
|
||||
cache_entry = CacheEntry(ui=ui_outputs.get(unique_id), outputs=output_data)
|
||||
execution_list.cache_update(unique_id, cache_entry)
|
||||
caches.outputs.set(unique_id, cache_entry)
|
||||
|
||||
except interruption.InterruptProcessingException as iex:
|
||||
logger.info("Processing interrupted")
|
||||
|
||||
@ -702,10 +730,17 @@ async def _execute(server, dynprompt: DynamicPrompt, caches: CacheSet, current_i
|
||||
return RecursiveExecutionTuple(ExecutionResult.SUCCESS, None, None)
|
||||
|
||||
|
||||
class CacheArgs(TypedDict):
|
||||
ram: NotRequired[int]
|
||||
lru: NotRequired[float]
|
||||
|
||||
|
||||
class PromptExecutor:
|
||||
def __init__(self, server: ExecutorToClientProgress, cache_type: CacheType | Literal[False] = False, cache_size: int | None = None):
|
||||
def __init__(self, server: ExecutorToClientProgress, cache_type: CacheType | Literal[False] = False, cache_args: Optional[CacheArgs] = None):
|
||||
self.status_messages = []
|
||||
self.caches: Optional[CacheSet] = None
|
||||
self.success = None
|
||||
self.cache_size = cache_size
|
||||
self.cache_args = cache_args or {}
|
||||
self.cache_type = cache_type
|
||||
self.server = server
|
||||
self.raise_exceptions = False
|
||||
@ -714,7 +749,7 @@ class PromptExecutor:
|
||||
|
||||
def reset(self):
|
||||
self.success = True
|
||||
self.caches = CacheSet(cache_type=self.cache_type, cache_size=self.cache_size)
|
||||
self.caches = CacheSet(cache_type=self.cache_type, cache_args=self.cache_args)
|
||||
self.status_messages = []
|
||||
|
||||
def add_message(self, event, data: dict, broadcast: bool):
|
||||
@ -819,6 +854,7 @@ class PromptExecutor:
|
||||
broadcast=False)
|
||||
pending_subgraph_results = {}
|
||||
pending_async_nodes = {} # TODO - Unify this with pending_subgraph_results
|
||||
ui_node_outputs = {}
|
||||
executed = set()
|
||||
execution_list = ExecutionList(dynamic_prompt, self.caches.outputs)
|
||||
current_outputs = self.caches.outputs.all_node_ids()
|
||||
@ -833,7 +869,8 @@ class PromptExecutor:
|
||||
break
|
||||
|
||||
assert node_id is not None, "Node ID should not be None at this point"
|
||||
result, error, ex = await execute(self.server, dynamic_prompt, self.caches, node_id, extra_data, executed, prompt_id, execution_list, pending_subgraph_results, pending_async_nodes)
|
||||
|
||||
result, error, ex = await execute(self.server, dynamic_prompt, self.caches, node_id, extra_data, executed, prompt_id, execution_list, pending_subgraph_results, pending_async_nodes, ui_node_outputs)
|
||||
self.success = result != ExecutionResult.FAILURE
|
||||
if result == ExecutionResult.FAILURE:
|
||||
self.handle_execution_error(prompt_id, dynamic_prompt.original_prompt, current_outputs, executed, error, ex)
|
||||
@ -842,18 +879,16 @@ class PromptExecutor:
|
||||
execution_list.unstage_node_execution()
|
||||
else: # result == ExecutionResult.SUCCESS:
|
||||
execution_list.complete_node_execution()
|
||||
self.caches.outputs.poll(ram_headroom=self.cache_args.get("ram", 0))
|
||||
else:
|
||||
# Only execute when the while-loop ends without break
|
||||
self.add_message("execution_success", {"prompt_id": prompt_id}, broadcast=False)
|
||||
|
||||
ui_outputs = {}
|
||||
meta_outputs = {}
|
||||
all_node_ids = self.caches.ui.all_node_ids()
|
||||
for node_id in all_node_ids:
|
||||
ui_info = self.caches.ui.get(node_id)
|
||||
if ui_info is not None:
|
||||
ui_outputs[node_id] = ui_info["output"]
|
||||
meta_outputs[node_id] = ui_info["meta"]
|
||||
for node_id, ui_info in ui_node_outputs.items():
|
||||
ui_outputs[node_id] = ui_info["output"]
|
||||
meta_outputs[node_id] = ui_info["meta"]
|
||||
self.history_result = {
|
||||
"outputs": ui_outputs,
|
||||
"meta": meta_outputs,
|
||||
@ -883,9 +918,6 @@ async def validate_inputs(prompt_id: typing.Any, prompt, item, validated: typing
|
||||
class_type = prompt[unique_id]['class_type']
|
||||
obj_class = get_nodes().NODE_CLASS_MAPPINGS[class_type]
|
||||
|
||||
class_inputs = obj_class.INPUT_TYPES()
|
||||
valid_inputs = set(class_inputs.get('required', {})).union(set(class_inputs.get('optional', {})))
|
||||
|
||||
error: ValidationErrorDict
|
||||
errors = []
|
||||
valid = True
|
||||
@ -893,9 +925,11 @@ async def validate_inputs(prompt_id: typing.Any, prompt, item, validated: typing
|
||||
validate_function_inputs = []
|
||||
validate_has_kwargs = False
|
||||
if issubclass(obj_class, _ComfyNodeInternal):
|
||||
class_inputs, _, _ = obj_class.INPUT_TYPES(include_hidden=False, return_schema=True, live_inputs=inputs)
|
||||
validate_function_name = "validate_inputs"
|
||||
validate_function = first_real_override(obj_class, validate_function_name)
|
||||
else:
|
||||
class_inputs = obj_class.INPUT_TYPES()
|
||||
validate_function_name = "VALIDATE_INPUTS"
|
||||
validate_function = getattr(obj_class, validate_function_name, None)
|
||||
if validate_function is not None:
|
||||
@ -904,6 +938,8 @@ async def validate_inputs(prompt_id: typing.Any, prompt, item, validated: typing
|
||||
validate_has_kwargs = argspec.varkw is not None
|
||||
received_types = {}
|
||||
|
||||
valid_inputs = set(class_inputs.get('required', {})).union(set(class_inputs.get('optional', {})))
|
||||
|
||||
for x in valid_inputs:
|
||||
input_type, input_category, extra_info = get_input_info(obj_class, x, class_inputs)
|
||||
assert extra_info is not None
|
||||
@ -1085,7 +1121,7 @@ async def validate_inputs(prompt_id: typing.Any, prompt, item, validated: typing
|
||||
continue
|
||||
|
||||
if len(validate_function_inputs) > 0 or validate_has_kwargs:
|
||||
input_data_all, _, hidden_inputs = get_input_data(inputs, obj_class, unique_id)
|
||||
input_data_all, _, v3_data = get_input_data(inputs, obj_class, unique_id)
|
||||
input_filtered = {}
|
||||
for x in input_data_all:
|
||||
if x in validate_function_inputs or validate_has_kwargs:
|
||||
@ -1093,7 +1129,7 @@ async def validate_inputs(prompt_id: typing.Any, prompt, item, validated: typing
|
||||
if 'input_types' in validate_function_inputs:
|
||||
input_filtered['input_types'] = [received_types]
|
||||
|
||||
ret = await _async_map_node_over_list(prompt_id, unique_id, obj_class, input_filtered, validate_function_name, hidden_inputs=hidden_inputs)
|
||||
ret = await _async_map_node_over_list(prompt_id, unique_id, obj_class, input_filtered, validate_function_name, v3_data=v3_data)
|
||||
ret = await resolve_map_node_over_list_results(ret)
|
||||
for x in input_filtered:
|
||||
for i, r in enumerate(ret):
|
||||
@ -1320,8 +1356,7 @@ class PromptQueue(AbstractPromptQueue):
|
||||
self.server.queue_updated()
|
||||
return copy.deepcopy(item_with_future.queue_tuple), task_id
|
||||
|
||||
def task_done(self, item_id: str, outputs: HistoryResultDict,
|
||||
status: Optional[ExecutionStatus], error_details: Optional[ExecutionErrorMessage] = None):
|
||||
def task_done(self, item_id: str, outputs: HistoryResultDict, status: Optional[ExecutionStatus], error_details: typing.Optional[ExecutionErrorMessage] = None, process_item: typing.Optional[typing.Callable[[QueueTuple], QueueItem]] = None):
|
||||
history_result = outputs
|
||||
with self.mutex:
|
||||
queue_item = self.currently_running.pop(item_id)
|
||||
@ -1331,16 +1366,14 @@ class PromptQueue(AbstractPromptQueue):
|
||||
|
||||
status_dict = None
|
||||
if status is not None:
|
||||
status_dict: Optional[ExecutionStatusAsDict] = status.as_dict(error_details=error_details)
|
||||
status_dict: Optional[ExecutionStatusAsDict] = status.as_dict(error_details=None)
|
||||
|
||||
outputs_ = history_result["outputs"]
|
||||
# Remove sensitive data from extra_data before storing in history
|
||||
for sensitive_val in SENSITIVE_EXTRA_DATA_KEYS:
|
||||
if sensitive_val in prompt[3]:
|
||||
prompt[3].pop(sensitive_val)
|
||||
if process_item is not None:
|
||||
prompt = process_item(prompt)
|
||||
|
||||
history_entry: HistoryEntry = {
|
||||
"prompt": prompt,
|
||||
"prompt": prompt.queue_tuple if isinstance(prompt, QueueItem) else prompt,
|
||||
"outputs": copy.deepcopy(outputs_),
|
||||
}
|
||||
if status_dict is not None:
|
||||
@ -1352,19 +1385,19 @@ class PromptQueue(AbstractPromptQueue):
|
||||
queue_item.completed.set_result(outputs_)
|
||||
|
||||
# Note: slow
|
||||
def get_current_queue(self) -> Tuple[typing.List[QueueTuple], typing.List[QueueTuple]]:
|
||||
def get_current_queue(self) -> AbstractPromptQueueGetCurrentQueueItems:
|
||||
with self.mutex:
|
||||
out: typing.List[QueueTuple] = []
|
||||
out: typing.List[QueueItem] = []
|
||||
for x in self.currently_running.values():
|
||||
out += [x.queue_tuple]
|
||||
return out, copy.deepcopy([item.queue_tuple for item in self.queue])
|
||||
out += [x]
|
||||
return out, copy.deepcopy(self.queue)
|
||||
|
||||
# read-safe as long as queue items are immutable
|
||||
def get_current_queue_volatile(self):
|
||||
def get_current_queue_volatile(self) -> AbstractPromptQueueGetCurrentQueueItems:
|
||||
with self.mutex:
|
||||
running = [x for x in self.currently_running.values()]
|
||||
queued = copy.copy(self.queue)
|
||||
return (running, queued)
|
||||
return running, queued
|
||||
|
||||
def get_tasks_remaining(self):
|
||||
with self.mutex:
|
||||
|
||||
@ -24,6 +24,12 @@ _module_properties = create_module_properties()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# todo: investigate what this is actually trying to do
|
||||
# System User Protection - Protects system directories from HTTP endpoint access
|
||||
# System Users are internal-only users that cannot be accessed via HTTP endpoints.
|
||||
# They use the '__' prefix convention (similar to Python's private member convention).
|
||||
SYSTEM_USER_PREFIX = "__"
|
||||
|
||||
|
||||
@_module_properties.getter
|
||||
def _supported_pt_extensions() -> set[str]:
|
||||
@ -58,6 +64,65 @@ def _resolve_path_with_compatibility(path: Path | str) -> PurePosixPath | Path:
|
||||
return Path(path).resolve()
|
||||
|
||||
|
||||
def get_system_user_directory(name: str = "system") -> str:
|
||||
"""
|
||||
Get the path to a System User directory.
|
||||
|
||||
System User directories (prefixed with '__') are only accessible via internal API,
|
||||
not through HTTP endpoints. Use this for storing system-internal data that
|
||||
should not be exposed to users.
|
||||
|
||||
Args:
|
||||
name: System user name (e.g., "system", "cache"). Must be alphanumeric
|
||||
with underscores allowed, but cannot start with underscore.
|
||||
|
||||
Returns:
|
||||
Absolute path to the system user directory.
|
||||
|
||||
Raises:
|
||||
ValueError: If name is empty, invalid, or starts with underscore.
|
||||
|
||||
Example:
|
||||
>>> get_system_user_directory("cache")
|
||||
'/path/to/user/__cache'
|
||||
"""
|
||||
if not name or not isinstance(name, str):
|
||||
raise ValueError("System user name cannot be empty")
|
||||
if not name.replace("_", "").isalnum():
|
||||
raise ValueError(f"Invalid system user name: '{name}'")
|
||||
if name.startswith("_"):
|
||||
raise ValueError("System user name should not start with underscore")
|
||||
return os.path.join(get_user_directory(), f"{SYSTEM_USER_PREFIX}{name}")
|
||||
|
||||
|
||||
def get_public_user_directory(user_id: str) -> str | None:
|
||||
"""
|
||||
Get the path to a Public User directory for HTTP endpoint access.
|
||||
|
||||
This function provides structural security by returning None for any
|
||||
System User (prefixed with '__'). All HTTP endpoints should use this
|
||||
function instead of directly constructing user paths.
|
||||
|
||||
Args:
|
||||
user_id: User identifier from HTTP request.
|
||||
|
||||
Returns:
|
||||
Absolute path to the user directory, or None if user_id is invalid
|
||||
or refers to a System User.
|
||||
|
||||
Example:
|
||||
>>> get_public_user_directory("default")
|
||||
'/path/to/user/default'
|
||||
>>> get_public_user_directory("__system")
|
||||
None
|
||||
"""
|
||||
if not user_id or not isinstance(user_id, str):
|
||||
return None
|
||||
if user_id.startswith(SYSTEM_USER_PREFIX):
|
||||
return None
|
||||
return os.path.join(get_user_directory(), user_id)
|
||||
|
||||
|
||||
def init_default_paths(folder_names_and_paths: FolderNames, configuration: Optional[Configuration] = None, create_all_directories=False, replace_existing=True, base_paths_from_configuration=True):
|
||||
"""
|
||||
Populates the folder names and paths object with the default, upstream model directories and custom_nodes directory.
|
||||
@ -111,6 +176,7 @@ def init_default_paths(folder_names_and_paths: FolderNames, configuration: Optio
|
||||
ModelPaths(["huggingface"], supported_extensions=set()),
|
||||
ModelPaths(["model_patches"], supported_extensions=set(supported_pt_extensions)),
|
||||
ModelPaths(["audio_encoders"], supported_extensions=set(supported_pt_extensions)),
|
||||
ModelPaths(["latent_upscale_models"], supported_extensions=set(supported_pt_extensions)),
|
||||
hf_cache_paths,
|
||||
hf_xet,
|
||||
]
|
||||
@ -527,4 +593,8 @@ __all__ = [
|
||||
"invalidate_cache",
|
||||
"filter_files_content_types",
|
||||
"get_input_subfolders",
|
||||
"get_system_user_directory",
|
||||
"get_public_user_directory",
|
||||
# todo: why? what is the purpose?
|
||||
"SYSTEM_USER_PREFIX",
|
||||
]
|
||||
|
||||
@ -15,7 +15,8 @@ output_directory: str
|
||||
temp_directory: str
|
||||
input_directory: str
|
||||
supported_pt_extensions: set[str]
|
||||
|
||||
extension_mimetypes_cache: dict[str, str]
|
||||
SYSTEM_USER_PREFIX: str
|
||||
|
||||
# Functions
|
||||
def init_default_paths(folder_names_and_paths: FolderNames, configuration: Optional[Configuration] = None, create_all_directories: bool = ..., replace_existing: bool = ..., base_paths_from_configuration: bool = ...): ...
|
||||
@ -39,7 +40,7 @@ def get_output_directory() -> str: ...
|
||||
def get_temp_directory() -> str: ...
|
||||
|
||||
|
||||
def get_input_directory() -> str: ...
|
||||
def get_input_directory(mkdirs: bool = ...) -> str: ...
|
||||
|
||||
|
||||
def get_user_directory() -> str: ...
|
||||
@ -108,3 +109,9 @@ def filter_files_content_types(files: List[str], content_types: List[Literal["im
|
||||
|
||||
|
||||
def get_input_subfolders() -> list[str]: ...
|
||||
|
||||
|
||||
def get_system_user_directory(name: str = ...) -> str: ...
|
||||
|
||||
|
||||
def get_public_user_directory(user_id: str) -> Optional[str]: ...
|
||||
|
||||
@ -15,14 +15,24 @@ from ..component_model.executor_types import UnencodedPreviewImageMessage
|
||||
from ..execution_context import current_execution_context
|
||||
from ..model_downloader import get_or_download, KNOWN_APPROX_VAES
|
||||
from ..taesd.taesd import TAESD
|
||||
from ..sd import VAE
|
||||
from ..utils import load_torch_file
|
||||
|
||||
MAX_PREVIEW_RESOLUTION = args.preview_size
|
||||
VIDEO_TAES = ["taehv", "lighttaew2_2", "lighttaew2_1", "lighttaehy1_5"]
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def preview_to_image(latent_image) -> Image:
|
||||
latents_ubyte = (((latent_image + 1.0) / 2.0).clamp(0, 1) # change scale from -1..1 to 0..1
|
||||
.mul(0xFF) # to 0..255
|
||||
)
|
||||
|
||||
def preview_to_image(latent_image, do_scale=True) -> Image.Image:
|
||||
if do_scale:
|
||||
latents_ubyte = (((latent_image + 1.0) / 2.0).clamp(0, 1) # change scale from -1..1 to 0..1
|
||||
.mul(0xFF) # to 0..255
|
||||
)
|
||||
else:
|
||||
latents_ubyte = (latent_image.clamp(0, 1)
|
||||
.mul(0xFF) # to 0..255
|
||||
)
|
||||
if model_management.directml_device is not None:
|
||||
latents_ubyte = latents_ubyte.to(dtype=torch.uint8)
|
||||
latents_ubyte = latents_ubyte.to(device="cpu", dtype=torch.uint8, non_blocking=model_management.device_supports_non_blocking(latent_image.device))
|
||||
@ -31,7 +41,7 @@ def preview_to_image(latent_image) -> Image:
|
||||
|
||||
|
||||
class LatentPreviewer:
|
||||
def decode_latent_to_preview(self, x0) -> Image:
|
||||
def decode_latent_to_preview(self, x0) -> Image.Image:
|
||||
raise NotImplementedError
|
||||
|
||||
def decode_latent_to_preview_image(self, preview_format, x0) -> UnencodedPreviewImageMessage:
|
||||
@ -49,14 +59,23 @@ class TAESDPreviewerImpl(LatentPreviewer):
|
||||
return preview_to_image(x_sample)
|
||||
|
||||
|
||||
class TAEHVPreviewerImpl(TAESDPreviewerImpl):
|
||||
def decode_latent_to_preview(self, x0):
|
||||
x_sample = self.taesd.decode(x0[:1, :, :1])[0][0]
|
||||
return preview_to_image(x_sample, do_scale=False)
|
||||
|
||||
|
||||
class Latent2RGBPreviewer(LatentPreviewer):
|
||||
def __init__(self, latent_rgb_factors, latent_rgb_factors_bias=None):
|
||||
def __init__(self, latent_rgb_factors, latent_rgb_factors_bias=None, latent_rgb_factors_reshape=None):
|
||||
self.latent_rgb_factors = torch.tensor(latent_rgb_factors, device="cpu").transpose(0, 1)
|
||||
self.latent_rgb_factors_bias = None
|
||||
if latent_rgb_factors_bias is not None:
|
||||
self.latent_rgb_factors_bias = torch.tensor(latent_rgb_factors_bias, device="cpu")
|
||||
self.latent_rgb_factors_reshape = latent_rgb_factors_reshape
|
||||
|
||||
def decode_latent_to_preview(self, x0):
|
||||
if self.latent_rgb_factors_reshape is not None:
|
||||
x0 = self.latent_rgb_factors_reshape(x0)
|
||||
self.latent_rgb_factors = self.latent_rgb_factors.to(dtype=x0.dtype, device=x0.device)
|
||||
if self.latent_rgb_factors_bias is not None:
|
||||
self.latent_rgb_factors_bias = self.latent_rgb_factors_bias.to(dtype=x0.dtype, device=x0.device)
|
||||
@ -91,14 +110,19 @@ def get_previewer(device, latent_format):
|
||||
|
||||
if method == LatentPreviewMethod.TAESD:
|
||||
if taesd_decoder_path:
|
||||
taesd = TAESD(None, taesd_decoder_path, latent_channels=latent_format.latent_channels).to(device)
|
||||
previewer = TAESDPreviewerImpl(taesd)
|
||||
if latent_format.taesd_decoder_name in VIDEO_TAES:
|
||||
taesd = VAE(load_torch_file(taesd_decoder_path))
|
||||
taesd.first_stage_model.show_progress_bar = False
|
||||
previewer = TAEHVPreviewerImpl(taesd)
|
||||
else:
|
||||
taesd = TAESD(None, taesd_decoder_path, latent_channels=latent_format.latent_channels).to(device)
|
||||
previewer = TAESDPreviewerImpl(taesd)
|
||||
else:
|
||||
logger.warning("Warning: TAESD previews enabled, but could not find models/vae_approx/{}".format(latent_format.taesd_decoder_name))
|
||||
|
||||
if previewer is None:
|
||||
if latent_format.latent_rgb_factors is not None:
|
||||
previewer = Latent2RGBPreviewer(latent_format.latent_rgb_factors, latent_format.latent_rgb_factors_bias)
|
||||
previewer = Latent2RGBPreviewer(latent_format.latent_rgb_factors, latent_format.latent_rgb_factors_bias, latent_format.latent_rgb_factors_reshape)
|
||||
return previewer
|
||||
|
||||
|
||||
|
||||
@ -1,30 +1,30 @@
|
||||
from .main_pre import tracer
|
||||
|
||||
import asyncio
|
||||
import contextvars
|
||||
import gc
|
||||
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from ..cli_args_types import Configuration
|
||||
from ..component_model.file_counter import cleanup_temp as fc_cleanup_temp
|
||||
from ..execution_context import current_execution_context
|
||||
|
||||
from ..component_model.abstract_prompt_queue import AbstractPromptQueue
|
||||
from . import hook_breaker_ac10a0
|
||||
from .extra_model_paths import load_extra_path_config
|
||||
from .. import model_management
|
||||
from ..analytics.analytics import initialize_event_tracking
|
||||
from ..cmd import cuda_malloc
|
||||
from ..cmd import folder_paths
|
||||
from ..cmd import server as server_module
|
||||
from ..component_model.abstract_prompt_queue import AbstractPromptQueue
|
||||
from ..cli_args_types import Configuration
|
||||
from . import cuda_malloc
|
||||
from . import folder_paths
|
||||
from . import server as server_module
|
||||
from ..component_model.entrypoints_common import configure_application_paths, executor_from_args
|
||||
from ..component_model.file_counter import cleanup_temp as fc_cleanup_temp
|
||||
from ..distributed.distributed_prompt_queue import DistributedPromptQueue
|
||||
from ..distributed.server_stub import ServerStub
|
||||
from ..execution_context import current_execution_context
|
||||
from ..nodes.package import import_all_nodes_in_workspace
|
||||
from ..nodes_context import get_nodes
|
||||
|
||||
@ -44,22 +44,27 @@ def cuda_malloc_warning():
|
||||
"\nWARNING: this card most likely does not support cuda-malloc, if you get \"CUDA error\" please run ComfyUI with: --disable-cuda-malloc\n")
|
||||
|
||||
|
||||
def prompt_worker(q: AbstractPromptQueue, server_instance: server_module.PromptServer):
|
||||
asyncio.run(_prompt_worker(q, server_instance))
|
||||
def handle_comfyui_manager_unavailable(args: Configuration):
|
||||
if not args.windows_standalone_build:
|
||||
logger.warning(f"\n\nYou appear to be running comfyui-manager from source, this is not recommended. Please install comfyui-manager using the following command:\ncommand:\n\t{sys.executable} -m pip install --pre comfyui_manager\n")
|
||||
args.enable_manager = False
|
||||
|
||||
|
||||
async def _prompt_worker(q: AbstractPromptQueue, server_instance: server_module.PromptServer):
|
||||
from ..cmd import execution
|
||||
from . import execution
|
||||
from ..component_model import queue_types
|
||||
from .. import model_management
|
||||
|
||||
args = current_execution_context().configuration
|
||||
cache_type = execution.CacheType.CLASSIC
|
||||
if args.cache_lru > 0:
|
||||
cache_type = execution.CacheType.LRU
|
||||
elif args.cache_ram > 0:
|
||||
cache_type = execution.CacheType.RAM_PRESSURE
|
||||
elif args.cache_none:
|
||||
cache_type = execution.CacheType.DEPENDENCY_AWARE
|
||||
cache_type = execution.CacheType.NONE
|
||||
|
||||
e = execution.PromptExecutor(server_instance, cache_type=cache_type, cache_size=args.cache_lru)
|
||||
e = execution.PromptExecutor(server_instance, cache_type=cache_type, cache_args={"lru": args.cache_lru, "ram": args.cache_ram})
|
||||
last_gc_collect = 0
|
||||
need_gc = False
|
||||
gc_collect_interval = 10.0
|
||||
@ -76,10 +81,17 @@ async def _prompt_worker(q: AbstractPromptQueue, server_instance: server_module.
|
||||
prompt_id = item[1]
|
||||
server_instance.last_prompt_id = prompt_id
|
||||
|
||||
sensitive = item[5]
|
||||
extra_data = item[3].copy()
|
||||
for k in sensitive:
|
||||
extra_data[k] = sensitive[k]
|
||||
|
||||
# todo: ??? what jank
|
||||
remove_sensitive = lambda prompt: prompt[:5] + prompt[6:]
|
||||
|
||||
await e.execute_async(item[2], prompt_id, item[3], item[4])
|
||||
need_gc = True
|
||||
|
||||
# Extract error details from status_messages if there's an error
|
||||
error_details = None
|
||||
if not e.success:
|
||||
for event, data in e.status_messages:
|
||||
@ -87,7 +99,6 @@ async def _prompt_worker(q: AbstractPromptQueue, server_instance: server_module.
|
||||
error_details = data
|
||||
break
|
||||
|
||||
# Convert status_messages tuples to string messages for backward compatibility
|
||||
messages = [f"{event}: {data.get('exception_message', str(data))}" if isinstance(data, dict) and 'exception_message' in data else f"{event}" for event, data in e.status_messages]
|
||||
|
||||
q.task_done(item_id,
|
||||
@ -96,13 +107,16 @@ async def _prompt_worker(q: AbstractPromptQueue, server_instance: server_module.
|
||||
status_str='success' if e.success else 'error',
|
||||
completed=e.success,
|
||||
messages=messages),
|
||||
error_details=error_details)
|
||||
error_details=error_details,
|
||||
process_item=remove_sensitive,
|
||||
)
|
||||
|
||||
if server_instance.client_id is not None:
|
||||
server_instance.send_sync("executing", {"node": None, "prompt_id": prompt_id},
|
||||
server_instance.client_id)
|
||||
server_instance.send_sync("executing", {"node": None, "prompt_id": prompt_id}, server_instance.client_id)
|
||||
|
||||
current_time = time.perf_counter()
|
||||
execution_time = current_time - execution_start_time
|
||||
|
||||
# Log Time in a more readable way after 10 minutes
|
||||
if execution_time > 600:
|
||||
execution_time = time.strftime("%H:%M:%S", time.gmtime(execution_time))
|
||||
@ -133,7 +147,11 @@ async def _prompt_worker(q: AbstractPromptQueue, server_instance: server_module.
|
||||
hook_breaker_ac10a0.restore_functions()
|
||||
|
||||
|
||||
async def run(server_instance, address='', port=8188, verbose=True, call_on_start=None):
|
||||
def prompt_worker(q: AbstractPromptQueue, server_instance: server_module.PromptServer):
|
||||
asyncio.run(_prompt_worker(q, server_instance))
|
||||
|
||||
|
||||
async def run(server_instance, address='', port=8188, call_on_start=None):
|
||||
addresses = []
|
||||
for addr in address.split(","):
|
||||
addresses.append((addr, port))
|
||||
@ -173,6 +191,7 @@ async def _start_comfyui(from_script_dir: Optional[Path] = None, configuration:
|
||||
await __start_comfyui(from_script_dir=from_script_dir)
|
||||
|
||||
|
||||
@tracer.start_as_current_span("Start ComfyUI")
|
||||
async def __start_comfyui(from_script_dir: Optional[Path] = None):
|
||||
"""
|
||||
Runs ComfyUI's frontend and backend like upstream.
|
||||
@ -194,6 +213,23 @@ async def __start_comfyui(from_script_dir: Optional[Path] = None):
|
||||
logger.info(f"Setting user directory to: {user_dir}")
|
||||
folder_paths.set_user_directory(user_dir)
|
||||
|
||||
# todo: the manager code has to live inside vanilla_node_importing, it has to deal with a git repo already being in custom_nodes
|
||||
# if args.enable_manager:
|
||||
# if importlib.util.find_spec("comfyui_manager"):
|
||||
# import comfyui_manager
|
||||
#
|
||||
# if not comfyui_manager.__file__ or not comfyui_manager.__file__.endswith('__init__.py'):
|
||||
# handle_comfyui_manager_unavailable(args)
|
||||
# else:
|
||||
# handle_comfyui_manager_unavailable(args)
|
||||
#
|
||||
# if args.enable_manager:
|
||||
# try:
|
||||
# import comfyui_manager
|
||||
# comfyui_manager.prestartup()
|
||||
# except:
|
||||
# pass
|
||||
|
||||
# configure extra model paths earlier
|
||||
try:
|
||||
extra_model_paths_config_path = os.path.join(os_getcwd, "extra_model_paths.yaml")
|
||||
@ -224,6 +260,15 @@ async def __start_comfyui(from_script_dir: Optional[Path] = None):
|
||||
|
||||
loop = asyncio.get_event_loop()
|
||||
server = server_module.PromptServer(loop)
|
||||
|
||||
# todo: the manager code has to live inside vanilla_node_importing, it has to deal with a git repo already being in custom_nodes
|
||||
# if args.enable_manager and not args.disable_manager_ui:
|
||||
# try:
|
||||
# import comfyui_manager
|
||||
# comfyui_manager.start()
|
||||
# except:
|
||||
# pass
|
||||
|
||||
if args.external_address is not None:
|
||||
server.external_address = args.external_address
|
||||
|
||||
@ -317,8 +362,7 @@ async def __start_comfyui(from_script_dir: Optional[Path] = None):
|
||||
|
||||
try:
|
||||
await server.setup()
|
||||
await run(server, address=first_listen_addr, port=args.port, verbose=not args.dont_print_server,
|
||||
call_on_start=call_on_start)
|
||||
await run(server, address=first_listen_addr, port=args.port, call_on_start=call_on_start)
|
||||
except (asyncio.CancelledError, KeyboardInterrupt):
|
||||
logger.debug("Stopped server")
|
||||
finally:
|
||||
|
||||
@ -13,9 +13,7 @@ import logging
|
||||
import os
|
||||
import shutil
|
||||
import warnings
|
||||
|
||||
import fsspec
|
||||
from opentelemetry.instrumentation.urllib3 import URLLib3Instrumentor
|
||||
|
||||
from .. import options
|
||||
from ..app import logger
|
||||
@ -133,6 +131,8 @@ def _create_tracer():
|
||||
from opentelemetry.processor.baggage import BaggageSpanProcessor, ALLOW_ALL_BAGGAGE_KEYS
|
||||
from opentelemetry.instrumentation.aiohttp_server import AioHttpServerInstrumentor
|
||||
from opentelemetry.instrumentation.aiohttp_client import AioHttpClientInstrumentor
|
||||
from opentelemetry.instrumentation.urllib3 import URLLib3Instrumentor
|
||||
|
||||
from ..tracing_compatibility import ProgressSpanSampler
|
||||
from ..tracing_compatibility import patch_spanbuilder_set_channel
|
||||
|
||||
|
||||
@ -28,6 +28,7 @@ from PIL import Image, ImageOps
|
||||
from PIL.PngImagePlugin import PngInfo
|
||||
from aiohttp import web
|
||||
from can_ada import URL, parse as urlparse # pylint: disable=no-name-in-module
|
||||
from packaging import version
|
||||
from typing_extensions import NamedTuple
|
||||
|
||||
from comfy_api import feature_flags
|
||||
@ -41,6 +42,7 @@ from ..api_server.routes.internal.internal_routes import InternalRoutes
|
||||
from ..app.custom_node_manager import CustomNodeManager
|
||||
from ..app.frontend_management import FrontendManager
|
||||
from ..app.model_manager import ModelFileManager
|
||||
from ..app.subgraph_manager import SubgraphManager
|
||||
from ..app.user_manager import UserManager
|
||||
from ..cli_args import args
|
||||
from ..client.client_types import FileOutput
|
||||
@ -52,9 +54,10 @@ from ..component_model.executor_types import ExecutorToClientProgress, StatusMes
|
||||
UnencodedPreviewImageMessage, PreviewImageWithMetadataMessage
|
||||
from ..component_model.file_output_path import file_output_path
|
||||
from ..component_model.queue_types import QueueItem, HistoryEntry, BinaryEventTypes, TaskInvocation, ExecutionError, \
|
||||
ExecutionStatus
|
||||
ExecutionStatus, QueueTuple, ExtraData
|
||||
from ..digest import digest
|
||||
from ..images import open_image
|
||||
from ..middleware.cache_middleware import cache_control
|
||||
from ..model_management import get_torch_device, get_torch_device_name, get_total_memory, get_free_memory, torch_version
|
||||
from ..nodes.package_typing import ExportedNodes
|
||||
from ..progress_types import PreviewImageMetadata
|
||||
@ -67,8 +70,17 @@ class HeuristicPath(NamedTuple):
|
||||
abs_path: str
|
||||
|
||||
|
||||
# Import cache control middleware
|
||||
from ..middleware.cache_middleware import cache_control
|
||||
# todo: what is this really trying to do?
|
||||
LOADED_MODULE_DIRS = {}
|
||||
|
||||
|
||||
# todo: is this really how we want to enable the manager? we will have to deal with this later
|
||||
# if args.enable_manager:
|
||||
# try:
|
||||
# import comfyui_manager
|
||||
# except ImportError:
|
||||
# logger.warning("ComfyUI Manager not found but enabled in args.")
|
||||
|
||||
|
||||
async def send_socket_catch_exception(function, message):
|
||||
try:
|
||||
@ -84,6 +96,7 @@ def get_comfyui_version():
|
||||
# Track deprecated paths that have been warned about to only warn once per file
|
||||
_deprecated_paths_warned = set()
|
||||
|
||||
|
||||
@web.middleware
|
||||
async def deprecation_warning(request: web.Request, handler):
|
||||
"""Middleware to warn about deprecated frontend API paths"""
|
||||
@ -93,7 +106,7 @@ async def deprecation_warning(request: web.Request, handler):
|
||||
# Only warn once per unique file path
|
||||
if path not in _deprecated_paths_warned:
|
||||
_deprecated_paths_warned.add(path)
|
||||
logging.warning(
|
||||
logger.warning(
|
||||
f"[DEPRECATION WARNING] Detected import of deprecated legacy API: {path}. "
|
||||
f"This is likely caused by a custom node extension using outdated APIs. "
|
||||
f"Please update your extensions or contact the extension author for an updated version."
|
||||
@ -144,7 +157,7 @@ def create_cors_middleware(allowed_origin: str):
|
||||
response = await handler(request)
|
||||
|
||||
response.headers['Access-Control-Allow-Origin'] = allowed_origin
|
||||
response.headers['Access-Control-Allow-Methods'] = 'POST, GET, DELETE, PUT, OPTIONS'
|
||||
response.headers['Access-Control-Allow-Methods'] = 'POST, GET, DELETE, PUT, OPTIONS, PATCH'
|
||||
response.headers['Access-Control-Allow-Headers'] = 'Content-Type, Authorization, traceparent, tracestate'
|
||||
response.headers['Access-Control-Allow-Credentials'] = 'true'
|
||||
return response
|
||||
@ -215,6 +228,21 @@ def create_origin_only_middleware():
|
||||
return origin_only_middleware
|
||||
|
||||
|
||||
def create_block_external_middleware():
|
||||
@web.middleware
|
||||
async def block_external_middleware(request: web.Request, handler):
|
||||
if request.method == "OPTIONS":
|
||||
# Pre-flight request. Reply successfully:
|
||||
response = web.Response()
|
||||
else:
|
||||
response = await handler(request)
|
||||
|
||||
response.headers['Content-Security-Policy'] = "default-src 'self'; script-src 'self' 'unsafe-inline' 'unsafe-eval' blob:; style-src 'self' 'unsafe-inline'; img-src 'self' data: blob:; font-src 'self'; connect-src 'self'; frame-src 'self'; object-src 'self';"
|
||||
return response
|
||||
|
||||
return block_external_middleware
|
||||
|
||||
|
||||
class PromptServer(ExecutorToClientProgress):
|
||||
instance: Optional['PromptServer'] = None
|
||||
|
||||
@ -230,6 +258,7 @@ class PromptServer(ExecutorToClientProgress):
|
||||
self.user_manager = UserManager()
|
||||
self.model_file_manager = ModelFileManager()
|
||||
self.custom_node_manager = CustomNodeManager()
|
||||
self.subgraph_manager = SubgraphManager()
|
||||
self.internal_routes = InternalRoutes(self)
|
||||
# todo: this is probably read by custom nodes elsewhere
|
||||
self.supports: List[str] = ["custom_nodes_from_web"]
|
||||
@ -251,6 +280,13 @@ class PromptServer(ExecutorToClientProgress):
|
||||
else:
|
||||
middlewares.append(create_origin_only_middleware())
|
||||
|
||||
if args.disable_api_nodes:
|
||||
middlewares.append(create_block_external_middleware())
|
||||
|
||||
# todo: enable the package-installed manager later
|
||||
# if args.enable_manager:
|
||||
# middlewares.append(comfyui_manager.create_middleware())
|
||||
|
||||
max_upload_size = round(args.max_upload_size * 1024 * 1024)
|
||||
self.app: web.Application = web.Application(client_max_size=max_upload_size,
|
||||
handler_args={'max_field_size': 16380},
|
||||
@ -634,7 +670,7 @@ class PromptServer(ExecutorToClientProgress):
|
||||
|
||||
system_stats = {
|
||||
"system": {
|
||||
"os": os.name,
|
||||
"os": sys.platform,
|
||||
"ram_total": ram_total,
|
||||
"ram_free": ram_free,
|
||||
"comfyui_version": __version__,
|
||||
@ -746,8 +782,18 @@ class PromptServer(ExecutorToClientProgress):
|
||||
async def get_queue(request):
|
||||
queue_info = {}
|
||||
current_queue = self.prompt_queue.get_current_queue_volatile()
|
||||
queue_info['queue_running'] = current_queue[0]
|
||||
queue_info['queue_pending'] = current_queue[1]
|
||||
|
||||
def remove_sensitive(queue: List[QueueItem]):
|
||||
items = []
|
||||
for item in queue:
|
||||
items.append({
|
||||
**item,
|
||||
"sensitive": None,
|
||||
})
|
||||
return items
|
||||
|
||||
queue_info['queue_running'] = remove_sensitive(current_queue[0])
|
||||
queue_info['queue_pending'] = remove_sensitive(current_queue[1])
|
||||
return web.json_response(queue_info)
|
||||
|
||||
@routes.post("/prompt")
|
||||
@ -783,7 +829,7 @@ class PromptServer(ExecutorToClientProgress):
|
||||
if valid[0]:
|
||||
outputs_to_execute = valid[2]
|
||||
self.prompt_queue.put(
|
||||
QueueItem(queue_tuple=(number, prompt_id, prompt, extra_data, outputs_to_execute),
|
||||
QueueItem(queue_tuple=QueueTuple(number, prompt_id, prompt, extra_data, outputs_to_execute, None),
|
||||
completed=None))
|
||||
response = {"prompt_id": prompt_id, "number": number, "node_errors": valid[3]}
|
||||
return web.json_response(response)
|
||||
@ -828,8 +874,7 @@ class PromptServer(ExecutorToClientProgress):
|
||||
# Check if the prompt_id matches any currently running prompt
|
||||
should_interrupt = False
|
||||
for item in currently_running:
|
||||
# item structure: (number, prompt_id, prompt, extra_data, outputs_to_execute)
|
||||
if item[1] == prompt_id:
|
||||
if item.prompt_id == prompt_id:
|
||||
logger.debug(f"Interrupting prompt {prompt_id}")
|
||||
should_interrupt = True
|
||||
break
|
||||
@ -968,7 +1013,8 @@ class PromptServer(ExecutorToClientProgress):
|
||||
completed: Future[TaskInvocation | dict] = self.loop.create_future()
|
||||
# todo: actually implement idempotency keys
|
||||
# we would need some kind of more durable, distributed task queue
|
||||
item = QueueItem(queue_tuple=(number, task_id, prompt_dict, {}, valid[2]), completed=completed)
|
||||
# QueueItem deals with sensitive data uniformly now
|
||||
item = QueueItem(queue_tuple=QueueTuple(number, task_id, prompt_dict, ExtraData(), valid[2], None), completed=completed)
|
||||
|
||||
try:
|
||||
if hasattr(self.prompt_queue, "put_async") or isinstance(self.prompt_queue, AsyncAbstractPromptQueue):
|
||||
@ -1112,6 +1158,7 @@ class PromptServer(ExecutorToClientProgress):
|
||||
self.model_file_manager.add_routes(self.routes)
|
||||
# todo: needs to use module directories
|
||||
self.custom_node_manager.add_routes(self.routes, self.app, {})
|
||||
self.subgraph_manager.add_routes(self.routes, LOADED_MODULE_DIRS.items())
|
||||
self.app.add_subapp('/internal', self.internal_routes.get_app())
|
||||
|
||||
# Prefix every route with /api for easier matching for delegation.
|
||||
@ -1132,11 +1179,31 @@ class PromptServer(ExecutorToClientProgress):
|
||||
for name, dir in self.nodes.EXTENSION_WEB_DIRS.items():
|
||||
self.app.add_routes([web.static('/extensions/' + name, dir, follow_symlinks=True)])
|
||||
|
||||
workflow_templates_path = FrontendManager.templates_path()
|
||||
if workflow_templates_path:
|
||||
self.app.add_routes([
|
||||
web.static('/templates', workflow_templates_path)
|
||||
])
|
||||
installed_templates_version = FrontendManager.get_installed_templates_version()
|
||||
use_legacy_templates = True
|
||||
if installed_templates_version:
|
||||
try:
|
||||
use_legacy_templates = (
|
||||
version.parse(installed_templates_version)
|
||||
< version.parse("0.3.0")
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.warning(
|
||||
"Unable to parse templates version '%s': %s",
|
||||
installed_templates_version,
|
||||
exc,
|
||||
)
|
||||
|
||||
if use_legacy_templates:
|
||||
workflow_templates_path = FrontendManager.legacy_templates_path()
|
||||
if workflow_templates_path:
|
||||
self.app.add_routes([
|
||||
web.static('/templates', workflow_templates_path)
|
||||
])
|
||||
else:
|
||||
handler = FrontendManager.template_asset_handler()
|
||||
if handler:
|
||||
self.app.router.add_get("/templates/{path:.*}", handler)
|
||||
|
||||
# Serve embedded documentation from the package
|
||||
embedded_docs_path = FrontendManager.embedded_docs_path()
|
||||
@ -1161,8 +1228,12 @@ class PromptServer(ExecutorToClientProgress):
|
||||
await self.send_image(data, sid=sid)
|
||||
elif event == BinaryEventTypes.PREVIEW_IMAGE_WITH_METADATA:
|
||||
# data is (preview_image, metadata)
|
||||
|
||||
data: PreviewImageWithMetadataMessage
|
||||
preview_image, metadata = data
|
||||
if isinstance(preview_image, dict):
|
||||
# todo: this has to be fixed from transformers loader for previewing tokens in real time
|
||||
return
|
||||
await self.send_image_with_metadata(preview_image, metadata, sid=sid)
|
||||
elif isinstance(data, (bytes, bytearray)):
|
||||
await self.send_bytes(event, data, sid)
|
||||
|
||||
@ -3,7 +3,7 @@ from __future__ import annotations
|
||||
import typing
|
||||
from abc import ABCMeta, abstractmethod
|
||||
|
||||
from .executor_types import HistoryResultDict
|
||||
from .executor_types import HistoryResultDict, ExecutionErrorMessage
|
||||
from .queue_types import QueueTuple, HistoryEntry, QueueItem, Flags, ExecutionStatus, TaskInvocation, AbstractPromptQueueGetCurrentQueueItems
|
||||
|
||||
|
||||
@ -43,10 +43,11 @@ class AbstractPromptQueue(metaclass=ABCMeta):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def task_done(self, item_id: str, outputs: HistoryResultDict,
|
||||
status: typing.Optional[ExecutionStatus]):
|
||||
def task_done(self, item_id: str, outputs: HistoryResultDict, status: typing.Optional[ExecutionStatus], error_details: typing.Optional[ExecutionErrorMessage] = None, process_item: typing.Optional[typing.Callable[[QueueTuple], QueueItem]] = None):
|
||||
"""
|
||||
Signals to the user interface that the task with the specified id is completed
|
||||
:param error_details:
|
||||
:param process_item:
|
||||
:param item_id: the ID of the task that should be marked as completed
|
||||
:param outputs: an opaque dictionary of outputs
|
||||
:param status:
|
||||
|
||||
66
comfy/component_model/configuration.py
Normal file
66
comfy/component_model/configuration.py
Normal file
@ -0,0 +1,66 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from ..cli_args_types import Configuration
|
||||
from ..cli_args import default_configuration
|
||||
|
||||
MODEL_MANAGEMENT_ARGS = {
|
||||
"deterministic",
|
||||
"directml",
|
||||
"cpu",
|
||||
"disable_xformers",
|
||||
# todo: this is the default, so it will be omitted
|
||||
# "use_pytorch_cross_attention",
|
||||
"use_split_cross_attention",
|
||||
"use_quad_cross_attention",
|
||||
"supports_fp8_compute",
|
||||
"fast",
|
||||
"lowvram",
|
||||
"novram",
|
||||
"highvram",
|
||||
"gpu_only",
|
||||
"force_fp32",
|
||||
"force_fp16",
|
||||
"force_bf16",
|
||||
"reserve_vram",
|
||||
"disable_smart_memory",
|
||||
"disable_ipex_optimize",
|
||||
"disable_pinned_memory",
|
||||
"async_offload",
|
||||
"disable_async_offload",
|
||||
"force_non_blocking",
|
||||
"force_channels_last",
|
||||
"fp32_unet",
|
||||
"fp64_unet",
|
||||
"bf16_unet",
|
||||
"fp16_unet",
|
||||
"fp8_e4m3fn_unet",
|
||||
"fp8_e5m2_unet",
|
||||
"fp8_e8m0fnu_unet",
|
||||
"fp8_e4m3fn_text_enc",
|
||||
"fp8_e5m2_text_enc",
|
||||
"fp16_text_enc",
|
||||
"bf16_text_enc",
|
||||
"fp32_text_enc",
|
||||
"cpu_vae",
|
||||
"fp16_vae",
|
||||
"bf16_vae",
|
||||
"fp32_vae",
|
||||
"force_upcast_attention",
|
||||
"use_sage_attention",
|
||||
"use_flash_attention",
|
||||
}
|
||||
|
||||
|
||||
def requires_process_pool_executor(configuration: Configuration | None) -> bool:
|
||||
if configuration is None:
|
||||
return False
|
||||
|
||||
default = default_configuration()
|
||||
for key in MODEL_MANAGEMENT_ARGS:
|
||||
# Check if key is in configuration and differs from default
|
||||
if key in configuration:
|
||||
val = configuration[key]
|
||||
# Use equality check, handling potential missing keys in default (though default should have them)
|
||||
if key not in default or val != default[key]:
|
||||
return True
|
||||
return False
|
||||
@ -2,7 +2,7 @@ from __future__ import annotations # for Python 3.7-3.9
|
||||
|
||||
import concurrent.futures
|
||||
from enum import Enum
|
||||
from typing import Optional, Literal, Protocol, Union, NamedTuple, List, runtime_checkable, Iterable, Dict, Any
|
||||
from typing import Optional, Literal, Protocol, Union, NamedTuple, List, runtime_checkable, Dict, Any
|
||||
|
||||
import PIL.Image
|
||||
from typing_extensions import NotRequired, TypedDict, Never
|
||||
|
||||
@ -2,15 +2,29 @@ from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import copy
|
||||
import time
|
||||
import typing
|
||||
from enum import Enum
|
||||
from typing import NamedTuple, Optional, List, Literal, Sequence
|
||||
from typing import Tuple
|
||||
|
||||
from typing_extensions import NotRequired, TypedDict
|
||||
|
||||
from .outputs_types import OutputsDict
|
||||
from .sensitive_data import SENSITIVE_EXTRA_DATA_KEYS
|
||||
|
||||
if typing.TYPE_CHECKING:
|
||||
from .executor_types import ExecutionErrorMessage
|
||||
|
||||
|
||||
class QueueTuple(NamedTuple):
|
||||
priority: float
|
||||
prompt_id: str
|
||||
prompt: dict
|
||||
extra_data: Optional[ExtraData] = None
|
||||
good_outputs: Optional[List[str]] = None
|
||||
sensitive: Optional[dict] = None
|
||||
|
||||
|
||||
QueueTuple = Tuple[float, str, dict, dict, list]
|
||||
MAXIMUM_HISTORY_SIZE = 10000
|
||||
|
||||
|
||||
@ -63,6 +77,7 @@ class ExecutionStatusAsDict(TypedDict):
|
||||
status_str: Literal['success', 'error']
|
||||
completed: bool
|
||||
messages: List[str]
|
||||
error_details: NotRequired[ExecutionErrorMessage]
|
||||
|
||||
|
||||
class Flags(TypedDict, total=False):
|
||||
@ -82,7 +97,7 @@ class ExtraData(TypedDict):
|
||||
token: NotRequired[str]
|
||||
|
||||
|
||||
class NamedQueueTuple(dict):
|
||||
class QueueDict(dict):
|
||||
"""
|
||||
A wrapper class for a queue tuple, the object that is given to executors.
|
||||
|
||||
@ -92,13 +107,25 @@ class NamedQueueTuple(dict):
|
||||
__slots__ = ('queue_tuple',)
|
||||
|
||||
def __init__(self, queue_tuple: QueueTuple):
|
||||
# Initialize the dictionary superclass with the data we want to serialize.
|
||||
# initialize the dictionary superclass with the data we want to serialize.
|
||||
# populate the queue tuple with the appropriate dummy fields
|
||||
queue_tuple = QueueTuple(*queue_tuple)
|
||||
if queue_tuple.sensitive is None:
|
||||
sensitive = {}
|
||||
extra_data = queue_tuple.extra_data or {}
|
||||
for sensitive_val in SENSITIVE_EXTRA_DATA_KEYS:
|
||||
if sensitive_val in extra_data:
|
||||
sensitive[sensitive_val] = extra_data.pop(sensitive_val)
|
||||
extra_data["create_time"] = int(time.time() * 1000) # timestamp in milliseconds
|
||||
queue_tuple = QueueTuple(queue_tuple.priority, queue_tuple.prompt_id, queue_tuple.prompt, extra_data, queue_tuple.good_outputs, sensitive)
|
||||
|
||||
super().__init__(
|
||||
priority=queue_tuple[0],
|
||||
prompt_id=queue_tuple[1],
|
||||
prompt=queue_tuple[2],
|
||||
extra_data=queue_tuple[3] if len(queue_tuple) > 3 else None,
|
||||
good_outputs=queue_tuple[4] if len(queue_tuple) > 4 else None
|
||||
extra_data=queue_tuple[3],
|
||||
good_outputs=queue_tuple[4],
|
||||
sensitive=queue_tuple[5],
|
||||
)
|
||||
# Store the original tuple in a slot, making it invisible to json.dumps.
|
||||
self.queue_tuple = queue_tuple
|
||||
@ -127,8 +154,17 @@ class NamedQueueTuple(dict):
|
||||
return self.queue_tuple[4]
|
||||
return None
|
||||
|
||||
@property
|
||||
def sensitive(self) -> Optional[dict]:
|
||||
if len(self.queue_tuple) > 5:
|
||||
return self.queue_tuple[5]
|
||||
return None
|
||||
|
||||
class QueueItem(NamedQueueTuple):
|
||||
|
||||
NamedQueueTuple = QueueDict
|
||||
|
||||
|
||||
class QueueItem(QueueDict):
|
||||
"""
|
||||
An item awaiting processing in the queue: a NamedQueueTuple with a future that is completed when the item is done
|
||||
processing.
|
||||
@ -164,4 +200,4 @@ class ExecutorToClientMessage(TypedDict, total=False):
|
||||
output: NotRequired[str]
|
||||
|
||||
|
||||
AbstractPromptQueueGetCurrentQueueItems = tuple[list[QueueTuple], list[QueueTuple]]
|
||||
AbstractPromptQueueGetCurrentQueueItems = tuple[list[QueueItem], list[QueueItem]]
|
||||
|
||||
3
comfy/component_model/sensitive_data.py
Normal file
3
comfy/component_model/sensitive_data.py
Normal file
@ -0,0 +1,3 @@
|
||||
from __future__ import annotations
|
||||
|
||||
SENSITIVE_EXTRA_DATA_KEYS = ("auth_token_comfy_org", "api_key_comfy_org")
|
||||
@ -6,6 +6,8 @@ import collections
|
||||
from dataclasses import dataclass
|
||||
from abc import ABC, abstractmethod
|
||||
import logging
|
||||
|
||||
from . import patcher_extension
|
||||
from .model_management import throw_exception_if_processing_interrupted
|
||||
from .patcher_extension import get_all_callbacks, WrappersMP
|
||||
|
||||
@ -54,26 +56,36 @@ class ContextHandlerABC(ABC):
|
||||
|
||||
|
||||
class IndexListContextWindow(ContextWindowABC):
|
||||
def __init__(self, index_list: list[int], dim: int = 0):
|
||||
def __init__(self, index_list: list[int], dim: int = 0, total_frames: int=0):
|
||||
self.index_list = index_list
|
||||
self.context_length = len(index_list)
|
||||
self.dim = dim
|
||||
self.total_frames = total_frames
|
||||
self.center_ratio = (min(index_list) + max(index_list)) / (2 * total_frames)
|
||||
|
||||
def get_tensor(self, full: torch.Tensor, device=None, dim=None) -> torch.Tensor:
|
||||
def get_tensor(self, full: torch.Tensor, device=None, dim=None, retain_index_list=[]) -> torch.Tensor:
|
||||
if dim is None:
|
||||
dim = self.dim
|
||||
if dim == 0 and full.shape[dim] == 1:
|
||||
return full
|
||||
idx = [slice(None)] * dim + [self.index_list]
|
||||
return full[idx].to(device)
|
||||
idx = tuple([slice(None)] * dim + [self.index_list])
|
||||
window = full[idx]
|
||||
if retain_index_list:
|
||||
idx = tuple([slice(None)] * dim + [retain_index_list])
|
||||
window[idx] = full[idx]
|
||||
return window.to(device)
|
||||
|
||||
def add_window(self, full: torch.Tensor, to_add: torch.Tensor, dim=None) -> torch.Tensor:
|
||||
if dim is None:
|
||||
dim = self.dim
|
||||
idx = [slice(None)] * dim + [self.index_list]
|
||||
idx = tuple([slice(None)] * dim + [self.index_list])
|
||||
full[idx] += to_add
|
||||
return full
|
||||
|
||||
def get_region_index(self, num_regions: int) -> int:
|
||||
region_idx = int(self.center_ratio * num_regions)
|
||||
return min(max(region_idx, 0), num_regions - 1)
|
||||
|
||||
|
||||
class IndexListCallbacks:
|
||||
EVALUATE_CONTEXT_WINDOWS = "evaluate_context_windows"
|
||||
@ -101,7 +113,8 @@ ContextResults = collections.namedtuple("ContextResults", ['window_idx', 'sub_co
|
||||
|
||||
|
||||
class IndexListContextHandler(ContextHandlerABC):
|
||||
def __init__(self, context_schedule: ContextSchedule, fuse_method: ContextFuseMethod, context_length: int = 1, context_overlap: int = 0, context_stride: int = 1, closed_loop=False, dim=0):
|
||||
def __init__(self, context_schedule: ContextSchedule, fuse_method: ContextFuseMethod, context_length: int = 1, context_overlap: int = 0, context_stride: int = 1,
|
||||
closed_loop: bool=False, dim:int=0, freenoise: bool=False, cond_retain_index_list: list[int]=[], split_conds_to_windows: bool=False):
|
||||
self.context_schedule = context_schedule
|
||||
self.fuse_method = fuse_method
|
||||
self.context_length = context_length
|
||||
@ -110,13 +123,18 @@ class IndexListContextHandler(ContextHandlerABC):
|
||||
self.closed_loop = closed_loop
|
||||
self.dim = dim
|
||||
self._step = 0
|
||||
self.freenoise = freenoise
|
||||
self.cond_retain_index_list = [int(x.strip()) for x in cond_retain_index_list.split(",")] if cond_retain_index_list else []
|
||||
self.split_conds_to_windows = split_conds_to_windows
|
||||
|
||||
self.callbacks = {}
|
||||
|
||||
def should_use_context(self, model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]) -> bool:
|
||||
# for now, assume first dim is batch - should have stored on BaseModel in actual implementation
|
||||
if x_in.size(self.dim) > self.context_length:
|
||||
logger.info(f"Using context windows {self.context_length} for {x_in.size(self.dim)} frames.")
|
||||
logger.info(f"Using context windows {self.context_length} with overlap {self.context_overlap} for {x_in.size(self.dim)} frames.")
|
||||
if self.cond_retain_index_list:
|
||||
logger.info(f"Retaining original cond for indexes: {self.cond_retain_index_list}")
|
||||
return True
|
||||
return False
|
||||
|
||||
@ -130,6 +148,11 @@ class IndexListContextHandler(ContextHandlerABC):
|
||||
return None
|
||||
# reuse or resize cond items to match context requirements
|
||||
resized_cond = []
|
||||
# if multiple conds, split based on primary region
|
||||
if self.split_conds_to_windows and len(cond_in) > 1:
|
||||
region = window.get_region_index(len(cond_in))
|
||||
logger.info(f"Splitting conds to windows; using region {region} for window {window[0]}-{window[-1]} with center ratio {window.center_ratio:.3f}")
|
||||
cond_in = [cond_in[region]]
|
||||
# cond object is a list containing a dict - outer list is irrelevant, so just loop through it
|
||||
for actual_cond in cond_in:
|
||||
resized_actual_cond = actual_cond.copy()
|
||||
@ -153,12 +176,19 @@ class IndexListContextHandler(ContextHandlerABC):
|
||||
# when in dictionary, look for tensors and CONDCrossAttn [comfy/conds.py] (has cond attr that is a tensor)
|
||||
for cond_key, cond_value in new_cond_item.items():
|
||||
if isinstance(cond_value, torch.Tensor):
|
||||
if cond_value.ndim < self.dim and cond_value.size(0) == x_in.size(self.dim):
|
||||
if (self.dim < cond_value.ndim and cond_value(self.dim) == x_in.size(self.dim)) or \
|
||||
(cond_value.ndim < self.dim and cond_value.size(0) == x_in.size(self.dim)):
|
||||
new_cond_item[cond_key] = window.get_tensor(cond_value, device)
|
||||
# Handle audio_embed (temporal dim is 1)
|
||||
elif cond_key == "audio_embed" and hasattr(cond_value, "cond") and isinstance(cond_value.cond, torch.Tensor):
|
||||
audio_cond = cond_value.cond
|
||||
if audio_cond.ndim > 1 and audio_cond.size(1) == x_in.size(self.dim):
|
||||
new_cond_item[cond_key] = cond_value._copy_with(window.get_tensor(audio_cond, device, dim=1))
|
||||
# if has cond that is a Tensor, check if needs to be subset
|
||||
elif hasattr(cond_value, "cond") and isinstance(cond_value.cond, torch.Tensor):
|
||||
if cond_value.cond.ndim < self.dim and cond_value.cond.size(0) == x_in.size(self.dim):
|
||||
new_cond_item[cond_key] = cond_value._copy_with(window.get_tensor(cond_value.cond, device))
|
||||
if (self.dim < cond_value.cond.ndim and cond_value.cond.size(self.dim) == x_in.size(self.dim)) or \
|
||||
(cond_value.cond.ndim < self.dim and cond_value.cond.size(0) == x_in.size(self.dim)):
|
||||
new_cond_item[cond_key] = cond_value._copy_with(window.get_tensor(cond_value.cond, device, retain_index_list=self.cond_retain_index_list))
|
||||
elif cond_key == "num_video_frames": # for SVD
|
||||
new_cond_item[cond_key] = cond_value._copy_with(cond_value.cond)
|
||||
new_cond_item[cond_key].cond = window.context_length
|
||||
@ -171,7 +201,7 @@ class IndexListContextHandler(ContextHandlerABC):
|
||||
return resized_cond
|
||||
|
||||
def set_step(self, timestep: torch.Tensor, model_options: dict[str]):
|
||||
mask = torch.isclose(model_options["transformer_options"]["sample_sigmas"], timestep, rtol=0.0001)
|
||||
mask = torch.isclose(model_options["transformer_options"]["sample_sigmas"], timestep[0], rtol=0.0001)
|
||||
matches = torch.nonzero(mask)
|
||||
if torch.numel(matches) == 0:
|
||||
raise Exception("No sample_sigmas matched current timestep; something went wrong.")
|
||||
@ -180,7 +210,7 @@ class IndexListContextHandler(ContextHandlerABC):
|
||||
def get_context_windows(self, model: BaseModel, x_in: torch.Tensor, model_options: dict[str]) -> list[IndexListContextWindow]:
|
||||
full_length = x_in.size(self.dim) # TODO: choose dim based on model
|
||||
context_windows = self.context_schedule.func(full_length, self, model_options)
|
||||
context_windows = [IndexListContextWindow(window, dim=self.dim) for window in context_windows]
|
||||
context_windows = [IndexListContextWindow(window, dim=self.dim, total_frames=full_length) for window in context_windows]
|
||||
return context_windows
|
||||
|
||||
def execute(self, calc_cond_batch: Callable, model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]):
|
||||
@ -256,8 +286,8 @@ class IndexListContextHandler(ContextHandlerABC):
|
||||
prev_weight = (bias_total / (bias_total + bias))
|
||||
new_weight = (bias / (bias_total + bias))
|
||||
# account for dims of tensors
|
||||
idx_window = [slice(None)] * self.dim + [idx]
|
||||
pos_window = [slice(None)] * self.dim + [pos]
|
||||
idx_window = tuple([slice(None)] * self.dim + [idx])
|
||||
pos_window = tuple([slice(None)] * self.dim + [pos])
|
||||
# apply new values
|
||||
conds_final[i][idx_window] = conds_final[i][idx_window] * prev_weight + sub_conds_out[i][pos_window] * new_weight
|
||||
biases_final[i][idx] = bias_total + bias
|
||||
@ -293,6 +323,28 @@ def create_prepare_sampling_wrapper(model: ModelPatcher):
|
||||
)
|
||||
|
||||
|
||||
def _sampler_sample_wrapper(executor, guider, sigmas, extra_args, callback, noise, *args, **kwargs):
|
||||
model_options = extra_args.get("model_options", None)
|
||||
if model_options is None:
|
||||
raise Exception("model_options not found in sampler_sample_wrapper; this should never happen, something went wrong.")
|
||||
handler: IndexListContextHandler = model_options.get("context_handler", None)
|
||||
if handler is None:
|
||||
raise Exception("context_handler not found in sampler_sample_wrapper; this should never happen, something went wrong.")
|
||||
if not handler.freenoise:
|
||||
return executor(guider, sigmas, extra_args, callback, noise, *args, **kwargs)
|
||||
noise = apply_freenoise(noise, handler.dim, handler.context_length, handler.context_overlap, extra_args["seed"])
|
||||
|
||||
return executor(guider, sigmas, extra_args, callback, noise, *args, **kwargs)
|
||||
|
||||
|
||||
def create_sampler_sample_wrapper(model: ModelPatcher):
|
||||
model.add_wrapper_with_key(
|
||||
patcher_extension.WrappersMP.SAMPLER_SAMPLE,
|
||||
"ContextWindows_sampler_sample",
|
||||
_sampler_sample_wrapper
|
||||
)
|
||||
|
||||
|
||||
def match_weights_to_dim(weights: list[float], x_in: torch.Tensor, dim: int, device=None) -> torch.Tensor:
|
||||
total_dims = len(x_in.shape)
|
||||
weights_tensor = torch.Tensor(weights).to(device=device)
|
||||
@ -552,3 +604,29 @@ def shift_window_to_end(window: list[int], num_frames: int):
|
||||
for i in range(len(window)):
|
||||
# 2) add end_delta to each val to slide windows to end
|
||||
window[i] = window[i] + end_delta
|
||||
|
||||
|
||||
# https://github.com/Kosinkadink/ComfyUI-AnimateDiff-Evolved/blob/90fb1331201a4b29488089e4fbffc0d82cc6d0a9/animatediff/sample_settings.py#L465
|
||||
def apply_freenoise(noise: torch.Tensor, dim: int, context_length: int, context_overlap: int, seed: int):
|
||||
logger.info("Context windows: Applying FreeNoise")
|
||||
generator = torch.Generator(device='cpu').manual_seed(seed)
|
||||
latent_video_length = noise.shape[dim]
|
||||
delta = context_length - context_overlap
|
||||
|
||||
for start_idx in range(0, latent_video_length - context_length, delta):
|
||||
place_idx = start_idx + context_length
|
||||
|
||||
actual_delta = min(delta, latent_video_length - place_idx)
|
||||
if actual_delta <= 0:
|
||||
break
|
||||
|
||||
list_idx = torch.randperm(actual_delta, generator=generator, device='cpu') + start_idx
|
||||
|
||||
source_slice = [slice(None)] * noise.ndim
|
||||
source_slice[dim] = list_idx
|
||||
target_slice = [slice(None)] * noise.ndim
|
||||
target_slice[dim] = slice(place_idx, place_idx + actual_delta)
|
||||
|
||||
noise[tuple(target_slice)] = noise[tuple(source_slice)]
|
||||
|
||||
return noise
|
||||
|
||||
@ -316,11 +316,13 @@ class ControlLoraOps:
|
||||
self.bias = None
|
||||
|
||||
def forward(self, input):
|
||||
weight, bias = ops.cast_bias_weight(self, input)
|
||||
weight, bias, offload_stream = ops.cast_bias_weight(self, input, offloadable=True)
|
||||
if self.up is not None:
|
||||
return torch.nn.functional.linear(input, weight + (torch.mm(self.up.flatten(start_dim=1), self.down.flatten(start_dim=1))).reshape(self.weight.shape).type(input.dtype), bias)
|
||||
x = torch.nn.functional.linear(input, weight + (torch.mm(self.up.flatten(start_dim=1), self.down.flatten(start_dim=1))).reshape(self.weight.shape).type(input.dtype), bias)
|
||||
else:
|
||||
return torch.nn.functional.linear(input, weight, bias)
|
||||
x = torch.nn.functional.linear(input, weight, bias)
|
||||
ops.uncast_bias_weight(self, weight, bias, offload_stream)
|
||||
return x
|
||||
|
||||
class Conv2d(torch.nn.Module, ops.CastWeightBiasOp):
|
||||
def __init__(
|
||||
@ -355,12 +357,13 @@ class ControlLoraOps:
|
||||
self.down = None
|
||||
|
||||
def forward(self, input):
|
||||
weight, bias = ops.cast_bias_weight(self, input)
|
||||
weight, bias, offload_stream = ops.cast_bias_weight(self, input, offloadable=True)
|
||||
if self.up is not None:
|
||||
return torch.nn.functional.conv2d(input, weight + (torch.mm(self.up.flatten(start_dim=1), self.down.flatten(start_dim=1))).reshape(self.weight.shape).type(input.dtype), bias, self.stride, self.padding, self.dilation, self.groups)
|
||||
x = torch.nn.functional.conv2d(input, weight + (torch.mm(self.up.flatten(start_dim=1), self.down.flatten(start_dim=1))).reshape(self.weight.shape).type(input.dtype), bias, self.stride, self.padding, self.dilation, self.groups)
|
||||
else:
|
||||
return torch.nn.functional.conv2d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups)
|
||||
|
||||
x = torch.nn.functional.conv2d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups)
|
||||
ops.uncast_bias_weight(self, weight, bias, offload_stream)
|
||||
return x
|
||||
|
||||
class ControlLora(ControlNet):
|
||||
def __init__(self, control_weights, global_average_pooling=False, model_options={}): # TODO? model_options
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from ..cmd.main_pre import tracer
|
||||
|
||||
import typing
|
||||
import asyncio
|
||||
import time
|
||||
import uuid
|
||||
@ -22,7 +22,7 @@ from .server_stub import ServerStub
|
||||
from ..auth.permissions import jwt_decode
|
||||
from ..cmd.server import PromptServer
|
||||
from ..component_model.abstract_prompt_queue import AsyncAbstractPromptQueue, AbstractPromptQueue
|
||||
from ..component_model.executor_types import ExecutorToClientProgress, SendSyncEvent, SendSyncData, HistoryResultDict
|
||||
from ..component_model.executor_types import ExecutorToClientProgress, SendSyncEvent, SendSyncData, HistoryResultDict, ExecutionErrorMessage
|
||||
from ..component_model.queue_types import Flags, HistoryEntry, QueueTuple, QueueItem, ExecutionStatus, TaskInvocation, \
|
||||
ExecutionError
|
||||
|
||||
@ -163,7 +163,8 @@ class DistributedPromptQueue(AbstractPromptQueue, AsyncAbstractPromptQueue):
|
||||
|
||||
return item, item[1]
|
||||
|
||||
def task_done(self, item_id: int, outputs: dict, status: Optional[ExecutionStatus], error_details: Optional['ExecutionErrorMessage'] = None):
|
||||
def task_done(self, item_id: int, outputs: dict, status: Optional[ExecutionStatus], error_details: Optional[ExecutionErrorMessage] = None, process_item: typing.Optional[typing.Callable[[QueueTuple], QueueItem]] = None):
|
||||
# todo: should we do the process_item? it's clearing sensitive data. but what is the idea? why do things this way, it's crazy
|
||||
# callee: executed on the worker thread
|
||||
if "outputs" in outputs:
|
||||
outputs: HistoryResultDict
|
||||
|
||||
@ -5,7 +5,7 @@ from typing import Tuple, Literal, List
|
||||
|
||||
from ..api.components.schema.prompt import PromptDict, Prompt
|
||||
from ..auth.permissions import ComfyJwt, jwt_decode
|
||||
from ..component_model.queue_types import NamedQueueTuple, TaskInvocation, ExecutionStatus
|
||||
from ..component_model.queue_types import QueueDict, TaskInvocation, ExecutionStatus, QueueTuple, ExtraData
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -26,14 +26,14 @@ class DistributedBase:
|
||||
class RpcRequest(DistributedBase):
|
||||
prompt: dict | PromptDict
|
||||
|
||||
async def as_queue_tuple(self) -> NamedQueueTuple:
|
||||
async def as_queue_tuple(self) -> QueueDict:
|
||||
# this loads the nodes in this instance
|
||||
# should always be okay to call in an executor
|
||||
from ..cmd.execution import validate_prompt
|
||||
from ..component_model.make_mutable import make_mutable
|
||||
mutated_prompt_dict = make_mutable(self.prompt)
|
||||
validation_tuple = await validate_prompt(self.prompt_id, mutated_prompt_dict)
|
||||
return NamedQueueTuple(queue_tuple=(0, self.prompt_id, mutated_prompt_dict, {}, validation_tuple[2]))
|
||||
return QueueDict(queue_tuple=QueueTuple(0, self.prompt_id, mutated_prompt_dict, ExtraData(), validation_tuple[2]))
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, request_dict):
|
||||
|
||||
160
comfy/gguf.py
160
comfy/gguf.py
@ -18,6 +18,8 @@ import argparse
|
||||
import logging
|
||||
import os
|
||||
import warnings
|
||||
import numpy as np
|
||||
import re
|
||||
|
||||
import gguf
|
||||
import torch
|
||||
@ -39,6 +41,19 @@ TORCH_COMPATIBLE_QTYPES = (None, gguf.GGMLQuantizationType.F32, gguf.GGMLQuantiz
|
||||
IMG_ARCH_LIST = {"flux", "sd1", "sdxl", "sd3", "aura", "hidream", "cosmos", "ltxv", "hyvid", "wan", "qwen_image"}
|
||||
TXT_ARCH_LIST = {"t5", "t5encoder", "llama", "qwen2vl"}
|
||||
|
||||
CLIP_VISION_SD_MAP = {
|
||||
"mm.": "visual.merger.mlp.",
|
||||
"v.post_ln.": "visual.merger.ln_q.",
|
||||
"v.patch_embd": "visual.patch_embed.proj",
|
||||
"v.blk.": "visual.blocks.",
|
||||
"ffn_up": "mlp.up_proj",
|
||||
"ffn_down": "mlp.down_proj",
|
||||
"ffn_gate": "mlp.gate_proj",
|
||||
"attn_out.": "attn.proj.",
|
||||
"ln1.": "norm1.",
|
||||
"ln2.": "norm2.",
|
||||
}
|
||||
|
||||
|
||||
class ModelTemplate:
|
||||
arch = "invalid" # string describing architecture
|
||||
@ -419,7 +434,7 @@ def dequantize_tensor(tensor, dtype=None, dequant_dtype=None):
|
||||
return dequantize(tensor.data, qtype, oshape, dtype=dequant_dtype).to(dtype)
|
||||
else:
|
||||
# this is incredibly slow
|
||||
tqdm.write(f"Falling back to numpy dequant for qtype: {qtype}")
|
||||
tqdm.write(f"Falling back to numpy dequant for qtype: {getattr(qtype, 'name', repr(qtype))}")
|
||||
new = gguf.quants.dequantize(tensor.cpu().numpy(), qtype)
|
||||
return torch.from_numpy(new).to(tensor.device, dtype=dtype)
|
||||
|
||||
@ -892,6 +907,131 @@ def gguf_tokenizer_loader(path, temb_shape):
|
||||
return torch.ByteTensor(list(spm.SerializeToString()))
|
||||
|
||||
|
||||
def strip_quant_suffix(name):
|
||||
pattern = r"[-_]?(?:ud-)?i?q[0-9]_[a-z0-9_\-]{1,8}$"
|
||||
match = re.search(pattern, name, re.IGNORECASE)
|
||||
if match:
|
||||
name = name[:match.start()]
|
||||
return name
|
||||
|
||||
|
||||
def gguf_mmproj_loader(path):
|
||||
# Reverse version of Qwen2VLVisionModel.modify_tensors
|
||||
logger.info("Attempting to find mmproj file for text encoder...")
|
||||
|
||||
# get name to match w/o quant suffix
|
||||
tenc_fname = os.path.basename(path)
|
||||
tenc = os.path.splitext(tenc_fname)[0].lower()
|
||||
tenc = strip_quant_suffix(tenc)
|
||||
|
||||
# try and find matching mmproj
|
||||
target = []
|
||||
root = os.path.dirname(path)
|
||||
for fname in os.listdir(root):
|
||||
name, ext = os.path.splitext(fname)
|
||||
if ext.lower() != ".gguf":
|
||||
continue
|
||||
if "mmproj" not in name.lower():
|
||||
continue
|
||||
if tenc in name.lower():
|
||||
target.append(fname)
|
||||
|
||||
if len(target) == 0:
|
||||
logger.error(f"Error: Can't find mmproj file for '{tenc_fname}' (matching:'{tenc}')! Qwen-Image-Edit will be broken!")
|
||||
return {}
|
||||
if len(target) > 1:
|
||||
logger.error(f"Ambiguous mmproj for text encoder '{tenc_fname}', will use first match.")
|
||||
|
||||
logger.info(f"Using mmproj '{target[0]}' for text encoder '{tenc_fname}'.")
|
||||
target = os.path.join(root, target[0])
|
||||
vsd = gguf_sd_loader(target, is_text_model=True)
|
||||
|
||||
# concat 4D to 5D
|
||||
if "v.patch_embd.weight.1" in vsd:
|
||||
w1 = dequantize_tensor(vsd.pop("v.patch_embd.weight"), dtype=torch.float32)
|
||||
w2 = dequantize_tensor(vsd.pop("v.patch_embd.weight.1"), dtype=torch.float32)
|
||||
vsd["v.patch_embd.weight"] = torch.stack([w1, w2], dim=2)
|
||||
|
||||
# run main replacement
|
||||
vsd = sd_map_replace(vsd, CLIP_VISION_SD_MAP)
|
||||
|
||||
# handle split Q/K/V
|
||||
if "visual.blocks.0.attn_q.weight" in vsd:
|
||||
attns = {}
|
||||
# filter out attentions + group
|
||||
for k,v in vsd.items():
|
||||
if any(x in k for x in ["attn_q", "attn_k", "attn_v"]):
|
||||
k_attn, k_name = k.rsplit(".attn_", 1)
|
||||
k_attn += ".attn.qkv." + k_name.split(".")[-1]
|
||||
if k_attn not in attns:
|
||||
attns[k_attn] = {}
|
||||
attns[k_attn][k_name] = dequantize_tensor(
|
||||
v, dtype=(torch.bfloat16 if is_quantized(v) else torch.float16)
|
||||
)
|
||||
|
||||
# recombine
|
||||
for k,v in attns.items():
|
||||
suffix = k.split(".")[-1]
|
||||
vsd[k] = torch.cat([
|
||||
v[f"q.{suffix}"],
|
||||
v[f"k.{suffix}"],
|
||||
v[f"v.{suffix}"],
|
||||
], dim=0)
|
||||
del attns
|
||||
|
||||
return vsd
|
||||
|
||||
|
||||
def gguf_tekken_tokenizer_loader(path, temb_shape):
|
||||
# convert ggml (hf) tokenizer metadata to tekken/comfy data
|
||||
logger.info("Attempting to recreate tekken tokenizer from GGUF file metadata...")
|
||||
import json
|
||||
import base64
|
||||
try:
|
||||
from transformers.convert_slow_tokenizer import bytes_to_unicode
|
||||
except ImportError:
|
||||
try:
|
||||
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
|
||||
except ImportError:
|
||||
raise NotImplementedError("Could not import bytes_to_unicode from transformers")
|
||||
|
||||
reader = gguf.GGUFReader(path)
|
||||
|
||||
model_str = get_field(reader, "tokenizer.ggml.model", str)
|
||||
if model_str == "gpt2":
|
||||
if temb_shape == (131072, 5120): # probably Mistral
|
||||
data = {
|
||||
"config": {"num_vocab_tokens": 150000, "default_vocab_size": 131072},
|
||||
"vocab": [],
|
||||
"special_tokens": [],
|
||||
}
|
||||
else:
|
||||
raise NotImplementedError("Unknown model, can't set tokenizer!")
|
||||
else:
|
||||
raise NotImplementedError("Unknown model, can't set tokenizer!")
|
||||
|
||||
tokens = get_list_field(reader, "tokenizer.ggml.tokens", str)
|
||||
toktypes = get_list_field(reader, "tokenizer.ggml.token_type", int)
|
||||
|
||||
decoder = {v: k for k, v in bytes_to_unicode().items()}
|
||||
for idx, (token, toktype) in enumerate(zip(tokens, toktypes)):
|
||||
if toktype == 3:
|
||||
data["special_tokens"].append(
|
||||
{'rank': idx, 'token_str': token, 'is_control': True}
|
||||
)
|
||||
else:
|
||||
tok = bytes([decoder[char] for char in token])
|
||||
data["vocab"].append({
|
||||
"rank": len(data["vocab"]),
|
||||
"token_bytes": base64.b64encode(tok).decode("ascii"),
|
||||
"token_str": tok.decode("utf-8", errors="replace") # ?
|
||||
})
|
||||
|
||||
logger.info(f"Created tekken tokenizer with vocab size of {len(data['vocab'])} (+{len(data['special_tokens'])})")
|
||||
del reader
|
||||
return torch.ByteTensor(list(json.dumps(data).encode('utf-8')))
|
||||
|
||||
|
||||
def gguf_clip_loader(path):
|
||||
sd, arch = gguf_sd_loader(path, return_arch=True, is_text_model=True)
|
||||
if arch in {"t5", "t5encoder"}:
|
||||
@ -907,12 +1047,18 @@ def gguf_clip_loader(path):
|
||||
# TODO: pass model_options["vocab_size"] to loader somehow
|
||||
temb_key = "token_embd.weight"
|
||||
if temb_key in sd and sd[temb_key].shape[0] >= (64 * 1024):
|
||||
if arch == "llama" and sd[temb_key].shape == (131072, 5120):
|
||||
# non-standard Comfy-Org tokenizer
|
||||
sd["tekken_model"] = gguf_tekken_tokenizer_loader(path, sd[temb_key].shape)
|
||||
# See note above for T5.
|
||||
logger.warning(f"Dequantizing {temb_key} to prevent runtime OOM.")
|
||||
sd[temb_key] = dequantize_tensor(sd[temb_key], dtype=torch.float16)
|
||||
sd = sd_map_replace(sd, LLAMA_SD_MAP)
|
||||
if arch == "llama":
|
||||
sd = llama_permute(sd, 32, 8) # L3
|
||||
sd = llama_permute(sd, 32, 8) # L3 / Mistral
|
||||
if arch == "qwen2vl":
|
||||
vsd = gguf_mmproj_loader(path)
|
||||
sd.update(vsd)
|
||||
else:
|
||||
pass
|
||||
return sd
|
||||
@ -1072,7 +1218,7 @@ class GGMLLayer(torch.nn.Module):
|
||||
# Take into account space required for dequantizing the largest tensor
|
||||
if self.largest_layer:
|
||||
shape = getattr(self.weight, "tensor_shape", self.weight.shape)
|
||||
dtype = self.dequant_dtype or torch.float16
|
||||
dtype = self.dequant_dtype if self.dequant_dtype and self.dequant_dtype != "target" else torch.float16
|
||||
temp = torch.empty(*shape, device=torch.device("meta"), dtype=dtype)
|
||||
destination[prefix + "temp.weight"] = temp
|
||||
|
||||
@ -1106,7 +1252,7 @@ class GGMLLayer(torch.nn.Module):
|
||||
return weight
|
||||
|
||||
@torch_compiler_disable()
|
||||
def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None):
|
||||
def cast_bias_weight(self, input=None, dtype=None, device=None, bias_dtype=None):
|
||||
if input is not None:
|
||||
if dtype is None:
|
||||
dtype = getattr(input, "dtype", torch.float32)
|
||||
@ -1117,11 +1263,11 @@ class GGMLLayer(torch.nn.Module):
|
||||
|
||||
bias = None
|
||||
non_blocking = device_supports_non_blocking(device)
|
||||
if s.bias is not None:
|
||||
bias = s.get_weight(s.bias.to(device), dtype)
|
||||
if self.bias is not None:
|
||||
bias = self.get_weight(self.bias.to(device), dtype)
|
||||
bias = cast_to(bias, bias_dtype, device, non_blocking=non_blocking, copy=False)
|
||||
|
||||
weight = s.get_weight(s.weight.to(device), dtype)
|
||||
weight = self.get_weight(self.weight.to(device), dtype)
|
||||
weight = cast_to(weight, dtype, device, non_blocking=non_blocking, copy=False)
|
||||
return weight, bias
|
||||
|
||||
|
||||
@ -50,19 +50,6 @@ class TransformerStreamedProgress(TypedDict):
|
||||
next_token: str
|
||||
|
||||
|
||||
LLaVAProcessor = Callable[
|
||||
[
|
||||
Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]], # text parameter
|
||||
Union[Image, np.ndarray, torch.Tensor, List[Image], List[np.ndarray], List[torch.Tensor]], # images parameter
|
||||
Union[bool, str, PaddingStrategy], # padding parameter
|
||||
Union[bool, str, TruncationStrategy], # truncation parameter
|
||||
Optional[int], # max_length parameter
|
||||
Optional[Union[str, TensorType]] # return_tensors parameter
|
||||
],
|
||||
BatchFeature
|
||||
]
|
||||
|
||||
|
||||
class LanguageMessage(TypedDict):
|
||||
role: Literal["system", "user", "assistant"]
|
||||
content: str | MessageContent
|
||||
@ -92,14 +79,13 @@ class LanguageModel(Protocol):
|
||||
|
||||
def generate(self, tokens: TOKENS_TYPE = None,
|
||||
max_new_tokens: int = 512,
|
||||
repetition_penalty: float = 0.0,
|
||||
seed: int = 0,
|
||||
sampler: Optional[GENERATION_KWARGS_TYPE] = None,
|
||||
*args,
|
||||
**kwargs) -> str:
|
||||
...
|
||||
|
||||
def tokenize(self, prompt: str | LanguagePrompt, images: RGBImageBatch | None, chat_template: str | None = None) -> ProcessorResult:
|
||||
def tokenize(self, prompt: str | LanguagePrompt, images: RGBImageBatch | None, videos: list[torch.Tensor] | None, chat_template: str | None = None) -> ProcessorResult:
|
||||
...
|
||||
|
||||
@property
|
||||
|
||||
@ -12,19 +12,17 @@ from typing import Optional, Any, Callable
|
||||
|
||||
import torch
|
||||
import transformers
|
||||
from huggingface_hub.errors import EntryNotFoundError
|
||||
from transformers import PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, AutoProcessor, AutoTokenizer, \
|
||||
BatchFeature, AutoModelForVision2Seq, AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoModel, \
|
||||
BatchFeature, AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoModel, \
|
||||
PretrainedConfig, TextStreamer, LogitsProcessor
|
||||
from huggingface_hub import hf_api
|
||||
from huggingface_hub.file_download import hf_hub_download
|
||||
from transformers.models.auto.modeling_auto import MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES, \
|
||||
MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
|
||||
MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, AutoModelForImageTextToText
|
||||
|
||||
from .chat_templates import KNOWN_CHAT_TEMPLATES
|
||||
from .language_types import ProcessorResult, TOKENS_TYPE, GENERATION_KWARGS_TYPE, TransformerStreamedProgress, \
|
||||
LLaVAProcessor, LanguageModel, LanguagePrompt
|
||||
LanguageModel, LanguagePrompt
|
||||
from .. import model_management
|
||||
from ..cli_args import args
|
||||
from ..component_model.tensor_types import RGBImageBatch
|
||||
from ..model_downloader import get_or_download_huggingface_repo
|
||||
from ..model_management import unet_offload_device, get_torch_device, unet_dtype, load_models_gpu
|
||||
@ -62,11 +60,93 @@ class TransformersManagedModel(ModelManageableStub, LanguageModel):
|
||||
self._on_set_processor(self._processor)
|
||||
self._model_type = ""
|
||||
self._original_transformers_managed_model: weakref.ReferenceType["TransformersManagedModel"] = weakref.ref(self)
|
||||
self.wrappers = {}
|
||||
self.callbacks = {}
|
||||
self._hook_mode = None
|
||||
self._model_options = {"transformer_options": {}}
|
||||
|
||||
if model.device != self.offload_device:
|
||||
model.to(device=self.offload_device)
|
||||
|
||||
@property
|
||||
def hook_mode(self):
|
||||
from ..hooks import EnumHookMode
|
||||
if self._hook_mode is None:
|
||||
self._hook_mode = EnumHookMode.MaxSpeed
|
||||
return self._hook_mode
|
||||
|
||||
@hook_mode.setter
|
||||
def hook_mode(self, value):
|
||||
self._hook_mode = value
|
||||
|
||||
def prepare_hook_patches_current_keyframe(self, t, hook_group, model_options):
|
||||
return
|
||||
|
||||
def model_patches_models(self):
|
||||
return []
|
||||
|
||||
def restore_hook_patches(self):
|
||||
return
|
||||
|
||||
def cleanup(self):
|
||||
pass
|
||||
|
||||
def pre_run(self):
|
||||
pass
|
||||
|
||||
def prepare_state(self, *args, **kwargs):
|
||||
pass
|
||||
|
||||
def register_all_hook_patches(self, a, b, c, d):
|
||||
pass
|
||||
|
||||
def get_nested_additional_models(self):
|
||||
return []
|
||||
|
||||
def apply_hooks(self, *args, **kwargs):
|
||||
return {}
|
||||
|
||||
def add_wrapper(self, wrapper_type: str, wrapper: Callable):
|
||||
self.add_wrapper_with_key(wrapper_type, None, wrapper)
|
||||
|
||||
def add_wrapper_with_key(self, wrapper_type: str, key: str, wrapper: Callable):
|
||||
w = self.wrappers.setdefault(wrapper_type, {}).setdefault(key, [])
|
||||
w.append(wrapper)
|
||||
|
||||
def remove_wrappers_with_key(self, wrapper_type: str, key: str):
|
||||
w = self.wrappers.get(wrapper_type, {})
|
||||
if key in w:
|
||||
w.pop(key)
|
||||
|
||||
def get_wrappers_with_key(self, wrapper_type: str, key: str):
|
||||
w_list = []
|
||||
w_list.extend(self.wrappers.get(wrapper_type, {}).get(key, []))
|
||||
return w_list
|
||||
|
||||
def get_all_wrappers(self, wrapper_type: str):
|
||||
w_list = []
|
||||
for w in self.wrappers.get(wrapper_type, {}).values():
|
||||
w_list.extend(w)
|
||||
return w_list
|
||||
|
||||
@property
|
||||
def model_options(self):
|
||||
return self._model_options
|
||||
|
||||
@model_options.setter
|
||||
def model_options(self, value):
|
||||
self._model_options = value
|
||||
|
||||
@property
|
||||
def diffusion_model(self):
|
||||
return self.model
|
||||
|
||||
@diffusion_model.setter
|
||||
def diffusion_model(self, value):
|
||||
self.add_object_patch("model", value)
|
||||
|
||||
@staticmethod
|
||||
def from_pretrained(ckpt_name: str, subfolder: Optional[str] = None, config_dict: PretrainedConfig | dict | None = None) -> "TransformersManagedModel":
|
||||
def from_pretrained(ckpt_name: str, subfolder: Optional[str] = None, config_dict: PretrainedConfig | dict | None = None, **kwargs) -> "TransformersManagedModel":
|
||||
hub_kwargs = {}
|
||||
if subfolder is not None and subfolder.strip() != "":
|
||||
hub_kwargs["subfolder"] = subfolder
|
||||
@ -89,7 +169,8 @@ class TransformersManagedModel(ModelManageableStub, LanguageModel):
|
||||
|
||||
from_pretrained_kwargs = {
|
||||
"pretrained_model_name_or_path": ckpt_name,
|
||||
**hub_kwargs
|
||||
**hub_kwargs,
|
||||
**kwargs,
|
||||
}
|
||||
|
||||
# language models prefer to use bfloat16 over float16
|
||||
@ -122,7 +203,7 @@ class TransformersManagedModel(ModelManageableStub, LanguageModel):
|
||||
for i, kwargs_to_try in enumerate(kwargses_to_try):
|
||||
try:
|
||||
if model_type in MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES:
|
||||
model = AutoModelForVision2Seq.from_pretrained(**from_pretrained_kwargs, **kwargs_to_try)
|
||||
model = AutoModelForImageTextToText.from_pretrained(**from_pretrained_kwargs, **kwargs_to_try)
|
||||
elif model_type in MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES:
|
||||
model = AutoModelForSeq2SeqLM.from_pretrained(**from_pretrained_kwargs, **kwargs_to_try)
|
||||
elif model_type in _OVERRIDDEN_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
|
||||
@ -149,7 +230,16 @@ class TransformersManagedModel(ModelManageableStub, LanguageModel):
|
||||
tokenizer = processor
|
||||
processor = None
|
||||
else:
|
||||
tokenizer = getattr(processor, "tokenizer") if processor is not None and hasattr(processor, "tokenizer") else AutoTokenizer.from_pretrained(ckpt_name, **hub_kwargs, **kwargs_to_try)
|
||||
try:
|
||||
tokenizer = getattr(processor, "tokenizer") if processor is not None and hasattr(processor, "tokenizer") else AutoTokenizer.from_pretrained(ckpt_name, **hub_kwargs, **kwargs_to_try)
|
||||
except Exception:
|
||||
try:
|
||||
tokenizer = AutoTokenizer.from_pretrained(ckpt_name, use_fast=True, legacy=False, **hub_kwargs, **kwargs_to_try)
|
||||
except Exception:
|
||||
if repo_id != ckpt_name:
|
||||
tokenizer = AutoTokenizer.from_pretrained(repo_id, use_fast=True, legacy=False, **hub_kwargs, **kwargs_to_try)
|
||||
else:
|
||||
raise
|
||||
if tokenizer is not None or processor is not None:
|
||||
break
|
||||
except Exception as exc_info:
|
||||
@ -176,7 +266,6 @@ class TransformersManagedModel(ModelManageableStub, LanguageModel):
|
||||
|
||||
def generate(self, tokens: TOKENS_TYPE = None,
|
||||
max_new_tokens: int = 512,
|
||||
repetition_penalty: float = 0.0,
|
||||
seed: int = 0,
|
||||
sampler: Optional[GENERATION_KWARGS_TYPE] = None,
|
||||
*args,
|
||||
@ -253,13 +342,22 @@ class TransformersManagedModel(ModelManageableStub, LanguageModel):
|
||||
with seed_for_block(seed), torch.inference_mode(mode=True) if has_triton else contextlib.nullcontext():
|
||||
if hasattr(inputs, "encodings") and inputs.encodings is not None and all(hasattr(encoding, "attention_mask") for encoding in inputs.encodings) and "attention_mask" in inputs:
|
||||
inputs.pop("attention_mask")
|
||||
output_ids = transformers_model.generate(
|
||||
**inputs,
|
||||
streamer=text_streamer if num_beams <= 1 else None,
|
||||
max_new_tokens=max_new_tokens,
|
||||
repetition_penalty=repetition_penalty if repetition_penalty != 0 else None,
|
||||
**generate_kwargs
|
||||
)
|
||||
|
||||
from ..patcher_extension import WrapperExecutor, WrappersMP, get_all_wrappers
|
||||
|
||||
def _generate(inputs, streamer, max_new_tokens, **generate_kwargs):
|
||||
return transformers_model.generate(
|
||||
**inputs,
|
||||
streamer=streamer,
|
||||
max_new_tokens=max_new_tokens,
|
||||
**generate_kwargs
|
||||
)
|
||||
|
||||
output_ids = WrapperExecutor.new_class_executor(
|
||||
_generate,
|
||||
self,
|
||||
get_all_wrappers(WrappersMP.APPLY_MODEL, self.model_options)
|
||||
).execute(inputs, text_streamer if num_beams <= 1 else None, max_new_tokens, **generate_kwargs)
|
||||
|
||||
if not transformers_model.config.is_encoder_decoder:
|
||||
start_position = inputs["input_ids" if "input_ids" in inputs else "inputs"].shape[1]
|
||||
@ -292,7 +390,7 @@ class TransformersManagedModel(ModelManageableStub, LanguageModel):
|
||||
return self._tokenizer
|
||||
|
||||
@property
|
||||
def processor(self) -> AutoProcessor | ProcessorMixin | LLaVAProcessor | None:
|
||||
def processor(self) -> AutoProcessor | ProcessorMixin | None:
|
||||
return self._processor
|
||||
|
||||
@property
|
||||
@ -338,6 +436,9 @@ class TransformersManagedModel(ModelManageableStub, LanguageModel):
|
||||
return self._size
|
||||
|
||||
def model_patches_to(self, arg: torch.device | torch.dtype):
|
||||
if getattr(self.model, "is_loaded_in_4bit", False) or getattr(self.model, "is_loaded_in_8bit", False):
|
||||
return
|
||||
|
||||
if isinstance(arg, torch.device):
|
||||
self.model.to(device=arg)
|
||||
else:
|
||||
@ -364,7 +465,7 @@ class TransformersManagedModel(ModelManageableStub, LanguageModel):
|
||||
if processor is not None and hasattr(processor, "image_processor") and hasattr(processor.image_processor, "do_rescale"):
|
||||
processor.image_processor.do_rescale = False
|
||||
|
||||
def tokenize(self, prompt: str | LanguagePrompt, images: RGBImageBatch | None, chat_template: str | None = None) -> ProcessorResult:
|
||||
def tokenize(self, prompt: str | LanguagePrompt, images: RGBImageBatch | None, videos: list[torch.Tensor] | None = None, chat_template: str | None = None) -> ProcessorResult:
|
||||
tokenizer = self.processor if self.processor is not None else self.tokenizer
|
||||
assert tokenizer is not None
|
||||
assert hasattr(tokenizer, "decode")
|
||||
@ -391,16 +492,18 @@ class TransformersManagedModel(ModelManageableStub, LanguageModel):
|
||||
messages: LanguagePrompt
|
||||
if isinstance(prompt, list) and len(prompt) > 0 and isinstance(prompt[0], dict):
|
||||
messages = prompt
|
||||
elif "content[" in chat_template:
|
||||
elif images is not None and len(images) > 0 or videos is not None and len(videos) > 0:
|
||||
messages = [
|
||||
{"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": prompt
|
||||
"text": prompt if isinstance(prompt, str) else ""
|
||||
}
|
||||
] + [
|
||||
{"type": "image"} for _ in range(len(images))
|
||||
] + [
|
||||
{"type": "video"} for _ in range(len(videos))
|
||||
]
|
||||
|
||||
}
|
||||
@ -409,10 +512,25 @@ class TransformersManagedModel(ModelManageableStub, LanguageModel):
|
||||
messages = [
|
||||
{"role": "user", "content": prompt},
|
||||
]
|
||||
|
||||
prompt = tokenizer.apply_chat_template(messages, chat_template=chat_template, add_generation_prompt=True, tokenize=False)
|
||||
except Exception as exc:
|
||||
logger.debug("Could not apply chat template", exc_info=exc)
|
||||
|
||||
if isinstance(prompt, list):
|
||||
# Fallback: extract text from messages if chat template application failed or wasn't available
|
||||
extracted_text = []
|
||||
for message in prompt:
|
||||
if isinstance(message, dict) and "content" in message:
|
||||
content = message["content"]
|
||||
if isinstance(content, str):
|
||||
extracted_text.append(content)
|
||||
elif isinstance(content, list):
|
||||
for item in content:
|
||||
if isinstance(item, dict) and item.get("type") == "text":
|
||||
extracted_text.append(item.get("text", ""))
|
||||
prompt = "\n".join(extracted_text)
|
||||
|
||||
if self.processor is None and isinstance(prompt, str):
|
||||
batch_encoding = tokenizer(prompt, return_tensors="pt").to(device=self.load_device)
|
||||
return {**batch_encoding}
|
||||
@ -421,8 +539,60 @@ class TransformersManagedModel(ModelManageableStub, LanguageModel):
|
||||
self.processor.to(device=self.load_device)
|
||||
# convert tuple to list from images.unbind() for paligemma workaround
|
||||
image_tensor_list = list(images.unbind()) if images is not None and len(images) > 0 else None
|
||||
|
||||
# Convert videos to list of list of frames (uint8)
|
||||
if videos is not None and len(videos) > 0:
|
||||
new_videos = []
|
||||
for v in videos:
|
||||
# Convert to uint8 0-255 if float
|
||||
if v.dtype == torch.float32 or v.dtype == torch.float16 or v.dtype == torch.bfloat16:
|
||||
v = (v * 255).to(torch.uint8)
|
||||
# Convert (T, H, W, C) tensor to list of (H, W, C) tensors
|
||||
if v.ndim == 4:
|
||||
new_videos.append(list(v))
|
||||
else:
|
||||
new_videos.append([v]) # Fallback if not 4D
|
||||
videos = new_videos
|
||||
|
||||
# Check if processor accepts 'videos' argument
|
||||
import inspect
|
||||
processor_params = inspect.signature(self.processor).parameters
|
||||
has_videos_arg = "videos" in processor_params
|
||||
|
||||
kwargs = {
|
||||
"text": [prompt],
|
||||
"images": image_tensor_list,
|
||||
"return_tensors": "pt",
|
||||
"padding": True,
|
||||
}
|
||||
|
||||
if videos is None or len(videos) == 0:
|
||||
pass
|
||||
elif has_videos_arg:
|
||||
kwargs["videos"] = videos
|
||||
if "input_data_format" in processor_params:
|
||||
kwargs["input_data_format"] = "channels_last"
|
||||
elif videos is not None and len(videos) > 0:
|
||||
if args.enable_video_to_image_fallback:
|
||||
# Fallback: flatten video frames into images if processor doesn't support 'videos'
|
||||
# videos is List[List[Frame]] where Frame is (H, W, C)
|
||||
flattened_frames = []
|
||||
for video in videos:
|
||||
flattened_frames.extend(video)
|
||||
|
||||
# Convert list of frames to list of tensors if needed, or just append to images list
|
||||
# images is currently a list of tensors
|
||||
if kwargs["images"] is None:
|
||||
kwargs["images"] = []
|
||||
|
||||
# Ensure frames are in the same format as images (tensors)
|
||||
# Frames in videos are already tensors (uint8)
|
||||
kwargs["images"].extend(flattened_frames)
|
||||
else:
|
||||
logger.warning(f"Model {self.model.name_or_path} does not support video inputs and video-to-image fallback is disabled. Use --enable-video-to-image-fallback to enable it.")
|
||||
|
||||
try:
|
||||
batch_feature: BatchFeature = self.processor(text=[prompt], images=image_tensor_list, return_tensors="pt", padding=True)
|
||||
batch_feature: BatchFeature = self.processor(**kwargs)
|
||||
except TypeError as exc_info:
|
||||
logger.warning(f"Exception while trying to run processor. Your transformers package is version {transformers.__version__} and may need to be updated")
|
||||
raise exc_info
|
||||
|
||||
@ -7,6 +7,7 @@ class LatentFormat:
|
||||
latent_dimensions = 2
|
||||
latent_rgb_factors = None
|
||||
latent_rgb_factors_bias = None
|
||||
latent_rgb_factors_reshape = None
|
||||
taesd_decoder_name = None
|
||||
|
||||
def process_in(self, latent):
|
||||
@ -191,6 +192,55 @@ class Flux(SD3):
|
||||
return (latent / self.scale_factor) + self.shift_factor
|
||||
|
||||
|
||||
class Flux2(LatentFormat):
|
||||
latent_channels = 128
|
||||
|
||||
def __init__(self):
|
||||
self.latent_rgb_factors = [
|
||||
[0.0058, 0.0113, 0.0073],
|
||||
[0.0495, 0.0443, 0.0836],
|
||||
[-0.0099, 0.0096, 0.0644],
|
||||
[0.2144, 0.3009, 0.3652],
|
||||
[0.0166, -0.0039, -0.0054],
|
||||
[0.0157, 0.0103, -0.0160],
|
||||
[-0.0398, 0.0902, -0.0235],
|
||||
[-0.0052, 0.0095, 0.0109],
|
||||
[-0.3527, -0.2712, -0.1666],
|
||||
[-0.0301, -0.0356, -0.0180],
|
||||
[-0.0107, 0.0078, 0.0013],
|
||||
[0.0746, 0.0090, -0.0941],
|
||||
[0.0156, 0.0169, 0.0070],
|
||||
[-0.0034, -0.0040, -0.0114],
|
||||
[0.0032, 0.0181, 0.0080],
|
||||
[-0.0939, -0.0008, 0.0186],
|
||||
[0.0018, 0.0043, 0.0104],
|
||||
[0.0284, 0.0056, -0.0127],
|
||||
[-0.0024, -0.0022, -0.0030],
|
||||
[0.1207, -0.0026, 0.0065],
|
||||
[0.0128, 0.0101, 0.0142],
|
||||
[0.0137, -0.0072, -0.0007],
|
||||
[0.0095, 0.0092, -0.0059],
|
||||
[0.0000, -0.0077, -0.0049],
|
||||
[-0.0465, -0.0204, -0.0312],
|
||||
[0.0095, 0.0012, -0.0066],
|
||||
[0.0290, -0.0034, 0.0025],
|
||||
[0.0220, 0.0169, -0.0048],
|
||||
[-0.0332, -0.0457, -0.0468],
|
||||
[-0.0085, 0.0389, 0.0609],
|
||||
[-0.0076, 0.0003, -0.0043],
|
||||
[-0.0111, -0.0460, -0.0614],
|
||||
]
|
||||
|
||||
self.latent_rgb_factors_bias = [-0.0329, -0.0718, -0.0851]
|
||||
self.latent_rgb_factors_reshape = lambda t: t.reshape(t.shape[0], 32, 2, 2, t.shape[-2], t.shape[-1]).permute(0, 1, 4, 2, 5, 3).reshape(t.shape[0], 32, t.shape[-2] * 2, t.shape[-1] * 2)
|
||||
|
||||
def process_in(self, latent):
|
||||
return latent
|
||||
|
||||
def process_out(self, latent):
|
||||
return latent
|
||||
|
||||
|
||||
class Mochi(LatentFormat):
|
||||
latent_channels = 12
|
||||
latent_dimensions = 3
|
||||
@ -240,210 +290,214 @@ class LTXV(LatentFormat):
|
||||
|
||||
def __init__(self):
|
||||
self.latent_rgb_factors = [
|
||||
[ 1.1202e-02, -6.3815e-04, -1.0021e-02],
|
||||
[ 8.6031e-02, 6.5813e-02, 9.5409e-04],
|
||||
[1.1202e-02, -6.3815e-04, -1.0021e-02],
|
||||
[8.6031e-02, 6.5813e-02, 9.5409e-04],
|
||||
[-1.2576e-02, -7.5734e-03, -4.0528e-03],
|
||||
[ 9.4063e-03, -2.1688e-03, 2.6093e-03],
|
||||
[ 3.7636e-03, 1.2765e-02, 9.1548e-03],
|
||||
[ 2.1024e-02, -5.2973e-03, 3.4373e-03],
|
||||
[9.4063e-03, -2.1688e-03, 2.6093e-03],
|
||||
[3.7636e-03, 1.2765e-02, 9.1548e-03],
|
||||
[2.1024e-02, -5.2973e-03, 3.4373e-03],
|
||||
[-8.8896e-03, -1.9703e-02, -1.8761e-02],
|
||||
[-1.3160e-02, -1.0523e-02, 1.9709e-03],
|
||||
[-1.3160e-02, -1.0523e-02, 1.9709e-03],
|
||||
[-1.5152e-03, -6.9891e-03, -7.5810e-03],
|
||||
[-1.7247e-03, 4.6560e-04, -3.3839e-03],
|
||||
[ 1.3617e-02, 4.7077e-03, -2.0045e-03],
|
||||
[ 1.0256e-02, 7.7318e-03, 1.3948e-02],
|
||||
[-1.6108e-02, -6.2151e-03, 1.1561e-03],
|
||||
[ 7.3407e-03, 1.5628e-02, 4.4865e-04],
|
||||
[ 9.5357e-04, -2.9518e-03, -1.4760e-02],
|
||||
[ 1.9143e-02, 1.0868e-02, 1.2264e-02],
|
||||
[ 4.4575e-03, 3.6682e-05, -6.8508e-03],
|
||||
[-4.5681e-04, 3.2570e-03, 7.7929e-03],
|
||||
[ 3.3902e-02, 3.3405e-02, 3.7454e-02],
|
||||
[-1.7247e-03, 4.6560e-04, -3.3839e-03],
|
||||
[1.3617e-02, 4.7077e-03, -2.0045e-03],
|
||||
[1.0256e-02, 7.7318e-03, 1.3948e-02],
|
||||
[-1.6108e-02, -6.2151e-03, 1.1561e-03],
|
||||
[7.3407e-03, 1.5628e-02, 4.4865e-04],
|
||||
[9.5357e-04, -2.9518e-03, -1.4760e-02],
|
||||
[1.9143e-02, 1.0868e-02, 1.2264e-02],
|
||||
[4.4575e-03, 3.6682e-05, -6.8508e-03],
|
||||
[-4.5681e-04, 3.2570e-03, 7.7929e-03],
|
||||
[3.3902e-02, 3.3405e-02, 3.7454e-02],
|
||||
[-2.3001e-02, -2.4877e-03, -3.1033e-03],
|
||||
[ 5.0265e-02, 3.8841e-02, 3.3539e-02],
|
||||
[-4.1018e-03, -1.1095e-03, 1.5859e-03],
|
||||
[5.0265e-02, 3.8841e-02, 3.3539e-02],
|
||||
[-4.1018e-03, -1.1095e-03, 1.5859e-03],
|
||||
[-1.2689e-01, -1.3107e-01, -2.1005e-01],
|
||||
[ 2.6276e-02, 1.4189e-02, -3.5963e-03],
|
||||
[-4.8679e-03, 8.8486e-03, 7.8029e-03],
|
||||
[2.6276e-02, 1.4189e-02, -3.5963e-03],
|
||||
[-4.8679e-03, 8.8486e-03, 7.8029e-03],
|
||||
[-1.6610e-03, -4.8597e-03, -5.2060e-03],
|
||||
[-2.1010e-03, 2.3610e-03, 9.3796e-03],
|
||||
[-2.1010e-03, 2.3610e-03, 9.3796e-03],
|
||||
[-2.2482e-02, -2.1305e-02, -1.5087e-02],
|
||||
[-1.5753e-02, -1.0646e-02, -6.5083e-03],
|
||||
[-4.6975e-03, 5.0288e-03, -6.7390e-03],
|
||||
[ 1.1951e-02, 2.0712e-02, 1.6191e-02],
|
||||
[-4.6975e-03, 5.0288e-03, -6.7390e-03],
|
||||
[1.1951e-02, 2.0712e-02, 1.6191e-02],
|
||||
[-6.3704e-03, -8.4827e-03, -9.5483e-03],
|
||||
[ 7.2610e-03, -9.9326e-03, -2.2978e-02],
|
||||
[-9.1904e-04, 6.2882e-03, 9.5720e-03],
|
||||
[7.2610e-03, -9.9326e-03, -2.2978e-02],
|
||||
[-9.1904e-04, 6.2882e-03, 9.5720e-03],
|
||||
[-3.7178e-02, -3.7123e-02, -5.6713e-02],
|
||||
[-1.3373e-01, -1.0720e-01, -5.3801e-02],
|
||||
[-5.3702e-03, 8.1256e-03, 8.8397e-03],
|
||||
[-5.3702e-03, 8.1256e-03, 8.8397e-03],
|
||||
[-1.5247e-01, -2.1437e-01, -2.1843e-01],
|
||||
[ 3.1441e-02, 7.0335e-03, -9.7541e-03],
|
||||
[ 2.1528e-03, -8.9817e-03, -2.1023e-02],
|
||||
[ 3.8461e-03, -5.8957e-03, -1.5014e-02],
|
||||
[3.1441e-02, 7.0335e-03, -9.7541e-03],
|
||||
[2.1528e-03, -8.9817e-03, -2.1023e-02],
|
||||
[3.8461e-03, -5.8957e-03, -1.5014e-02],
|
||||
[-4.3470e-03, -1.2940e-02, -1.5972e-02],
|
||||
[-5.4781e-03, -1.0842e-02, -3.0204e-03],
|
||||
[-6.5347e-03, 3.0806e-03, -1.0163e-02],
|
||||
[-6.5347e-03, 3.0806e-03, -1.0163e-02],
|
||||
[-5.0414e-03, -7.1503e-03, -8.9686e-04],
|
||||
[-8.5851e-03, -2.4351e-03, 1.0674e-03],
|
||||
[-9.0016e-03, -9.6493e-03, 1.5692e-03],
|
||||
[ 5.0914e-03, 1.2099e-02, 1.9968e-02],
|
||||
[ 1.3758e-02, 1.1669e-02, 8.1958e-03],
|
||||
[-8.5851e-03, -2.4351e-03, 1.0674e-03],
|
||||
[-9.0016e-03, -9.6493e-03, 1.5692e-03],
|
||||
[5.0914e-03, 1.2099e-02, 1.9968e-02],
|
||||
[1.3758e-02, 1.1669e-02, 8.1958e-03],
|
||||
[-1.0518e-02, -1.1575e-02, -4.1307e-03],
|
||||
[-2.8410e-02, -3.1266e-02, -2.2149e-02],
|
||||
[ 2.9336e-03, 3.6511e-02, 1.8717e-02],
|
||||
[2.9336e-03, 3.6511e-02, 1.8717e-02],
|
||||
[-1.6703e-02, -1.6696e-02, -4.4529e-03],
|
||||
[ 4.8818e-02, 4.0063e-02, 8.7410e-03],
|
||||
[-1.5066e-02, -5.7328e-04, 2.9785e-03],
|
||||
[-1.7613e-02, -8.1034e-03, 1.3086e-02],
|
||||
[-9.2633e-03, 1.0803e-02, -6.3489e-03],
|
||||
[ 3.0851e-03, 4.7750e-04, 1.2347e-02],
|
||||
[4.8818e-02, 4.0063e-02, 8.7410e-03],
|
||||
[-1.5066e-02, -5.7328e-04, 2.9785e-03],
|
||||
[-1.7613e-02, -8.1034e-03, 1.3086e-02],
|
||||
[-9.2633e-03, 1.0803e-02, -6.3489e-03],
|
||||
[3.0851e-03, 4.7750e-04, 1.2347e-02],
|
||||
[-2.2785e-02, -2.3043e-02, -2.6005e-02],
|
||||
[-2.4787e-02, -1.5389e-02, -2.2104e-02],
|
||||
[-2.3572e-02, 1.0544e-03, 1.2361e-02],
|
||||
[-2.3572e-02, 1.0544e-03, 1.2361e-02],
|
||||
[-7.8915e-03, -1.2271e-03, -6.0968e-03],
|
||||
[-1.1478e-02, -1.2543e-03, 6.2679e-03],
|
||||
[-5.4229e-02, 2.6644e-02, 6.3394e-03],
|
||||
[ 4.4216e-03, -7.3338e-03, -1.0464e-02],
|
||||
[-4.5013e-03, 1.6082e-03, 1.4420e-02],
|
||||
[ 1.3673e-02, 8.8877e-03, 4.1253e-03],
|
||||
[-1.0145e-02, 9.0072e-03, 1.5695e-02],
|
||||
[-5.6234e-03, 1.1847e-03, 8.1261e-03],
|
||||
[-3.7171e-03, -5.3538e-03, 1.2590e-03],
|
||||
[ 2.9476e-02, 2.1424e-02, 3.0424e-02],
|
||||
[-1.1478e-02, -1.2543e-03, 6.2679e-03],
|
||||
[-5.4229e-02, 2.6644e-02, 6.3394e-03],
|
||||
[4.4216e-03, -7.3338e-03, -1.0464e-02],
|
||||
[-4.5013e-03, 1.6082e-03, 1.4420e-02],
|
||||
[1.3673e-02, 8.8877e-03, 4.1253e-03],
|
||||
[-1.0145e-02, 9.0072e-03, 1.5695e-02],
|
||||
[-5.6234e-03, 1.1847e-03, 8.1261e-03],
|
||||
[-3.7171e-03, -5.3538e-03, 1.2590e-03],
|
||||
[2.9476e-02, 2.1424e-02, 3.0424e-02],
|
||||
[-3.4925e-02, -2.4340e-02, -2.5316e-02],
|
||||
[-3.4127e-02, -2.2406e-02, -1.0589e-02],
|
||||
[-1.7342e-02, -1.3249e-02, -1.0719e-02],
|
||||
[-2.1478e-03, -8.6051e-03, -2.9878e-03],
|
||||
[ 1.2089e-03, -4.2391e-03, -6.8569e-03],
|
||||
[ 9.0411e-04, -6.6886e-03, -6.7547e-05],
|
||||
[ 1.6048e-02, -1.0057e-02, -2.8929e-02],
|
||||
[ 1.2290e-03, 1.0163e-02, 1.8861e-02],
|
||||
[ 1.7264e-02, 2.7257e-04, 1.3785e-02],
|
||||
[-1.3482e-02, -3.6427e-03, 6.7481e-04],
|
||||
[ 4.6782e-03, -5.2423e-03, 2.4467e-03],
|
||||
[1.2089e-03, -4.2391e-03, -6.8569e-03],
|
||||
[9.0411e-04, -6.6886e-03, -6.7547e-05],
|
||||
[1.6048e-02, -1.0057e-02, -2.8929e-02],
|
||||
[1.2290e-03, 1.0163e-02, 1.8861e-02],
|
||||
[1.7264e-02, 2.7257e-04, 1.3785e-02],
|
||||
[-1.3482e-02, -3.6427e-03, 6.7481e-04],
|
||||
[4.6782e-03, -5.2423e-03, 2.4467e-03],
|
||||
[-5.9113e-03, -6.2244e-03, -1.8162e-03],
|
||||
[ 1.5496e-02, 1.4582e-02, 1.9514e-03],
|
||||
[ 7.4958e-03, 1.5886e-03, -8.2305e-03],
|
||||
[ 1.9086e-02, 1.6360e-03, -3.9674e-03],
|
||||
[1.5496e-02, 1.4582e-02, 1.9514e-03],
|
||||
[7.4958e-03, 1.5886e-03, -8.2305e-03],
|
||||
[1.9086e-02, 1.6360e-03, -3.9674e-03],
|
||||
[-5.7021e-03, -2.7307e-03, -4.1066e-03],
|
||||
[ 1.7450e-03, 1.4602e-02, 2.5794e-02],
|
||||
[-8.2788e-04, 2.2902e-03, 4.5161e-03],
|
||||
[ 1.1632e-02, 8.9193e-03, -7.2813e-03],
|
||||
[ 7.5721e-03, 2.6784e-03, 1.1393e-02],
|
||||
[ 5.1939e-03, 3.6903e-03, 1.4049e-02],
|
||||
[1.7450e-03, 1.4602e-02, 2.5794e-02],
|
||||
[-8.2788e-04, 2.2902e-03, 4.5161e-03],
|
||||
[1.1632e-02, 8.9193e-03, -7.2813e-03],
|
||||
[7.5721e-03, 2.6784e-03, 1.1393e-02],
|
||||
[5.1939e-03, 3.6903e-03, 1.4049e-02],
|
||||
[-1.8383e-02, -2.2529e-02, -2.4477e-02],
|
||||
[ 5.8842e-04, -5.7874e-03, -1.4770e-02],
|
||||
[5.8842e-04, -5.7874e-03, -1.4770e-02],
|
||||
[-1.6125e-02, -8.6101e-03, -1.4533e-02],
|
||||
[ 2.0540e-02, 2.0729e-02, 6.4338e-03],
|
||||
[ 3.3587e-03, -1.1226e-02, -1.6444e-02],
|
||||
[-1.4742e-03, -1.0489e-02, 1.7097e-03],
|
||||
[ 2.8130e-02, 2.3546e-02, 3.2791e-02],
|
||||
[2.0540e-02, 2.0729e-02, 6.4338e-03],
|
||||
[3.3587e-03, -1.1226e-02, -1.6444e-02],
|
||||
[-1.4742e-03, -1.0489e-02, 1.7097e-03],
|
||||
[2.8130e-02, 2.3546e-02, 3.2791e-02],
|
||||
[-1.8532e-02, -1.2842e-02, -8.7756e-03],
|
||||
[-8.0533e-03, -1.0771e-02, -1.7536e-02],
|
||||
[-3.9009e-03, 1.6150e-02, 3.3359e-02],
|
||||
[-3.9009e-03, 1.6150e-02, 3.3359e-02],
|
||||
[-7.4554e-03, -1.4154e-02, -6.1910e-03],
|
||||
[ 3.4734e-03, -1.1370e-02, -1.0581e-02],
|
||||
[ 1.1476e-02, 3.9281e-03, 2.8231e-03],
|
||||
[ 7.1639e-03, -1.4741e-03, -3.8066e-03],
|
||||
[ 2.2250e-03, -8.7552e-03, -9.5719e-03],
|
||||
[ 2.4146e-02, 2.1696e-02, 2.8056e-02],
|
||||
[3.4734e-03, -1.1370e-02, -1.0581e-02],
|
||||
[1.1476e-02, 3.9281e-03, 2.8231e-03],
|
||||
[7.1639e-03, -1.4741e-03, -3.8066e-03],
|
||||
[2.2250e-03, -8.7552e-03, -9.5719e-03],
|
||||
[2.4146e-02, 2.1696e-02, 2.8056e-02],
|
||||
[-5.4365e-03, -2.4291e-02, -1.7802e-02],
|
||||
[ 7.4263e-03, 1.0510e-02, 1.2705e-02],
|
||||
[ 6.2669e-03, 6.2658e-03, 1.9211e-02],
|
||||
[ 1.6378e-02, 9.4933e-03, 6.6971e-03],
|
||||
[ 1.7173e-02, 2.3601e-02, 2.3296e-02],
|
||||
[7.4263e-03, 1.0510e-02, 1.2705e-02],
|
||||
[6.2669e-03, 6.2658e-03, 1.9211e-02],
|
||||
[1.6378e-02, 9.4933e-03, 6.6971e-03],
|
||||
[1.7173e-02, 2.3601e-02, 2.3296e-02],
|
||||
[-1.4568e-02, -9.8279e-03, -1.1556e-02],
|
||||
[ 1.4431e-02, 1.4430e-02, 6.6362e-03],
|
||||
[-6.8230e-03, 1.8863e-02, 1.4555e-02],
|
||||
[ 6.1156e-03, 3.4700e-03, -2.6662e-03],
|
||||
[1.4431e-02, 1.4430e-02, 6.6362e-03],
|
||||
[-6.8230e-03, 1.8863e-02, 1.4555e-02],
|
||||
[6.1156e-03, 3.4700e-03, -2.6662e-03],
|
||||
[-2.6983e-03, -5.9402e-03, -9.2276e-03],
|
||||
[ 1.0235e-02, 7.4173e-03, -7.6243e-03],
|
||||
[-1.3255e-02, 1.9322e-02, -9.2153e-04],
|
||||
[ 2.4222e-03, -4.8039e-03, -1.5759e-02],
|
||||
[ 2.6244e-02, 2.5951e-02, 2.0249e-02],
|
||||
[ 1.5711e-02, 1.8498e-02, 2.7407e-03],
|
||||
[-2.1714e-03, 4.7214e-03, -2.2443e-02],
|
||||
[-7.4747e-03, 7.4166e-03, 1.4430e-02],
|
||||
[-8.3906e-03, -7.9776e-03, 9.7927e-03],
|
||||
[ 3.8321e-02, 9.6622e-03, -1.9268e-02],
|
||||
[-1.4605e-02, -6.7032e-03, 3.9675e-03]
|
||||
[1.0235e-02, 7.4173e-03, -7.6243e-03],
|
||||
[-1.3255e-02, 1.9322e-02, -9.2153e-04],
|
||||
[2.4222e-03, -4.8039e-03, -1.5759e-02],
|
||||
[2.6244e-02, 2.5951e-02, 2.0249e-02],
|
||||
[1.5711e-02, 1.8498e-02, 2.7407e-03],
|
||||
[-2.1714e-03, 4.7214e-03, -2.2443e-02],
|
||||
[-7.4747e-03, 7.4166e-03, 1.4430e-02],
|
||||
[-8.3906e-03, -7.9776e-03, 9.7927e-03],
|
||||
[3.8321e-02, 9.6622e-03, -1.9268e-02],
|
||||
[-1.4605e-02, -6.7032e-03, 3.9675e-03]
|
||||
]
|
||||
|
||||
self.latent_rgb_factors_bias = [-0.0571, -0.1657, -0.2512]
|
||||
|
||||
|
||||
class HunyuanVideo(LatentFormat):
|
||||
latent_channels = 16
|
||||
latent_dimensions = 3
|
||||
scale_factor = 0.476986
|
||||
latent_rgb_factors = [
|
||||
[-0.0395, -0.0331, 0.0445],
|
||||
[ 0.0696, 0.0795, 0.0518],
|
||||
[ 0.0135, -0.0945, -0.0282],
|
||||
[ 0.0108, -0.0250, -0.0765],
|
||||
[-0.0209, 0.0032, 0.0224],
|
||||
[-0.0395, -0.0331, 0.0445],
|
||||
[0.0696, 0.0795, 0.0518],
|
||||
[0.0135, -0.0945, -0.0282],
|
||||
[0.0108, -0.0250, -0.0765],
|
||||
[-0.0209, 0.0032, 0.0224],
|
||||
[-0.0804, -0.0254, -0.0639],
|
||||
[-0.0991, 0.0271, -0.0669],
|
||||
[-0.0991, 0.0271, -0.0669],
|
||||
[-0.0646, -0.0422, -0.0400],
|
||||
[-0.0696, -0.0595, -0.0894],
|
||||
[-0.0799, -0.0208, -0.0375],
|
||||
[ 0.1166, 0.1627, 0.0962],
|
||||
[ 0.1165, 0.0432, 0.0407],
|
||||
[0.1166, 0.1627, 0.0962],
|
||||
[0.1165, 0.0432, 0.0407],
|
||||
[-0.2315, -0.1920, -0.1355],
|
||||
[-0.0270, 0.0401, -0.0821],
|
||||
[-0.0270, 0.0401, -0.0821],
|
||||
[-0.0616, -0.0997, -0.0727],
|
||||
[ 0.0249, -0.0469, -0.1703]
|
||||
[0.0249, -0.0469, -0.1703]
|
||||
]
|
||||
|
||||
latent_rgb_factors_bias = [ 0.0259, -0.0192, -0.0761]
|
||||
latent_rgb_factors_bias = [0.0259, -0.0192, -0.0761]
|
||||
taesd_decoder_name = "taehv"
|
||||
|
||||
|
||||
class Cosmos1CV8x8x8(LatentFormat):
|
||||
latent_channels = 16
|
||||
latent_dimensions = 3
|
||||
|
||||
latent_rgb_factors = [
|
||||
[ 0.1817, 0.2284, 0.2423],
|
||||
[0.1817, 0.2284, 0.2423],
|
||||
[-0.0586, -0.0862, -0.3108],
|
||||
[-0.4703, -0.4255, -0.3995],
|
||||
[ 0.0803, 0.1963, 0.1001],
|
||||
[-0.0820, -0.1050, 0.0400],
|
||||
[ 0.2511, 0.3098, 0.2787],
|
||||
[0.0803, 0.1963, 0.1001],
|
||||
[-0.0820, -0.1050, 0.0400],
|
||||
[0.2511, 0.3098, 0.2787],
|
||||
[-0.1830, -0.2117, -0.0040],
|
||||
[-0.0621, -0.2187, -0.0939],
|
||||
[ 0.3619, 0.1082, 0.1455],
|
||||
[ 0.3164, 0.3922, 0.2575],
|
||||
[ 0.1152, 0.0231, -0.0462],
|
||||
[0.3619, 0.1082, 0.1455],
|
||||
[0.3164, 0.3922, 0.2575],
|
||||
[0.1152, 0.0231, -0.0462],
|
||||
[-0.1434, -0.3609, -0.3665],
|
||||
[ 0.0635, 0.1471, 0.1680],
|
||||
[0.0635, 0.1471, 0.1680],
|
||||
[-0.3635, -0.1963, -0.3248],
|
||||
[-0.1865, 0.0365, 0.2346],
|
||||
[ 0.0447, 0.0994, 0.0881]
|
||||
[-0.1865, 0.0365, 0.2346],
|
||||
[0.0447, 0.0994, 0.0881]
|
||||
]
|
||||
|
||||
latent_rgb_factors_bias = [-0.1223, -0.1889, -0.1976]
|
||||
|
||||
|
||||
class Wan21(LatentFormat):
|
||||
latent_channels = 16
|
||||
latent_dimensions = 3
|
||||
|
||||
latent_rgb_factors = [
|
||||
[-0.1299, -0.1692, 0.2932],
|
||||
[ 0.0671, 0.0406, 0.0442],
|
||||
[ 0.3568, 0.2548, 0.1747],
|
||||
[ 0.0372, 0.2344, 0.1420],
|
||||
[ 0.0313, 0.0189, -0.0328],
|
||||
[ 0.0296, -0.0956, -0.0665],
|
||||
[-0.3477, -0.4059, -0.2925],
|
||||
[ 0.0166, 0.1902, 0.1975],
|
||||
[-0.0412, 0.0267, -0.1364],
|
||||
[-0.1293, 0.0740, 0.1636],
|
||||
[ 0.0680, 0.3019, 0.1128],
|
||||
[ 0.0032, 0.0581, 0.0639],
|
||||
[-0.1251, 0.0927, 0.1699],
|
||||
[ 0.0060, -0.0633, 0.0005],
|
||||
[ 0.3477, 0.2275, 0.2950],
|
||||
[ 0.1984, 0.0913, 0.1861]
|
||||
]
|
||||
[-0.1299, -0.1692, 0.2932],
|
||||
[0.0671, 0.0406, 0.0442],
|
||||
[0.3568, 0.2548, 0.1747],
|
||||
[0.0372, 0.2344, 0.1420],
|
||||
[0.0313, 0.0189, -0.0328],
|
||||
[0.0296, -0.0956, -0.0665],
|
||||
[-0.3477, -0.4059, -0.2925],
|
||||
[0.0166, 0.1902, 0.1975],
|
||||
[-0.0412, 0.0267, -0.1364],
|
||||
[-0.1293, 0.0740, 0.1636],
|
||||
[0.0680, 0.3019, 0.1128],
|
||||
[0.0032, 0.0581, 0.0639],
|
||||
[-0.1251, 0.0927, 0.1699],
|
||||
[0.0060, -0.0633, 0.0005],
|
||||
[0.3477, 0.2275, 0.2950],
|
||||
[0.1984, 0.0913, 0.1861]
|
||||
]
|
||||
|
||||
latent_rgb_factors_bias = [-0.1835, -0.0868, -0.3360]
|
||||
|
||||
@ -458,8 +512,7 @@ class Wan21(LatentFormat):
|
||||
3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160
|
||||
]).view(1, self.latent_channels, 1, 1, 1)
|
||||
|
||||
|
||||
self.taesd_decoder_name = None #TODO
|
||||
self.taesd_decoder_name = "lighttaew2_1"
|
||||
|
||||
def process_in(self, latent):
|
||||
latents_mean = self.latents_mean.to(latent.device, latent.dtype)
|
||||
@ -471,81 +524,84 @@ class Wan21(LatentFormat):
|
||||
latents_std = self.latents_std.to(latent.device, latent.dtype)
|
||||
return latent * latents_std / self.scale_factor + latents_mean
|
||||
|
||||
|
||||
class Wan22(Wan21):
|
||||
latent_channels = 48
|
||||
latent_dimensions = 3
|
||||
|
||||
latent_rgb_factors = [
|
||||
[ 0.0119, 0.0103, 0.0046],
|
||||
[-0.1062, -0.0504, 0.0165],
|
||||
[ 0.0140, 0.0409, 0.0491],
|
||||
[-0.0813, -0.0677, 0.0607],
|
||||
[ 0.0656, 0.0851, 0.0808],
|
||||
[ 0.0264, 0.0463, 0.0912],
|
||||
[ 0.0295, 0.0326, 0.0590],
|
||||
[-0.0244, -0.0270, 0.0025],
|
||||
[ 0.0443, -0.0102, 0.0288],
|
||||
[-0.0465, -0.0090, -0.0205],
|
||||
[ 0.0359, 0.0236, 0.0082],
|
||||
[-0.0776, 0.0854, 0.1048],
|
||||
[ 0.0564, 0.0264, 0.0561],
|
||||
[ 0.0006, 0.0594, 0.0418],
|
||||
[-0.0319, -0.0542, -0.0637],
|
||||
[-0.0268, 0.0024, 0.0260],
|
||||
[ 0.0539, 0.0265, 0.0358],
|
||||
[-0.0359, -0.0312, -0.0287],
|
||||
[-0.0285, -0.1032, -0.1237],
|
||||
[ 0.1041, 0.0537, 0.0622],
|
||||
[-0.0086, -0.0374, -0.0051],
|
||||
[ 0.0390, 0.0670, 0.2863],
|
||||
[ 0.0069, 0.0144, 0.0082],
|
||||
[ 0.0006, -0.0167, 0.0079],
|
||||
[ 0.0313, -0.0574, -0.0232],
|
||||
[-0.1454, -0.0902, -0.0481],
|
||||
[ 0.0714, 0.0827, 0.0447],
|
||||
[-0.0304, -0.0574, -0.0196],
|
||||
[ 0.0401, 0.0384, 0.0204],
|
||||
[-0.0758, -0.0297, -0.0014],
|
||||
[ 0.0568, 0.1307, 0.1372],
|
||||
[-0.0055, -0.0310, -0.0380],
|
||||
[ 0.0239, -0.0305, 0.0325],
|
||||
[-0.0663, -0.0673, -0.0140],
|
||||
[-0.0416, -0.0047, -0.0023],
|
||||
[ 0.0166, 0.0112, -0.0093],
|
||||
[-0.0211, 0.0011, 0.0331],
|
||||
[ 0.1833, 0.1466, 0.2250],
|
||||
[-0.0368, 0.0370, 0.0295],
|
||||
[-0.3441, -0.3543, -0.2008],
|
||||
[-0.0479, -0.0489, -0.0420],
|
||||
[-0.0660, -0.0153, 0.0800],
|
||||
[-0.0101, 0.0068, 0.0156],
|
||||
[-0.0690, -0.0452, -0.0927],
|
||||
[-0.0145, 0.0041, 0.0015],
|
||||
[ 0.0421, 0.0451, 0.0373],
|
||||
[ 0.0504, -0.0483, -0.0356],
|
||||
[-0.0837, 0.0168, 0.0055]
|
||||
]
|
||||
[0.0119, 0.0103, 0.0046],
|
||||
[-0.1062, -0.0504, 0.0165],
|
||||
[0.0140, 0.0409, 0.0491],
|
||||
[-0.0813, -0.0677, 0.0607],
|
||||
[0.0656, 0.0851, 0.0808],
|
||||
[0.0264, 0.0463, 0.0912],
|
||||
[0.0295, 0.0326, 0.0590],
|
||||
[-0.0244, -0.0270, 0.0025],
|
||||
[0.0443, -0.0102, 0.0288],
|
||||
[-0.0465, -0.0090, -0.0205],
|
||||
[0.0359, 0.0236, 0.0082],
|
||||
[-0.0776, 0.0854, 0.1048],
|
||||
[0.0564, 0.0264, 0.0561],
|
||||
[0.0006, 0.0594, 0.0418],
|
||||
[-0.0319, -0.0542, -0.0637],
|
||||
[-0.0268, 0.0024, 0.0260],
|
||||
[0.0539, 0.0265, 0.0358],
|
||||
[-0.0359, -0.0312, -0.0287],
|
||||
[-0.0285, -0.1032, -0.1237],
|
||||
[0.1041, 0.0537, 0.0622],
|
||||
[-0.0086, -0.0374, -0.0051],
|
||||
[0.0390, 0.0670, 0.2863],
|
||||
[0.0069, 0.0144, 0.0082],
|
||||
[0.0006, -0.0167, 0.0079],
|
||||
[0.0313, -0.0574, -0.0232],
|
||||
[-0.1454, -0.0902, -0.0481],
|
||||
[0.0714, 0.0827, 0.0447],
|
||||
[-0.0304, -0.0574, -0.0196],
|
||||
[0.0401, 0.0384, 0.0204],
|
||||
[-0.0758, -0.0297, -0.0014],
|
||||
[0.0568, 0.1307, 0.1372],
|
||||
[-0.0055, -0.0310, -0.0380],
|
||||
[0.0239, -0.0305, 0.0325],
|
||||
[-0.0663, -0.0673, -0.0140],
|
||||
[-0.0416, -0.0047, -0.0023],
|
||||
[0.0166, 0.0112, -0.0093],
|
||||
[-0.0211, 0.0011, 0.0331],
|
||||
[0.1833, 0.1466, 0.2250],
|
||||
[-0.0368, 0.0370, 0.0295],
|
||||
[-0.3441, -0.3543, -0.2008],
|
||||
[-0.0479, -0.0489, -0.0420],
|
||||
[-0.0660, -0.0153, 0.0800],
|
||||
[-0.0101, 0.0068, 0.0156],
|
||||
[-0.0690, -0.0452, -0.0927],
|
||||
[-0.0145, 0.0041, 0.0015],
|
||||
[0.0421, 0.0451, 0.0373],
|
||||
[0.0504, -0.0483, -0.0356],
|
||||
[-0.0837, 0.0168, 0.0055]
|
||||
]
|
||||
|
||||
latent_rgb_factors_bias = [0.0317, -0.0878, -0.1388]
|
||||
|
||||
def __init__(self):
|
||||
self.scale_factor = 1.0
|
||||
self.taesd_decoder_name = "lighttaew2_2"
|
||||
self.latents_mean = torch.tensor([
|
||||
-0.2289, -0.0052, -0.1323, -0.2339, -0.2799, 0.0174, 0.1838, 0.1557,
|
||||
-0.1382, 0.0542, 0.2813, 0.0891, 0.1570, -0.0098, 0.0375, -0.1825,
|
||||
-0.2246, -0.1207, -0.0698, 0.5109, 0.2665, -0.2108, -0.2158, 0.2502,
|
||||
-0.2055, -0.0322, 0.1109, 0.1567, -0.0729, 0.0899, -0.2799, -0.1230,
|
||||
-0.0313, -0.1649, 0.0117, 0.0723, -0.2839, -0.2083, -0.0520, 0.3748,
|
||||
0.0152, 0.1957, 0.1433, -0.2944, 0.3573, -0.0548, -0.1681, -0.0667,
|
||||
]).view(1, self.latent_channels, 1, 1, 1)
|
||||
-0.2289, -0.0052, -0.1323, -0.2339, -0.2799, 0.0174, 0.1838, 0.1557,
|
||||
-0.1382, 0.0542, 0.2813, 0.0891, 0.1570, -0.0098, 0.0375, -0.1825,
|
||||
-0.2246, -0.1207, -0.0698, 0.5109, 0.2665, -0.2108, -0.2158, 0.2502,
|
||||
-0.2055, -0.0322, 0.1109, 0.1567, -0.0729, 0.0899, -0.2799, -0.1230,
|
||||
-0.0313, -0.1649, 0.0117, 0.0723, -0.2839, -0.2083, -0.0520, 0.3748,
|
||||
0.0152, 0.1957, 0.1433, -0.2944, 0.3573, -0.0548, -0.1681, -0.0667,
|
||||
]).view(1, self.latent_channels, 1, 1, 1)
|
||||
self.latents_std = torch.tensor([
|
||||
0.4765, 1.0364, 0.4514, 1.1677, 0.5313, 0.4990, 0.4818, 0.5013,
|
||||
0.8158, 1.0344, 0.5894, 1.0901, 0.6885, 0.6165, 0.8454, 0.4978,
|
||||
0.5759, 0.3523, 0.7135, 0.6804, 0.5833, 1.4146, 0.8986, 0.5659,
|
||||
0.7069, 0.5338, 0.4889, 0.4917, 0.4069, 0.4999, 0.6866, 0.4093,
|
||||
0.5709, 0.6065, 0.6415, 0.4944, 0.5726, 1.2042, 0.5458, 1.6887,
|
||||
0.3971, 1.0600, 0.3943, 0.5537, 0.5444, 0.4089, 0.7468, 0.7744
|
||||
]).view(1, self.latent_channels, 1, 1, 1)
|
||||
0.4765, 1.0364, 0.4514, 1.1677, 0.5313, 0.4990, 0.4818, 0.5013,
|
||||
0.8158, 1.0344, 0.5894, 1.0901, 0.6885, 0.6165, 0.8454, 0.4978,
|
||||
0.5759, 0.3523, 0.7135, 0.6804, 0.5833, 1.4146, 0.8986, 0.5659,
|
||||
0.7069, 0.5338, 0.4889, 0.4917, 0.4069, 0.4999, 0.6866, 0.4093,
|
||||
0.5709, 0.6065, 0.6415, 0.4944, 0.5726, 1.2042, 0.5458, 1.6887,
|
||||
0.3971, 1.0600, 0.3943, 0.5537, 0.5444, 0.4089, 0.7468, 0.7744
|
||||
]).view(1, self.latent_channels, 1, 1, 1)
|
||||
|
||||
|
||||
class HunyuanImage21(LatentFormat):
|
||||
latent_channels = 64
|
||||
@ -554,105 +610,173 @@ class HunyuanImage21(LatentFormat):
|
||||
|
||||
latent_rgb_factors = [
|
||||
[-0.0154, -0.0397, -0.0521],
|
||||
[ 0.0005, 0.0093, 0.0006],
|
||||
[0.0005, 0.0093, 0.0006],
|
||||
[-0.0805, -0.0773, -0.0586],
|
||||
[-0.0494, -0.0487, -0.0498],
|
||||
[-0.0212, -0.0076, -0.0261],
|
||||
[-0.0179, -0.0417, -0.0505],
|
||||
[ 0.0158, 0.0310, 0.0239],
|
||||
[ 0.0409, 0.0516, 0.0201],
|
||||
[ 0.0350, 0.0553, 0.0036],
|
||||
[0.0158, 0.0310, 0.0239],
|
||||
[0.0409, 0.0516, 0.0201],
|
||||
[0.0350, 0.0553, 0.0036],
|
||||
[-0.0447, -0.0327, -0.0479],
|
||||
[-0.0038, -0.0221, -0.0365],
|
||||
[-0.0423, -0.0718, -0.0654],
|
||||
[ 0.0039, 0.0368, 0.0104],
|
||||
[ 0.0655, 0.0217, 0.0122],
|
||||
[ 0.0490, 0.1638, 0.2053],
|
||||
[ 0.0932, 0.0829, 0.0650],
|
||||
[0.0039, 0.0368, 0.0104],
|
||||
[0.0655, 0.0217, 0.0122],
|
||||
[0.0490, 0.1638, 0.2053],
|
||||
[0.0932, 0.0829, 0.0650],
|
||||
[-0.0186, -0.0209, -0.0135],
|
||||
[-0.0080, -0.0076, -0.0148],
|
||||
[-0.0284, -0.0201, 0.0011],
|
||||
[-0.0284, -0.0201, 0.0011],
|
||||
[-0.0642, -0.0294, -0.0777],
|
||||
[-0.0035, 0.0076, -0.0140],
|
||||
[ 0.0519, 0.0731, 0.0887],
|
||||
[-0.0102, 0.0095, 0.0704],
|
||||
[ 0.0068, 0.0218, -0.0023],
|
||||
[-0.0035, 0.0076, -0.0140],
|
||||
[0.0519, 0.0731, 0.0887],
|
||||
[-0.0102, 0.0095, 0.0704],
|
||||
[0.0068, 0.0218, -0.0023],
|
||||
[-0.0726, -0.0486, -0.0519],
|
||||
[ 0.0260, 0.0295, 0.0263],
|
||||
[ 0.0250, 0.0333, 0.0341],
|
||||
[ 0.0168, -0.0120, -0.0174],
|
||||
[ 0.0226, 0.1037, 0.0114],
|
||||
[ 0.2577, 0.1906, 0.1604],
|
||||
[0.0260, 0.0295, 0.0263],
|
||||
[0.0250, 0.0333, 0.0341],
|
||||
[0.0168, -0.0120, -0.0174],
|
||||
[0.0226, 0.1037, 0.0114],
|
||||
[0.2577, 0.1906, 0.1604],
|
||||
[-0.0646, -0.0137, -0.0018],
|
||||
[-0.0112, 0.0309, 0.0358],
|
||||
[-0.0347, 0.0146, -0.0481],
|
||||
[ 0.0234, 0.0179, 0.0201],
|
||||
[ 0.0157, 0.0313, 0.0225],
|
||||
[ 0.0423, 0.0675, 0.0524],
|
||||
[-0.0031, 0.0027, -0.0255],
|
||||
[ 0.0447, 0.0555, 0.0330],
|
||||
[-0.0152, 0.0103, 0.0299],
|
||||
[-0.0112, 0.0309, 0.0358],
|
||||
[-0.0347, 0.0146, -0.0481],
|
||||
[0.0234, 0.0179, 0.0201],
|
||||
[0.0157, 0.0313, 0.0225],
|
||||
[0.0423, 0.0675, 0.0524],
|
||||
[-0.0031, 0.0027, -0.0255],
|
||||
[0.0447, 0.0555, 0.0330],
|
||||
[-0.0152, 0.0103, 0.0299],
|
||||
[-0.0755, -0.0489, -0.0635],
|
||||
[ 0.0853, 0.0788, 0.1017],
|
||||
[0.0853, 0.0788, 0.1017],
|
||||
[-0.0272, -0.0294, -0.0471],
|
||||
[ 0.0440, 0.0400, -0.0137],
|
||||
[ 0.0335, 0.0317, -0.0036],
|
||||
[0.0440, 0.0400, -0.0137],
|
||||
[0.0335, 0.0317, -0.0036],
|
||||
[-0.0344, -0.0621, -0.0984],
|
||||
[-0.0127, -0.0630, -0.0620],
|
||||
[-0.0648, 0.0360, 0.0924],
|
||||
[-0.0648, 0.0360, 0.0924],
|
||||
[-0.0781, -0.0801, -0.0409],
|
||||
[ 0.0363, 0.0613, 0.0499],
|
||||
[ 0.0238, 0.0034, 0.0041],
|
||||
[-0.0135, 0.0258, 0.0310],
|
||||
[ 0.0614, 0.1086, 0.0589],
|
||||
[ 0.0428, 0.0350, 0.0205],
|
||||
[ 0.0153, 0.0173, -0.0018],
|
||||
[0.0363, 0.0613, 0.0499],
|
||||
[0.0238, 0.0034, 0.0041],
|
||||
[-0.0135, 0.0258, 0.0310],
|
||||
[0.0614, 0.1086, 0.0589],
|
||||
[0.0428, 0.0350, 0.0205],
|
||||
[0.0153, 0.0173, -0.0018],
|
||||
[-0.0288, -0.0455, -0.0091],
|
||||
[ 0.0344, 0.0109, -0.0157],
|
||||
[0.0344, 0.0109, -0.0157],
|
||||
[-0.0205, -0.0247, -0.0187],
|
||||
[ 0.0487, 0.0126, 0.0064],
|
||||
[-0.0220, -0.0013, 0.0074],
|
||||
[0.0487, 0.0126, 0.0064],
|
||||
[-0.0220, -0.0013, 0.0074],
|
||||
[-0.0203, -0.0094, -0.0048],
|
||||
[-0.0719, 0.0429, -0.0442],
|
||||
[ 0.1042, 0.0497, 0.0356],
|
||||
[-0.0719, 0.0429, -0.0442],
|
||||
[0.1042, 0.0497, 0.0356],
|
||||
[-0.0659, -0.0578, -0.0280],
|
||||
[-0.0060, -0.0322, -0.0234]]
|
||||
|
||||
latent_rgb_factors_bias = [0.0007, -0.0256, -0.0206]
|
||||
|
||||
|
||||
class HunyuanImage21Refiner(LatentFormat):
|
||||
latent_channels = 64
|
||||
latent_dimensions = 3
|
||||
scale_factor = 1.03682
|
||||
|
||||
def process_in(self, latent):
|
||||
out = latent * self.scale_factor
|
||||
out = torch.cat((out[:, :, :1], out), dim=2)
|
||||
out = out.permute(0, 2, 1, 3, 4)
|
||||
b, f_times_2, c, h, w = out.shape
|
||||
out = out.reshape(b, f_times_2 // 2, 2 * c, h, w)
|
||||
out = out.permute(0, 2, 1, 3, 4).contiguous()
|
||||
return out
|
||||
|
||||
def process_out(self, latent):
|
||||
z = latent / self.scale_factor
|
||||
z = z.permute(0, 2, 1, 3, 4)
|
||||
b, f, c, h, w = z.shape
|
||||
z = z.reshape(b, f, 2, c // 2, h, w)
|
||||
z = z.permute(0, 1, 2, 3, 4, 5).reshape(b, f * 2, c // 2, h, w)
|
||||
z = z.permute(0, 2, 1, 3, 4)
|
||||
z = z[:, :, 1:]
|
||||
return z
|
||||
|
||||
|
||||
class HunyuanVideo15(LatentFormat):
|
||||
latent_rgb_factors = [
|
||||
[0.0568, -0.0521, -0.0131],
|
||||
[0.0014, 0.0735, 0.0326],
|
||||
[0.0186, 0.0531, -0.0138],
|
||||
[-0.0031, 0.0051, 0.0288],
|
||||
[0.0110, 0.0556, 0.0432],
|
||||
[-0.0041, -0.0023, -0.0485],
|
||||
[0.0530, 0.0413, 0.0253],
|
||||
[0.0283, 0.0251, 0.0339],
|
||||
[0.0277, -0.0372, -0.0093],
|
||||
[0.0393, 0.0944, 0.1131],
|
||||
[0.0020, 0.0251, 0.0037],
|
||||
[-0.0017, 0.0012, 0.0234],
|
||||
[0.0468, 0.0436, 0.0203],
|
||||
[0.0354, 0.0439, -0.0233],
|
||||
[0.0090, 0.0123, 0.0346],
|
||||
[0.0382, 0.0029, 0.0217],
|
||||
[0.0261, -0.0300, 0.0030],
|
||||
[-0.0088, -0.0220, -0.0283],
|
||||
[-0.0272, -0.0121, -0.0363],
|
||||
[-0.0664, -0.0622, 0.0144],
|
||||
[0.0414, 0.0479, 0.0529],
|
||||
[0.0355, 0.0612, -0.0247],
|
||||
[0.0147, 0.0264, 0.0174],
|
||||
[0.0438, 0.0038, 0.0542],
|
||||
[0.0431, -0.0573, -0.0033],
|
||||
[-0.0162, -0.0211, -0.0406],
|
||||
[-0.0487, -0.0295, -0.0393],
|
||||
[0.0005, -0.0109, 0.0253],
|
||||
[0.0296, 0.0591, 0.0353],
|
||||
[0.0119, 0.0181, -0.0306],
|
||||
[-0.0085, -0.0362, 0.0229],
|
||||
[0.0005, -0.0106, 0.0242]
|
||||
]
|
||||
|
||||
latent_rgb_factors_bias = [0.0456, -0.0202, -0.0644]
|
||||
latent_channels = 32
|
||||
latent_dimensions = 3
|
||||
scale_factor = 1.03682
|
||||
taesd_decoder_name = "lighttaehy1_5"
|
||||
|
||||
|
||||
class Hunyuan3Dv2(LatentFormat):
|
||||
latent_channels = 64
|
||||
latent_dimensions = 1
|
||||
scale_factor = 0.9990943042622529
|
||||
|
||||
|
||||
class Hunyuan3Dv2_1(LatentFormat):
|
||||
scale_factor = 1.0039506158752403
|
||||
latent_channels = 64
|
||||
latent_dimensions = 1
|
||||
|
||||
|
||||
class Hunyuan3Dv2mini(LatentFormat):
|
||||
latent_channels = 64
|
||||
latent_dimensions = 1
|
||||
scale_factor = 1.0188137142395404
|
||||
|
||||
|
||||
class ACEAudio(LatentFormat):
|
||||
latent_channels = 8
|
||||
latent_dimensions = 2
|
||||
|
||||
|
||||
class ChromaRadiance(LatentFormat):
|
||||
latent_channels = 3
|
||||
|
||||
def __init__(self):
|
||||
self.latent_rgb_factors = [
|
||||
# R G B
|
||||
[ 1.0, 0.0, 0.0 ],
|
||||
[ 0.0, 1.0, 0.0 ],
|
||||
[ 0.0, 0.0, 1.0 ]
|
||||
[1.0, 0.0, 0.0],
|
||||
[0.0, 1.0, 0.0],
|
||||
[0.0, 0.0, 1.0]
|
||||
]
|
||||
|
||||
def process_in(self, latent):
|
||||
|
||||
@ -1,29 +1,33 @@
|
||||
import torch
|
||||
from torch import Tensor, nn
|
||||
|
||||
from ..flux.math import attention
|
||||
from ..flux.layers import MLPEmbedder, RMSNorm, QKNorm, SelfAttention, ModulationOut
|
||||
from ..flux.layers import (
|
||||
MLPEmbedder,
|
||||
RMSNorm,
|
||||
ModulationOut,
|
||||
)
|
||||
|
||||
# TODO: remove this in a few months
|
||||
SingleStreamBlock = None
|
||||
DoubleStreamBlock = None
|
||||
|
||||
|
||||
class ChromaModulationOut(ModulationOut):
|
||||
@classmethod
|
||||
def from_offset(cls, tensor: torch.Tensor, offset: int = 0) -> ModulationOut:
|
||||
return cls(
|
||||
shift=tensor[:, offset : offset + 1, :],
|
||||
scale=tensor[:, offset + 1 : offset + 2, :],
|
||||
gate=tensor[:, offset + 2 : offset + 3, :],
|
||||
shift=tensor[:, offset: offset + 1, :],
|
||||
scale=tensor[:, offset + 1: offset + 2, :],
|
||||
gate=tensor[:, offset + 2: offset + 3, :],
|
||||
)
|
||||
|
||||
|
||||
|
||||
|
||||
class Approximator(nn.Module):
|
||||
def __init__(self, in_dim: int, out_dim: int, hidden_dim: int, n_layers = 5, dtype=None, device=None, operations=None):
|
||||
def __init__(self, in_dim: int, out_dim: int, hidden_dim: int, n_layers=5, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
self.in_proj = operations.Linear(in_dim, hidden_dim, bias=True, dtype=dtype, device=device)
|
||||
self.layers = nn.ModuleList([MLPEmbedder(hidden_dim, hidden_dim, dtype=dtype, device=device, operations=operations) for x in range( n_layers)])
|
||||
self.norms = nn.ModuleList([RMSNorm(hidden_dim, dtype=dtype, device=device, operations=operations) for x in range( n_layers)])
|
||||
self.layers = nn.ModuleList([MLPEmbedder(hidden_dim, hidden_dim, dtype=dtype, device=device, operations=operations) for x in range(n_layers)])
|
||||
self.norms = nn.ModuleList([RMSNorm(hidden_dim, dtype=dtype, device=device, operations=operations) for x in range(n_layers)])
|
||||
self.out_proj = operations.Linear(hidden_dim, out_dim, dtype=dtype, device=device)
|
||||
|
||||
@property
|
||||
@ -42,124 +46,6 @@ class Approximator(nn.Module):
|
||||
return x
|
||||
|
||||
|
||||
class DoubleStreamBlock(nn.Module):
|
||||
def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
|
||||
mlp_hidden_dim = int(hidden_size * mlp_ratio)
|
||||
self.num_heads = num_heads
|
||||
self.hidden_size = hidden_size
|
||||
self.img_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
|
||||
self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
|
||||
|
||||
self.img_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
|
||||
self.img_mlp = nn.Sequential(
|
||||
operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
|
||||
nn.GELU(approximate="tanh"),
|
||||
operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
|
||||
)
|
||||
|
||||
self.txt_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
|
||||
self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
|
||||
|
||||
self.txt_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
|
||||
self.txt_mlp = nn.Sequential(
|
||||
operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
|
||||
nn.GELU(approximate="tanh"),
|
||||
operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
|
||||
)
|
||||
self.flipped_img_txt = flipped_img_txt
|
||||
|
||||
def forward(self, img: Tensor, txt: Tensor, pe: Tensor, vec: Tensor, attn_mask=None, transformer_options={}):
|
||||
(img_mod1, img_mod2), (txt_mod1, txt_mod2) = vec
|
||||
|
||||
# prepare image for attention
|
||||
img_modulated = torch.addcmul(img_mod1.shift, 1 + img_mod1.scale, self.img_norm1(img))
|
||||
img_qkv = self.img_attn.qkv(img_modulated)
|
||||
img_q, img_k, img_v = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
|
||||
img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
|
||||
|
||||
# prepare txt for attention
|
||||
txt_modulated = torch.addcmul(txt_mod1.shift, 1 + txt_mod1.scale, self.txt_norm1(txt))
|
||||
txt_qkv = self.txt_attn.qkv(txt_modulated)
|
||||
txt_q, txt_k, txt_v = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
|
||||
txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
|
||||
|
||||
# run actual attention
|
||||
attn = attention(torch.cat((txt_q, img_q), dim=2),
|
||||
torch.cat((txt_k, img_k), dim=2),
|
||||
torch.cat((txt_v, img_v), dim=2),
|
||||
pe=pe, mask=attn_mask, transformer_options=transformer_options)
|
||||
|
||||
txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]
|
||||
|
||||
# calculate the img bloks
|
||||
img.addcmul_(img_mod1.gate, self.img_attn.proj(img_attn))
|
||||
img.addcmul_(img_mod2.gate, self.img_mlp(torch.addcmul(img_mod2.shift, 1 + img_mod2.scale, self.img_norm2(img))))
|
||||
|
||||
# calculate the txt bloks
|
||||
txt.addcmul_(txt_mod1.gate, self.txt_attn.proj(txt_attn))
|
||||
txt.addcmul_(txt_mod2.gate, self.txt_mlp(torch.addcmul(txt_mod2.shift, 1 + txt_mod2.scale, self.txt_norm2(txt))))
|
||||
|
||||
if txt.dtype == torch.float16:
|
||||
txt = torch.nan_to_num(txt, nan=0.0, posinf=65504, neginf=-65504)
|
||||
|
||||
return img, txt
|
||||
|
||||
|
||||
class SingleStreamBlock(nn.Module):
|
||||
"""
|
||||
A DiT block with parallel linear layers as described in
|
||||
https://arxiv.org/abs/2302.05442 and adapted modulation interface.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
hidden_size: int,
|
||||
num_heads: int,
|
||||
mlp_ratio: float = 4.0,
|
||||
qk_scale: float = None,
|
||||
dtype=None,
|
||||
device=None,
|
||||
operations=None
|
||||
):
|
||||
super().__init__()
|
||||
self.hidden_dim = hidden_size
|
||||
self.num_heads = num_heads
|
||||
head_dim = hidden_size // num_heads
|
||||
self.scale = qk_scale or head_dim**-0.5
|
||||
|
||||
self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
|
||||
# qkv and mlp_in
|
||||
self.linear1 = operations.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim, dtype=dtype, device=device)
|
||||
# proj and mlp_out
|
||||
self.linear2 = operations.Linear(hidden_size + self.mlp_hidden_dim, hidden_size, dtype=dtype, device=device)
|
||||
|
||||
self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations)
|
||||
|
||||
self.hidden_size = hidden_size
|
||||
self.pre_norm = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
|
||||
|
||||
self.mlp_act = nn.GELU(approximate="tanh")
|
||||
|
||||
def forward(self, x: Tensor, pe: Tensor, vec: Tensor, attn_mask=None, transformer_options={}) -> Tensor:
|
||||
mod = vec
|
||||
x_mod = torch.addcmul(mod.shift, 1 + mod.scale, self.pre_norm(x))
|
||||
qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
|
||||
|
||||
q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
|
||||
q, k = self.norm(q, k, v)
|
||||
|
||||
# compute attention
|
||||
attn = attention(q, k, v, pe=pe, mask=attn_mask, transformer_options=transformer_options)
|
||||
# compute activation in mlp stream, cat again and run second linear layer
|
||||
output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
|
||||
x.addcmul_(mod.gate, output)
|
||||
if x.dtype == torch.float16:
|
||||
x = torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504)
|
||||
return x
|
||||
|
||||
|
||||
class LastLayer(nn.Module):
|
||||
def __init__(self, hidden_size: int, patch_size: int, out_channels: int, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
|
||||
@ -8,12 +8,15 @@ from einops import rearrange, repeat
|
||||
from ..common_dit import pad_to_patch_size
|
||||
from ...patcher_extension import WrapperExecutor, get_all_wrappers, WrappersMP
|
||||
|
||||
from ..flux.layers import EmbedND, timestep_embedding
|
||||
from ..flux.layers import (
|
||||
EmbedND,
|
||||
timestep_embedding,
|
||||
DoubleStreamBlock,
|
||||
SingleStreamBlock,
|
||||
)
|
||||
|
||||
from .layers import (
|
||||
DoubleStreamBlock,
|
||||
LastLayer,
|
||||
SingleStreamBlock,
|
||||
Approximator,
|
||||
ChromaModulationOut,
|
||||
)
|
||||
@ -37,6 +40,8 @@ class ChromaParams:
|
||||
out_dim: int
|
||||
hidden_dim: int
|
||||
n_layers: int
|
||||
txt_ids_dims: list
|
||||
vec_in_dim: int
|
||||
|
||||
|
||||
class Chroma(nn.Module):
|
||||
@ -84,6 +89,7 @@ class Chroma(nn.Module):
|
||||
self.num_heads,
|
||||
mlp_ratio=params.mlp_ratio,
|
||||
qkv_bias=params.qkv_bias,
|
||||
modulation=False,
|
||||
dtype=dtype, device=device, operations=operations
|
||||
)
|
||||
for _ in range(params.depth)
|
||||
@ -92,7 +98,7 @@ class Chroma(nn.Module):
|
||||
|
||||
self.single_blocks = nn.ModuleList(
|
||||
[
|
||||
SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, dtype=dtype, device=device, operations=operations)
|
||||
SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, modulation=False, dtype=dtype, device=device, operations=operations)
|
||||
for _ in range(params.depth_single_blocks)
|
||||
]
|
||||
)
|
||||
@ -173,7 +179,10 @@ class Chroma(nn.Module):
|
||||
pe = self.pe_embedder(ids)
|
||||
|
||||
blocks_replace = patches_replace.get("dit", {})
|
||||
transformer_options["total_blocks"] = len(self.double_blocks)
|
||||
transformer_options["block_type"] = "double"
|
||||
for i, block in enumerate(self.double_blocks):
|
||||
transformer_options["block_index"] = i
|
||||
if i not in self.skip_mmdit:
|
||||
double_mod = (
|
||||
self.get_modulations(mod_vectors, "double_img", idx=i),
|
||||
@ -216,7 +225,10 @@ class Chroma(nn.Module):
|
||||
|
||||
img = torch.cat((txt, img), 1)
|
||||
|
||||
transformer_options["total_blocks"] = len(self.single_blocks)
|
||||
transformer_options["block_type"] = "single"
|
||||
for i, block in enumerate(self.single_blocks):
|
||||
transformer_options["block_index"] = i
|
||||
if i not in self.skip_dit:
|
||||
single_mod = self.get_modulations(mod_vectors, "single", idx=i)
|
||||
if ("single_block", i) in blocks_replace:
|
||||
|
||||
@ -10,10 +10,10 @@ from torch import Tensor, nn
|
||||
from einops import repeat
|
||||
from ..common_dit import pad_to_patch_size
|
||||
|
||||
from ..flux.layers import EmbedND
|
||||
from ..flux.layers import EmbedND, DoubleStreamBlock, SingleStreamBlock
|
||||
|
||||
from ..chroma.model import Chroma, ChromaParams
|
||||
from ..chroma.layers import DoubleStreamBlock, SingleStreamBlock, Approximator
|
||||
from ..chroma.layers import Approximator
|
||||
from .layers import (
|
||||
NerfEmbedder,
|
||||
NerfGLUBlock,
|
||||
@ -35,7 +35,7 @@ class ChromaRadianceParams(ChromaParams):
|
||||
nerf_final_head_type: str
|
||||
# None means use the same dtype as the model.
|
||||
nerf_embedder_dtype: Optional[torch.dtype]
|
||||
|
||||
use_x0: bool
|
||||
|
||||
class ChromaRadiance(Chroma):
|
||||
"""
|
||||
@ -94,6 +94,7 @@ class ChromaRadiance(Chroma):
|
||||
self.num_heads,
|
||||
mlp_ratio=params.mlp_ratio,
|
||||
qkv_bias=params.qkv_bias,
|
||||
modulation=False,
|
||||
dtype=dtype, device=device, operations=operations
|
||||
)
|
||||
for _ in range(params.depth)
|
||||
@ -106,6 +107,7 @@ class ChromaRadiance(Chroma):
|
||||
self.hidden_size,
|
||||
self.num_heads,
|
||||
mlp_ratio=params.mlp_ratio,
|
||||
modulation=False,
|
||||
dtype=dtype, device=device, operations=operations,
|
||||
)
|
||||
for _ in range(params.depth_single_blocks)
|
||||
@ -157,6 +159,9 @@ class ChromaRadiance(Chroma):
|
||||
self.skip_dit = []
|
||||
self.lite = False
|
||||
|
||||
if params.use_x0:
|
||||
self.register_buffer("__x0__", torch.tensor([]))
|
||||
|
||||
@property
|
||||
def _nerf_final_layer(self) -> nn.Module:
|
||||
if self.params.nerf_final_head_type == "linear":
|
||||
@ -274,6 +279,12 @@ class ChromaRadiance(Chroma):
|
||||
params_dict |= overrides
|
||||
return params.__class__(**params_dict)
|
||||
|
||||
def _apply_x0_residual(self, predicted, noisy, timesteps):
|
||||
|
||||
# non zero during training to prevent 0 div
|
||||
eps = 0.0
|
||||
return (noisy - predicted) / (timesteps.view(-1,1,1,1) + eps)
|
||||
|
||||
def _forward(
|
||||
self,
|
||||
x: Tensor,
|
||||
@ -314,4 +325,11 @@ class ChromaRadiance(Chroma):
|
||||
transformer_options,
|
||||
attn_mask=kwargs.get("attention_mask", None),
|
||||
)
|
||||
return self.forward_nerf(img, img_out, params)[:, :, :h, :w]
|
||||
|
||||
out = self.forward_nerf(img, img_out, params)[:, :, :h, :w]
|
||||
|
||||
# If x0 variant → v-pred, just return this instead
|
||||
if hasattr(self, "__x0__"):
|
||||
out = self._apply_x0_residual(out, img, timestep)
|
||||
return out
|
||||
|
||||
|
||||
@ -47,15 +47,44 @@ def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 10
|
||||
return embedding
|
||||
|
||||
class MLPEmbedder(nn.Module):
|
||||
def __init__(self, in_dim: int, hidden_dim: int, dtype=None, device=None, operations=None):
|
||||
def __init__(self, in_dim: int, hidden_dim: int, bias=True, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
self.in_layer = operations.Linear(in_dim, hidden_dim, bias=True, dtype=dtype, device=device)
|
||||
self.in_layer = operations.Linear(in_dim, hidden_dim, bias=bias, dtype=dtype, device=device)
|
||||
self.silu = nn.SiLU()
|
||||
self.out_layer = operations.Linear(hidden_dim, hidden_dim, bias=True, dtype=dtype, device=device)
|
||||
self.out_layer = operations.Linear(hidden_dim, hidden_dim, bias=bias, dtype=dtype, device=device)
|
||||
|
||||
def forward(self, x: Tensor) -> Tensor:
|
||||
return self.out_layer(self.silu(self.in_layer(x)))
|
||||
|
||||
class YakMLP(nn.Module):
|
||||
def __init__(self, hidden_size: int, intermediate_size: int, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.gate_proj = operations.Linear(self.hidden_size, self.intermediate_size, bias=True, dtype=dtype, device=device)
|
||||
self.up_proj = operations.Linear(self.hidden_size, self.intermediate_size, bias=True, dtype=dtype, device=device)
|
||||
self.down_proj = operations.Linear(self.intermediate_size, self.hidden_size, bias=True, dtype=dtype, device=device)
|
||||
self.act_fn = nn.SiLU()
|
||||
|
||||
def forward(self, x: Tensor) -> Tensor:
|
||||
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
|
||||
return down_proj
|
||||
|
||||
def build_mlp(hidden_size, mlp_hidden_dim, mlp_silu_act=False, yak_mlp=False, dtype=None, device=None, operations=None):
|
||||
if yak_mlp:
|
||||
return YakMLP(hidden_size, mlp_hidden_dim, dtype=dtype, device=device, operations=operations)
|
||||
if mlp_silu_act:
|
||||
return nn.Sequential(
|
||||
operations.Linear(hidden_size, mlp_hidden_dim * 2, bias=False, dtype=dtype, device=device),
|
||||
SiLUActivation(),
|
||||
operations.Linear(mlp_hidden_dim, hidden_size, bias=False, dtype=dtype, device=device),
|
||||
)
|
||||
else:
|
||||
return nn.Sequential(
|
||||
operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
|
||||
nn.GELU(approximate="tanh"),
|
||||
operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
|
||||
)
|
||||
|
||||
class RMSNorm(torch.nn.Module):
|
||||
def __init__(self, dim: int, dtype=None, device=None, operations=None):
|
||||
@ -79,14 +108,14 @@ class QKNorm(torch.nn.Module):
|
||||
|
||||
|
||||
class SelfAttention(nn.Module):
|
||||
def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False, dtype=None, device=None, operations=None):
|
||||
def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False, proj_bias: bool = True, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
self.num_heads = num_heads
|
||||
head_dim = dim // num_heads
|
||||
|
||||
self.qkv = operations.Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype, device=device)
|
||||
self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations)
|
||||
self.proj = operations.Linear(dim, dim, dtype=dtype, device=device)
|
||||
self.proj = operations.Linear(dim, dim, bias=proj_bias, dtype=dtype, device=device)
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -97,11 +126,11 @@ class ModulationOut:
|
||||
|
||||
|
||||
class Modulation(nn.Module):
|
||||
def __init__(self, dim: int, double: bool, dtype=None, device=None, operations=None):
|
||||
def __init__(self, dim: int, double: bool, bias=True, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
self.is_double = double
|
||||
self.multiplier = 6 if double else 3
|
||||
self.lin = operations.Linear(dim, self.multiplier * dim, bias=True, dtype=dtype, device=device)
|
||||
self.lin = operations.Linear(dim, self.multiplier * dim, bias=bias, dtype=dtype, device=device)
|
||||
|
||||
def forward(self, vec: Tensor) -> tuple:
|
||||
if vec.ndim == 2:
|
||||
@ -128,80 +157,110 @@ def apply_mod(tensor, m_mult, m_add=None, modulation_dims=None):
|
||||
return tensor
|
||||
|
||||
|
||||
class SiLUActivation(nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.gate_fn = nn.SiLU()
|
||||
|
||||
def forward(self, x: Tensor) -> Tensor:
|
||||
x1, x2 = x.chunk(2, dim=-1)
|
||||
return self.gate_fn(x1) * x2
|
||||
|
||||
|
||||
class DoubleStreamBlock(nn.Module):
|
||||
def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, dtype=None, device=None, operations=None):
|
||||
def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, modulation=True, mlp_silu_act=False, proj_bias=True, yak_mlp=False, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
|
||||
mlp_hidden_dim = int(hidden_size * mlp_ratio)
|
||||
self.num_heads = num_heads
|
||||
self.hidden_size = hidden_size
|
||||
self.img_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations)
|
||||
self.modulation = modulation
|
||||
|
||||
if self.modulation:
|
||||
self.img_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations)
|
||||
|
||||
self.img_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
|
||||
self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
|
||||
self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, proj_bias=proj_bias, dtype=dtype, device=device, operations=operations)
|
||||
|
||||
self.img_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
|
||||
self.img_mlp = nn.Sequential(
|
||||
operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
|
||||
nn.GELU(approximate="tanh"),
|
||||
operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
|
||||
)
|
||||
|
||||
self.txt_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations)
|
||||
self.img_mlp = build_mlp(hidden_size, mlp_hidden_dim, mlp_silu_act=mlp_silu_act, yak_mlp=yak_mlp, dtype=dtype, device=device, operations=operations)
|
||||
|
||||
if self.modulation:
|
||||
self.txt_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations)
|
||||
|
||||
self.txt_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
|
||||
self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
|
||||
self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, proj_bias=proj_bias, dtype=dtype, device=device, operations=operations)
|
||||
|
||||
self.txt_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
|
||||
self.txt_mlp = nn.Sequential(
|
||||
operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
|
||||
nn.GELU(approximate="tanh"),
|
||||
operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
|
||||
)
|
||||
|
||||
self.txt_mlp = build_mlp(hidden_size, mlp_hidden_dim, mlp_silu_act=mlp_silu_act, yak_mlp=yak_mlp, dtype=dtype, device=device, operations=operations)
|
||||
|
||||
self.flipped_img_txt = flipped_img_txt
|
||||
|
||||
def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims_img=None, modulation_dims_txt=None, transformer_options={}):
|
||||
img_mod1, img_mod2 = self.img_mod(vec)
|
||||
txt_mod1, txt_mod2 = self.txt_mod(vec)
|
||||
if self.modulation:
|
||||
img_mod1, img_mod2 = self.img_mod(vec)
|
||||
txt_mod1, txt_mod2 = self.txt_mod(vec)
|
||||
else:
|
||||
(img_mod1, img_mod2), (txt_mod1, txt_mod2) = vec
|
||||
|
||||
# prepare image for attention
|
||||
img_modulated = self.img_norm1(img)
|
||||
img_modulated = apply_mod(img_modulated, (1 + img_mod1.scale), img_mod1.shift, modulation_dims_img)
|
||||
img_qkv = self.img_attn.qkv(img_modulated)
|
||||
img_qkv = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
|
||||
img_q, img_k, img_v = torch.unbind(img_qkv, dim=0)
|
||||
del img_modulated
|
||||
img_q, img_k, img_v = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
|
||||
del img_qkv
|
||||
img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
|
||||
|
||||
# prepare txt for attention
|
||||
txt_modulated = self.txt_norm1(txt)
|
||||
txt_modulated = apply_mod(txt_modulated, (1 + txt_mod1.scale), txt_mod1.shift, modulation_dims_txt)
|
||||
txt_qkv = self.txt_attn.qkv(txt_modulated)
|
||||
txt_qkv = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
|
||||
txt_q, txt_k, txt_v = torch.unbind(txt_qkv, dim=0)
|
||||
del txt_modulated
|
||||
txt_q, txt_k, txt_v = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
|
||||
del txt_qkv
|
||||
txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
|
||||
|
||||
if self.flipped_img_txt:
|
||||
q = torch.cat((img_q, txt_q), dim=2)
|
||||
del img_q, txt_q
|
||||
k = torch.cat((img_k, txt_k), dim=2)
|
||||
del img_k, txt_k
|
||||
v = torch.cat((img_v, txt_v), dim=2)
|
||||
del img_v, txt_v
|
||||
# run actual attention
|
||||
attn = attention(torch.cat((img_q, txt_q), dim=2),
|
||||
torch.cat((img_k, txt_k), dim=2),
|
||||
torch.cat((img_v, txt_v), dim=2),
|
||||
attn = attention(q, k, v,
|
||||
pe=pe, mask=attn_mask, transformer_options=transformer_options)
|
||||
del q, k, v
|
||||
|
||||
img_attn, txt_attn = attn[:, : img.shape[1]], attn[:, img.shape[1]:]
|
||||
else:
|
||||
q = torch.cat((txt_q, img_q), dim=2)
|
||||
del txt_q, img_q
|
||||
k = torch.cat((txt_k, img_k), dim=2)
|
||||
del txt_k, img_k
|
||||
v = torch.cat((txt_v, img_v), dim=2)
|
||||
del txt_v, img_v
|
||||
# run actual attention
|
||||
attn = attention(torch.cat((txt_q, img_q), dim=2),
|
||||
torch.cat((txt_k, img_k), dim=2),
|
||||
torch.cat((txt_v, img_v), dim=2),
|
||||
attn = attention(q, k, v,
|
||||
pe=pe, mask=attn_mask, transformer_options=transformer_options)
|
||||
del q, k, v
|
||||
|
||||
txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:]
|
||||
|
||||
# calculate the img bloks
|
||||
img = img + apply_mod(self.img_attn.proj(img_attn), img_mod1.gate, None, modulation_dims_img)
|
||||
img = img + apply_mod(self.img_mlp(apply_mod(self.img_norm2(img), (1 + img_mod2.scale), img_mod2.shift, modulation_dims_img)), img_mod2.gate, None, modulation_dims_img)
|
||||
# calculate the img blocks
|
||||
# todo: do we have to re-investigate this += versus img = img + ... op?
|
||||
img += apply_mod(self.img_attn.proj(img_attn), img_mod1.gate, None, modulation_dims_img)
|
||||
del img_attn
|
||||
img += apply_mod(self.img_mlp(apply_mod(self.img_norm2(img), (1 + img_mod2.scale), img_mod2.shift, modulation_dims_img)), img_mod2.gate, None, modulation_dims_img)
|
||||
|
||||
# calculate the txt bloks
|
||||
txt = txt + apply_mod(self.txt_attn.proj(txt_attn), txt_mod1.gate, None, modulation_dims_txt)
|
||||
txt = txt + apply_mod(self.txt_mlp(apply_mod(self.txt_norm2(txt), (1 + txt_mod2.scale), txt_mod2.shift, modulation_dims_txt)), txt_mod2.gate, None, modulation_dims_txt)
|
||||
# calculate the txt blocks
|
||||
# todo: do we have to re-investigate this += versus txt = txt + ... op?
|
||||
txt += apply_mod(self.txt_attn.proj(txt_attn), txt_mod1.gate, None, modulation_dims_txt)
|
||||
del txt_attn
|
||||
txt += apply_mod(self.txt_mlp(apply_mod(self.txt_norm2(txt), (1 + txt_mod2.scale), txt_mod2.shift, modulation_dims_txt)), txt_mod2.gate, None, modulation_dims_txt)
|
||||
|
||||
if txt.dtype == torch.float16:
|
||||
txt = torch.nan_to_num(txt, nan=0.0, posinf=65504, neginf=-65504)
|
||||
@ -221,6 +280,10 @@ class SingleStreamBlock(nn.Module):
|
||||
num_heads: int,
|
||||
mlp_ratio: float = 4.0,
|
||||
qk_scale: float = None,
|
||||
modulation=True,
|
||||
mlp_silu_act=False,
|
||||
bias=True,
|
||||
yak_mlp=False,
|
||||
dtype=None,
|
||||
device=None,
|
||||
operations=None
|
||||
@ -232,31 +295,57 @@ class SingleStreamBlock(nn.Module):
|
||||
self.scale = qk_scale or head_dim**-0.5
|
||||
|
||||
self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
|
||||
|
||||
self.mlp_hidden_dim_first = self.mlp_hidden_dim
|
||||
self.yak_mlp = yak_mlp
|
||||
if mlp_silu_act:
|
||||
self.mlp_hidden_dim_first = int(hidden_size * mlp_ratio * 2)
|
||||
self.mlp_act = SiLUActivation()
|
||||
else:
|
||||
self.mlp_act = nn.GELU(approximate="tanh")
|
||||
|
||||
if self.yak_mlp:
|
||||
self.mlp_hidden_dim_first *= 2
|
||||
self.mlp_act = nn.SiLU()
|
||||
|
||||
# qkv and mlp_in
|
||||
self.linear1 = operations.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim, dtype=dtype, device=device)
|
||||
self.linear1 = operations.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim_first, bias=bias, dtype=dtype, device=device)
|
||||
# proj and mlp_out
|
||||
self.linear2 = operations.Linear(hidden_size + self.mlp_hidden_dim, hidden_size, dtype=dtype, device=device)
|
||||
self.linear2 = operations.Linear(hidden_size + self.mlp_hidden_dim, hidden_size, bias=bias, dtype=dtype, device=device)
|
||||
|
||||
self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations)
|
||||
|
||||
self.hidden_size = hidden_size
|
||||
self.pre_norm = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
|
||||
|
||||
self.mlp_act = nn.GELU(approximate="tanh")
|
||||
self.modulation = Modulation(hidden_size, double=False, dtype=dtype, device=device, operations=operations)
|
||||
if modulation:
|
||||
self.modulation = Modulation(hidden_size, double=False, dtype=dtype, device=device, operations=operations)
|
||||
else:
|
||||
self.modulation = None
|
||||
|
||||
def forward(self, x: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims=None, transformer_options={}) -> Tensor:
|
||||
mod, _ = self.modulation(vec)
|
||||
qkv, mlp = torch.split(self.linear1(apply_mod(self.pre_norm(x), (1 + mod.scale), mod.shift, modulation_dims)), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
|
||||
if self.modulation:
|
||||
mod, _ = self.modulation(vec)
|
||||
else:
|
||||
mod = vec
|
||||
|
||||
qkv, mlp = torch.split(self.linear1(apply_mod(self.pre_norm(x), (1 + mod.scale), mod.shift, modulation_dims)), [3 * self.hidden_size, self.mlp_hidden_dim_first], dim=-1)
|
||||
|
||||
qkv = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
|
||||
q, k, v = torch.unbind(qkv, dim=0)
|
||||
del qkv
|
||||
|
||||
q, k = self.norm(q, k, v)
|
||||
|
||||
# compute attention
|
||||
attn = attention(q, k, v, pe=pe, mask=attn_mask, transformer_options=transformer_options)
|
||||
del q, k, v
|
||||
# compute activation in mlp stream, cat again and run second linear layer
|
||||
output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
|
||||
if self.yak_mlp:
|
||||
mlp = self.mlp_act(mlp[..., self.mlp_hidden_dim_first // 2:]) * mlp[..., :self.mlp_hidden_dim_first // 2]
|
||||
else:
|
||||
mlp = self.mlp_act(mlp)
|
||||
output = self.linear2(torch.cat((attn, mlp), 2))
|
||||
x = x + apply_mod(output, mod.gate, None, modulation_dims)
|
||||
if x.dtype == torch.float16:
|
||||
x = torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504)
|
||||
@ -264,11 +353,11 @@ class SingleStreamBlock(nn.Module):
|
||||
|
||||
|
||||
class LastLayer(nn.Module):
|
||||
def __init__(self, hidden_size: int, patch_size: int, out_channels: int, dtype=None, device=None, operations=None):
|
||||
def __init__(self, hidden_size: int, patch_size: int, out_channels: int, bias=True, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
self.norm_final = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
|
||||
self.linear = operations.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True, dtype=dtype, device=device)
|
||||
self.adaLN_modulation = nn.Sequential(nn.SiLU(), operations.Linear(hidden_size, 2 * hidden_size, bias=True, dtype=dtype, device=device))
|
||||
self.linear = operations.Linear(hidden_size, patch_size * patch_size * out_channels, bias=bias, dtype=dtype, device=device)
|
||||
self.adaLN_modulation = nn.Sequential(nn.SiLU(), operations.Linear(hidden_size, 2 * hidden_size, bias=bias, dtype=dtype, device=device))
|
||||
|
||||
def forward(self, x: Tensor, vec: Tensor, modulation_dims=None) -> Tensor:
|
||||
if vec.ndim == 2:
|
||||
|
||||
@ -7,15 +7,8 @@ from ... import model_management
|
||||
|
||||
|
||||
def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None, transformer_options={}) -> Tensor:
|
||||
q_shape = q.shape
|
||||
k_shape = k.shape
|
||||
|
||||
if pe is not None:
|
||||
q = q.to(dtype=pe.dtype).reshape(*q.shape[:-1], -1, 1, 2)
|
||||
k = k.to(dtype=pe.dtype).reshape(*k.shape[:-1], -1, 1, 2)
|
||||
q = (pe[..., 0] * q[..., 0] + pe[..., 1] * q[..., 1]).reshape(*q_shape).type_as(v)
|
||||
k = (pe[..., 0] * k[..., 0] + pe[..., 1] * k[..., 1]).reshape(*k_shape).type_as(v)
|
||||
|
||||
q, k = apply_rope(q, k, pe)
|
||||
heads = q.shape[1]
|
||||
x = optimized_attention(q, k, v, heads, skip_reshape=True, mask=mask, transformer_options=transformer_options)
|
||||
return x
|
||||
|
||||
@ -1,9 +1,11 @@
|
||||
# Original code can be found on: https://github.com/black-forest-labs/flux
|
||||
|
||||
import torch
|
||||
from dataclasses import dataclass
|
||||
from einops import rearrange, repeat
|
||||
|
||||
import torch
|
||||
from torch import Tensor, nn
|
||||
from einops import rearrange, repeat
|
||||
from ..common_dit import pad_to_patch_size
|
||||
from ...patcher_extension import WrapperExecutor, get_all_wrappers, WrappersMP
|
||||
|
||||
from .layers import (
|
||||
@ -13,8 +15,9 @@ from .layers import (
|
||||
MLPEmbedder,
|
||||
SingleStreamBlock,
|
||||
timestep_embedding,
|
||||
Modulation,
|
||||
RMSNorm
|
||||
)
|
||||
from .. import common_dit
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -33,6 +36,14 @@ class FluxParams:
|
||||
patch_size: int
|
||||
qkv_bias: bool
|
||||
guidance_embed: bool
|
||||
txt_ids_dims: list
|
||||
global_modulation: bool = False
|
||||
mlp_silu_act: bool = False
|
||||
ops_bias: bool = True
|
||||
default_ref_method: str = "offset"
|
||||
ref_index_scale: float = 1.0
|
||||
yak_mlp: bool = False
|
||||
txt_norm: bool = False
|
||||
|
||||
|
||||
class Flux(nn.Module):
|
||||
@ -42,8 +53,6 @@ class Flux(nn.Module):
|
||||
|
||||
def __init__(self, image_model=None, final_layer=True, dtype=None, device=None, operations=None, **kwargs):
|
||||
super().__init__()
|
||||
# todo: should this be here?
|
||||
self.device = device
|
||||
self.dtype = dtype
|
||||
params = FluxParams(**kwargs)
|
||||
self.params = params
|
||||
@ -60,13 +69,22 @@ class Flux(nn.Module):
|
||||
self.hidden_size = params.hidden_size
|
||||
self.num_heads = params.num_heads
|
||||
self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)
|
||||
self.img_in = operations.Linear(self.in_channels, self.hidden_size, bias=True, dtype=dtype, device=device)
|
||||
self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, dtype=dtype, device=device, operations=operations)
|
||||
self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size, dtype=dtype, device=device, operations=operations)
|
||||
self.img_in = operations.Linear(self.in_channels, self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device)
|
||||
self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device, operations=operations)
|
||||
if params.vec_in_dim is not None:
|
||||
self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size, dtype=dtype, device=device, operations=operations)
|
||||
else:
|
||||
self.vector_in = None
|
||||
|
||||
self.guidance_in = (
|
||||
MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, dtype=dtype, device=device, operations=operations) if params.guidance_embed else nn.Identity()
|
||||
MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device, operations=operations) if params.guidance_embed else nn.Identity()
|
||||
)
|
||||
self.txt_in = operations.Linear(params.context_in_dim, self.hidden_size, dtype=dtype, device=device)
|
||||
self.txt_in = operations.Linear(params.context_in_dim, self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device)
|
||||
|
||||
if params.txt_norm:
|
||||
self.txt_norm = RMSNorm(params.context_in_dim, dtype=dtype, device=device, operations=operations)
|
||||
else:
|
||||
self.txt_norm = None
|
||||
|
||||
self.double_blocks = nn.ModuleList(
|
||||
[
|
||||
@ -75,6 +93,10 @@ class Flux(nn.Module):
|
||||
self.num_heads,
|
||||
mlp_ratio=params.mlp_ratio,
|
||||
qkv_bias=params.qkv_bias,
|
||||
modulation=params.global_modulation is False,
|
||||
mlp_silu_act=params.mlp_silu_act,
|
||||
proj_bias=params.ops_bias,
|
||||
yak_mlp=params.yak_mlp,
|
||||
dtype=dtype, device=device, operations=operations
|
||||
)
|
||||
for _ in range(params.depth)
|
||||
@ -83,13 +105,30 @@ class Flux(nn.Module):
|
||||
|
||||
self.single_blocks = nn.ModuleList(
|
||||
[
|
||||
SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, dtype=dtype, device=device, operations=operations)
|
||||
SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, modulation=params.global_modulation is False, mlp_silu_act=params.mlp_silu_act, bias=params.ops_bias, yak_mlp=params.yak_mlp, dtype=dtype, device=device, operations=operations)
|
||||
for _ in range(params.depth_single_blocks)
|
||||
]
|
||||
)
|
||||
|
||||
if final_layer:
|
||||
self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels, dtype=dtype, device=device, operations=operations)
|
||||
self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels, bias=params.ops_bias, dtype=dtype, device=device, operations=operations)
|
||||
|
||||
if params.global_modulation:
|
||||
self.double_stream_modulation_img = Modulation(
|
||||
self.hidden_size,
|
||||
double=True,
|
||||
bias=False,
|
||||
dtype=dtype, device=device, operations=operations
|
||||
)
|
||||
self.double_stream_modulation_txt = Modulation(
|
||||
self.hidden_size,
|
||||
double=True,
|
||||
bias=False,
|
||||
dtype=dtype, device=device, operations=operations
|
||||
)
|
||||
self.single_stream_modulation = Modulation(
|
||||
self.hidden_size, double=False, bias=False, dtype=dtype, device=device, operations=operations
|
||||
)
|
||||
|
||||
def forward_orig(
|
||||
self,
|
||||
@ -107,9 +146,6 @@ class Flux(nn.Module):
|
||||
|
||||
if transformer_options is None:
|
||||
transformer_options = {}
|
||||
if y is None:
|
||||
y = torch.zeros((img.shape[0], self.params.vec_in_dim), device=img.device, dtype=img.dtype)
|
||||
|
||||
patches = transformer_options.get("patches", {})
|
||||
patches_replace = transformer_options.get("patches_replace", {})
|
||||
if img.ndim != 3 or txt.ndim != 3:
|
||||
@ -122,9 +158,19 @@ class Flux(nn.Module):
|
||||
if guidance is not None:
|
||||
vec = vec + self.guidance_in(timestep_embedding(guidance, 256).to(img.dtype))
|
||||
|
||||
vec = vec + self.vector_in(y[:, :self.params.vec_in_dim])
|
||||
if self.vector_in is not None:
|
||||
if y is None:
|
||||
y = torch.zeros((img.shape[0], self.params.vec_in_dim), device=img.device, dtype=img.dtype)
|
||||
vec = vec + self.vector_in(y[:, :self.params.vec_in_dim])
|
||||
|
||||
if self.txt_norm is not None:
|
||||
txt = self.txt_norm(txt)
|
||||
txt = self.txt_in(txt)
|
||||
|
||||
vec_orig = vec
|
||||
if self.params.global_modulation:
|
||||
vec = (self.double_stream_modulation_img(vec_orig), self.double_stream_modulation_txt(vec_orig))
|
||||
|
||||
if "post_input" in patches:
|
||||
for p in patches["post_input"]:
|
||||
out = p({"img": img, "txt": txt, "img_ids": img_ids, "txt_ids": txt_ids})
|
||||
@ -140,7 +186,10 @@ class Flux(nn.Module):
|
||||
pe = None
|
||||
|
||||
blocks_replace = patches_replace.get("dit", {})
|
||||
transformer_options["total_blocks"] = len(self.double_blocks)
|
||||
transformer_options["block_type"] = "double"
|
||||
for i, block in enumerate(self.double_blocks):
|
||||
transformer_options["block_index"] = i
|
||||
if ("double_block", i) in blocks_replace:
|
||||
def block_wrap_1(args):
|
||||
out = {}
|
||||
@ -181,7 +230,13 @@ class Flux(nn.Module):
|
||||
|
||||
img = torch.cat((txt, img), 1)
|
||||
|
||||
if self.params.global_modulation:
|
||||
vec, _ = self.single_stream_modulation(vec_orig)
|
||||
|
||||
transformer_options["total_blocks"] = len(self.single_blocks)
|
||||
transformer_options["block_type"] = "single"
|
||||
for i, block in enumerate(self.single_blocks):
|
||||
transformer_options["block_index"] = i
|
||||
if ("single_block", i) in blocks_replace:
|
||||
def block_wrap_2(args):
|
||||
out = {}
|
||||
@ -211,13 +266,15 @@ class Flux(nn.Module):
|
||||
|
||||
img = img[:, txt.shape[1]:, ...]
|
||||
|
||||
img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels)
|
||||
img = self.final_layer(img, vec_orig) # (N, T, patch_size ** 2 * out_channels)
|
||||
return img
|
||||
|
||||
def process_img(self, x, index=0, h_offset=0, w_offset=0):
|
||||
def process_img(self, x, index=0, h_offset=0, w_offset=0, transformer_options=None):
|
||||
if transformer_options is None:
|
||||
transformer_options = {}
|
||||
bs, c, h, w = x.shape
|
||||
patch_size = self.patch_size
|
||||
x = common_dit.pad_to_patch_size(x, (patch_size, patch_size))
|
||||
x = pad_to_patch_size(x, (patch_size, patch_size))
|
||||
|
||||
img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
|
||||
h_len = ((h + (patch_size // 2)) // patch_size)
|
||||
@ -226,10 +283,22 @@ class Flux(nn.Module):
|
||||
h_offset = ((h_offset + (patch_size // 2)) // patch_size)
|
||||
w_offset = ((w_offset + (patch_size // 2)) // patch_size)
|
||||
|
||||
img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype)
|
||||
steps_h = h_len
|
||||
steps_w = w_len
|
||||
|
||||
rope_options = transformer_options.get("rope_options", None)
|
||||
if rope_options is not None:
|
||||
h_len = (h_len - 1.0) * rope_options.get("scale_y", 1.0) + 1.0
|
||||
w_len = (w_len - 1.0) * rope_options.get("scale_x", 1.0) + 1.0
|
||||
|
||||
index += rope_options.get("shift_t", 0.0)
|
||||
h_offset += rope_options.get("shift_y", 0.0)
|
||||
w_offset += rope_options.get("shift_x", 0.0)
|
||||
|
||||
img_ids = torch.zeros((steps_h, steps_w, len(self.params.axes_dim)), device=x.device, dtype=torch.float32)
|
||||
img_ids[:, :, 0] = img_ids[:, :, 1] + index
|
||||
img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1)
|
||||
img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0)
|
||||
img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=steps_h, device=x.device, dtype=torch.float32).unsqueeze(1)
|
||||
img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=steps_w, device=x.device, dtype=torch.float32).unsqueeze(0)
|
||||
return img, repeat(img_ids, "h w c -> b (h w) c", b=bs)
|
||||
|
||||
def forward(self, x, timestep, context, y=None, guidance=None, ref_latents=None, control=None, transformer_options=None, **kwargs):
|
||||
@ -249,16 +318,16 @@ class Flux(nn.Module):
|
||||
|
||||
h_len = ((h_orig + (patch_size // 2)) // patch_size)
|
||||
w_len = ((w_orig + (patch_size // 2)) // patch_size)
|
||||
img, img_ids = self.process_img(x)
|
||||
img, img_ids = self.process_img(x, transformer_options=transformer_options)
|
||||
img_tokens = img.shape[1]
|
||||
if ref_latents is not None:
|
||||
h = 0
|
||||
w = 0
|
||||
index = 0
|
||||
ref_latents_method = kwargs.get("ref_latents_method", "offset")
|
||||
ref_latents_method = kwargs.get("ref_latents_method", self.params.default_ref_method)
|
||||
for ref in ref_latents:
|
||||
if ref_latents_method == "index":
|
||||
index += 1
|
||||
index += self.params.ref_index_scale
|
||||
h_offset = 0
|
||||
w_offset = 0
|
||||
elif ref_latents_method == "uxo":
|
||||
@ -282,7 +351,12 @@ class Flux(nn.Module):
|
||||
img = torch.cat([img, kontext], dim=1)
|
||||
img_ids = torch.cat([img_ids, kontext_ids], dim=1)
|
||||
|
||||
txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
|
||||
txt_ids = torch.zeros((bs, context.shape[1], len(self.params.axes_dim)), device=x.device, dtype=torch.float32)
|
||||
|
||||
if len(self.params.txt_ids_dims) > 0:
|
||||
for i in self.params.txt_ids_dims:
|
||||
txt_ids[:, :, i] = torch.linspace(0, context.shape[1] - 1, steps=context.shape[1], device=x.device, dtype=torch.float32)
|
||||
|
||||
out = self.forward_orig(img, img_ids, context, txt_ids, timestep, y, guidance, control, transformer_options, attn_mask=kwargs.get("attention_mask", None))
|
||||
out = out[:, :img_tokens]
|
||||
return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)[:, :, :h_orig, :w_orig]
|
||||
return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=self.patch_size, pw=self.patch_size)[:, :, :h_orig, :w_orig]
|
||||
|
||||
@ -33,6 +33,9 @@ class HunyuanVideoParams:
|
||||
guidance_embed: bool
|
||||
byt5: bool
|
||||
meanflow: bool
|
||||
use_cond_type_embedding: bool
|
||||
vision_in_dim: int
|
||||
meanflow_sum: bool
|
||||
|
||||
|
||||
class SelfAttentionRef(nn.Module):
|
||||
@ -153,7 +156,10 @@ class TokenRefiner(nn.Module):
|
||||
t = self.t_embedder(timestep_embedding(timesteps, 256, time_factor=1.0).to(x.dtype))
|
||||
# m = mask.float().unsqueeze(-1)
|
||||
# c = (x.float() * m).sum(dim=1) / m.sum(dim=1) #TODO: the following works when the x.shape is the same length as the tokens but might break otherwise
|
||||
c = x.sum(dim=1) / x.shape[1]
|
||||
if x.dtype == torch.float16:
|
||||
c = x.float().sum(dim=1) / x.shape[1]
|
||||
else:
|
||||
c = x.sum(dim=1) / x.shape[1]
|
||||
|
||||
c = t + self.c_embedder(c.to(x.dtype))
|
||||
x = self.input_embedder(x)
|
||||
@ -193,11 +199,15 @@ class HunyuanVideo(nn.Module):
|
||||
def __init__(self, image_model=None, final_layer=True, dtype=None, device=None, operations=None, **kwargs):
|
||||
super().__init__()
|
||||
self.dtype = dtype
|
||||
operation_settings = {"operations": operations, "device": device, "dtype": dtype}
|
||||
|
||||
params = HunyuanVideoParams(**kwargs)
|
||||
self.params = params
|
||||
self.patch_size = params.patch_size
|
||||
self.in_channels = params.in_channels
|
||||
self.out_channels = params.out_channels
|
||||
self.use_cond_type_embedding = params.use_cond_type_embedding
|
||||
self.vision_in_dim = params.vision_in_dim
|
||||
if params.hidden_size % params.num_heads != 0:
|
||||
raise ValueError(
|
||||
f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
|
||||
@ -263,6 +273,18 @@ class HunyuanVideo(nn.Module):
|
||||
if final_layer:
|
||||
self.final_layer = LastLayer(self.hidden_size, self.patch_size[-1], self.out_channels, dtype=dtype, device=device, operations=operations)
|
||||
|
||||
# HunyuanVideo 1.5 specific modules
|
||||
if self.vision_in_dim is not None:
|
||||
from ..wan.model import MLPProj
|
||||
self.vision_in = MLPProj(in_dim=self.vision_in_dim, out_dim=self.hidden_size, operation_settings=operation_settings)
|
||||
else:
|
||||
self.vision_in = None
|
||||
if self.use_cond_type_embedding:
|
||||
# 0: text_encoder feature 1: byt5 feature 2: vision_encoder feature
|
||||
self.cond_type_embedding = nn.Embedding(3, self.hidden_size)
|
||||
else:
|
||||
self.cond_type_embedding = None
|
||||
|
||||
def forward_orig(
|
||||
self,
|
||||
img: Tensor,
|
||||
@ -273,7 +295,7 @@ class HunyuanVideo(nn.Module):
|
||||
timesteps: Tensor,
|
||||
y: Tensor = None,
|
||||
txt_byt5=None,
|
||||
guidance: Tensor = None,
|
||||
clip_fea=None,guidance: Tensor = None,
|
||||
guiding_frame_index=None,
|
||||
ref_latent=None,
|
||||
disable_time_r=False,
|
||||
@ -295,7 +317,7 @@ class HunyuanVideo(nn.Module):
|
||||
timesteps_r = transformer_options['sample_sigmas'][w[0] + 1]
|
||||
timesteps_r = timesteps_r.unsqueeze(0).to(device=timesteps.device, dtype=timesteps.dtype)
|
||||
vec_r = self.time_r_in(timestep_embedding(timesteps_r, 256, time_factor=1000.0).to(img.dtype))
|
||||
vec = (vec + vec_r) / 2
|
||||
vec = (vec + vec_r) if self.params.meanflow_sum else (vec + vec_r) / 2
|
||||
|
||||
if ref_latent is not None:
|
||||
ref_latent_ids = self.img_ids(ref_latent)
|
||||
@ -330,12 +352,31 @@ class HunyuanVideo(nn.Module):
|
||||
|
||||
txt = self.txt_in(txt, timesteps, txt_mask, transformer_options=transformer_options)
|
||||
|
||||
if self.cond_type_embedding is not None:
|
||||
self.cond_type_embedding.to(txt.device)
|
||||
cond_emb = self.cond_type_embedding(torch.zeros_like(txt[:, :, 0], device=txt.device, dtype=torch.long))
|
||||
txt = txt + cond_emb.to(txt.dtype)
|
||||
|
||||
if self.byt5_in is not None and txt_byt5 is not None:
|
||||
txt_byt5 = self.byt5_in(txt_byt5)
|
||||
if self.cond_type_embedding is not None:
|
||||
cond_emb = self.cond_type_embedding(torch.ones_like(txt_byt5[:, :, 0], device=txt_byt5.device, dtype=torch.long))
|
||||
txt_byt5 = txt_byt5 + cond_emb.to(txt_byt5.dtype)
|
||||
txt = torch.cat((txt_byt5, txt), dim=1) # byt5 first for HunyuanVideo1.5
|
||||
else:
|
||||
txt = torch.cat((txt, txt_byt5), dim=1)
|
||||
txt_byt5_ids = torch.zeros((txt_ids.shape[0], txt_byt5.shape[1], txt_ids.shape[-1]), device=txt_ids.device, dtype=txt_ids.dtype)
|
||||
txt = torch.cat((txt, txt_byt5), dim=1)
|
||||
txt_ids = torch.cat((txt_ids, txt_byt5_ids), dim=1)
|
||||
|
||||
if clip_fea is not None:
|
||||
txt_vision_states = self.vision_in(clip_fea)
|
||||
if self.cond_type_embedding is not None:
|
||||
cond_emb = self.cond_type_embedding(2 * torch.ones_like(txt_vision_states[:, :, 0], dtype=torch.long, device=txt_vision_states.device))
|
||||
txt_vision_states = txt_vision_states + cond_emb
|
||||
txt = torch.cat((txt_vision_states.to(txt.dtype), txt), dim=1)
|
||||
extra_txt_ids = torch.zeros((txt_ids.shape[0], txt_vision_states.shape[1], txt_ids.shape[-1]), device=txt_ids.device, dtype=txt_ids.dtype)
|
||||
txt_ids = torch.cat((txt_ids, extra_txt_ids), dim=1)
|
||||
|
||||
ids = torch.cat((img_ids, txt_ids), dim=1)
|
||||
pe = self.pe_embedder(ids)
|
||||
|
||||
@ -348,7 +389,10 @@ class HunyuanVideo(nn.Module):
|
||||
attn_mask = None
|
||||
|
||||
blocks_replace = patches_replace.get("dit", {})
|
||||
transformer_options["total_blocks"] = len(self.double_blocks)
|
||||
transformer_options["block_type"] = "double"
|
||||
for i, block in enumerate(self.double_blocks):
|
||||
transformer_options["block_index"] = i
|
||||
if ("double_block", i) in blocks_replace:
|
||||
def block_wrap_2(args):
|
||||
out = {}
|
||||
@ -370,7 +414,10 @@ class HunyuanVideo(nn.Module):
|
||||
|
||||
img = torch.cat((img, txt), 1)
|
||||
|
||||
transformer_options["total_blocks"] = len(self.single_blocks)
|
||||
transformer_options["block_type"] = "single"
|
||||
for i, block in enumerate(self.single_blocks):
|
||||
transformer_options["block_index"] = i
|
||||
if ("single_block", i) in blocks_replace:
|
||||
def block_wrap(args):
|
||||
out = {}
|
||||
@ -429,16 +476,16 @@ class HunyuanVideo(nn.Module):
|
||||
img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0)
|
||||
return repeat(img_ids, "h w c -> b (h w) c", b=bs)
|
||||
|
||||
def forward(self, x, timestep, context, y=None, txt_byt5=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options=None, **kwargs):
|
||||
def forward(self, x, timestep, context, y=None, txt_byt5=None, clip_fea=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options=None, **kwargs):
|
||||
if transformer_options is None:
|
||||
transformer_options = {}
|
||||
return WrapperExecutor.new_class_executor(
|
||||
self._forward,
|
||||
self,
|
||||
get_all_wrappers(WrappersMP.DIFFUSION_MODEL, transformer_options)
|
||||
).execute(x, timestep, context, y, txt_byt5, guidance, attention_mask, guiding_frame_index, ref_latent, disable_time_r, control, transformer_options, **kwargs)
|
||||
).execute(x, timestep, context, y, txt_byt5, clip_fea, guidance, attention_mask, guiding_frame_index, ref_latent, disable_time_r, control, transformer_options, **kwargs)
|
||||
|
||||
def _forward(self, x, timestep, context, y=None, txt_byt5=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options=None, **kwargs):
|
||||
def _forward(self, x, timestep, context, y=None, txt_byt5=None, clip_fea=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options=None, **kwargs):
|
||||
if transformer_options is None:
|
||||
transformer_options = {}
|
||||
bs = x.shape[0]
|
||||
@ -448,5 +495,5 @@ class HunyuanVideo(nn.Module):
|
||||
else:
|
||||
img_ids = self.img_ids_2d(x)
|
||||
txt_ids = torch.zeros((bs, context.shape[1], 2), device=x.device, dtype=x.dtype)
|
||||
out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, txt_byt5, guidance, guiding_frame_index, ref_latent, disable_time_r=disable_time_r, control=control, transformer_options=transformer_options)
|
||||
out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, txt_byt5, clip_fea, guidance, guiding_frame_index, ref_latent, disable_time_r=disable_time_r, control=control, transformer_options=transformer_options)
|
||||
return out
|
||||
|
||||
121
comfy/ldm/hunyuan_video/upsampler.py
Normal file
121
comfy/ldm/hunyuan_video/upsampler.py
Normal file
@ -0,0 +1,121 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from ..modules.diffusionmodules.model import ResnetBlock, VideoConv3d
|
||||
from .vae_refiner import RMS_norm
|
||||
from ... import model_management, model_patcher
|
||||
|
||||
class SRResidualCausalBlock3D(nn.Module):
|
||||
def __init__(self, channels: int):
|
||||
super().__init__()
|
||||
self.block = nn.Sequential(
|
||||
VideoConv3d(channels, channels, kernel_size=3),
|
||||
nn.SiLU(inplace=True),
|
||||
VideoConv3d(channels, channels, kernel_size=3),
|
||||
nn.SiLU(inplace=True),
|
||||
VideoConv3d(channels, channels, kernel_size=3),
|
||||
)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
return x + self.block(x)
|
||||
|
||||
class SRModel3DV2(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
in_channels: int,
|
||||
out_channels: int,
|
||||
hidden_channels: int = 64,
|
||||
num_blocks: int = 6,
|
||||
global_residual: bool = False,
|
||||
):
|
||||
super().__init__()
|
||||
self.in_conv = VideoConv3d(in_channels, hidden_channels, kernel_size=3)
|
||||
self.blocks = nn.ModuleList([SRResidualCausalBlock3D(hidden_channels) for _ in range(num_blocks)])
|
||||
self.out_conv = VideoConv3d(hidden_channels, out_channels, kernel_size=3)
|
||||
self.global_residual = bool(global_residual)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
residual = x
|
||||
y = self.in_conv(x)
|
||||
for blk in self.blocks:
|
||||
y = blk(y)
|
||||
y = self.out_conv(y)
|
||||
if self.global_residual and (y.shape == residual.shape):
|
||||
y = y + residual
|
||||
return y
|
||||
|
||||
|
||||
class Upsampler(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
z_channels: int,
|
||||
out_channels: int,
|
||||
block_out_channels: tuple[int, ...],
|
||||
num_res_blocks: int = 2,
|
||||
):
|
||||
super().__init__()
|
||||
self.num_res_blocks = num_res_blocks
|
||||
self.block_out_channels = block_out_channels
|
||||
self.z_channels = z_channels
|
||||
|
||||
ch = block_out_channels[0]
|
||||
self.conv_in = VideoConv3d(z_channels, ch, kernel_size=3)
|
||||
|
||||
self.up = nn.ModuleList()
|
||||
|
||||
for i, tgt in enumerate(block_out_channels):
|
||||
stage = nn.Module()
|
||||
stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt,
|
||||
out_channels=tgt,
|
||||
temb_channels=0,
|
||||
conv_shortcut=False,
|
||||
conv_op=VideoConv3d, norm_op=RMS_norm)
|
||||
for j in range(num_res_blocks + 1)])
|
||||
ch = tgt
|
||||
self.up.append(stage)
|
||||
|
||||
self.norm_out = RMS_norm(ch)
|
||||
self.conv_out = VideoConv3d(ch, out_channels, kernel_size=3)
|
||||
|
||||
def forward(self, z):
|
||||
"""
|
||||
Args:
|
||||
z: (B, C, T, H, W)
|
||||
target_shape: (H, W)
|
||||
"""
|
||||
# z to block_in
|
||||
repeats = self.block_out_channels[0] // (self.z_channels)
|
||||
x = self.conv_in(z) + z.repeat_interleave(repeats=repeats, dim=1)
|
||||
|
||||
# upsampling
|
||||
for stage in self.up:
|
||||
for blk in stage.block:
|
||||
x = blk(x)
|
||||
|
||||
out = self.conv_out(F.silu(self.norm_out(x)))
|
||||
return out
|
||||
|
||||
UPSAMPLERS = {
|
||||
"720p": SRModel3DV2,
|
||||
"1080p": Upsampler,
|
||||
}
|
||||
|
||||
class HunyuanVideo15SRModel():
|
||||
def __init__(self, model_type, config):
|
||||
self.load_device = model_management.vae_device()
|
||||
offload_device = model_management.vae_offload_device()
|
||||
self.dtype = model_management.vae_dtype(self.load_device)
|
||||
self.model_class = UPSAMPLERS.get(model_type)
|
||||
self.model = self.model_class(**config).eval()
|
||||
|
||||
self.patcher = model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
|
||||
|
||||
def load_sd(self, sd):
|
||||
return self.model.load_state_dict(sd, strict=True)
|
||||
|
||||
def get_sd(self):
|
||||
return self.model.state_dict()
|
||||
|
||||
def resample_latent(self, latent):
|
||||
model_management.load_model_gpu(self.patcher)
|
||||
return self.model(latent.to(self.load_device))
|
||||
@ -1,8 +1,11 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from ..modules.diffusionmodules.model import ResnetBlock, AttnBlock, VideoConv3d, Normalize
|
||||
|
||||
from ..models.autoencoder import DiagonalGaussianRegularizer
|
||||
from ..modules.diffusionmodules.model import ResnetBlock, AttnBlock, CarriedConv3d, Normalize, conv_carry_causal_3d, \
|
||||
torch_cat_if_needed
|
||||
from ...model_management import cast_to
|
||||
from ...ops import disable_weight_init as ops
|
||||
|
||||
|
||||
@ -14,11 +17,11 @@ class RMS_norm(nn.Module):
|
||||
self.gamma = nn.Parameter(torch.empty(shape))
|
||||
|
||||
def forward(self, x):
|
||||
return F.normalize(x, dim=1) * self.scale * self.gamma
|
||||
return F.normalize(x, dim=1) * self.scale * cast_to(self.gamma, dtype=x.dtype, device=x.device)
|
||||
|
||||
|
||||
class DnSmpl(nn.Module):
|
||||
def __init__(self, ic, oc, tds=True, refiner_vae=True, op=VideoConv3d):
|
||||
def __init__(self, ic, oc, tds, refiner_vae, op):
|
||||
super().__init__()
|
||||
fct = 2 * 2 * 2 if tds else 1 * 2 * 2
|
||||
assert oc % fct == 0
|
||||
@ -28,11 +31,11 @@ class DnSmpl(nn.Module):
|
||||
self.tds = tds
|
||||
self.gs = fct * ic // oc
|
||||
|
||||
def forward(self, x):
|
||||
def forward(self, x, conv_carry_in=None, conv_carry_out=None):
|
||||
r1 = 2 if self.tds else 1
|
||||
h = self.conv(x)
|
||||
h = conv_carry_causal_3d([x], self.conv, conv_carry_in, conv_carry_out)
|
||||
|
||||
if self.tds and self.refiner_vae:
|
||||
if self.tds and self.refiner_vae and conv_carry_in is None:
|
||||
hf = h[:, :, :1, :, :]
|
||||
b, c, f, ht, wd = hf.shape
|
||||
hf = hf.reshape(b, c, f, ht // 2, 2, wd // 2, 2)
|
||||
@ -40,14 +43,7 @@ class DnSmpl(nn.Module):
|
||||
hf = hf.reshape(b, 2 * 2 * c, f, ht // 2, wd // 2)
|
||||
hf = torch.cat([hf, hf], dim=1)
|
||||
|
||||
hn = h[:, :, 1:, :, :]
|
||||
b, c, frms, ht, wd = hn.shape
|
||||
nf = frms // r1
|
||||
hn = hn.reshape(b, c, nf, r1, ht // 2, 2, wd // 2, 2)
|
||||
hn = hn.permute(0, 3, 5, 7, 1, 2, 4, 6)
|
||||
hn = hn.reshape(b, r1 * 2 * 2 * c, nf, ht // 2, wd // 2)
|
||||
|
||||
h = torch.cat([hf, hn], dim=2)
|
||||
h = h[:, :, 1:, :, :]
|
||||
|
||||
xf = x[:, :, :1, :, :]
|
||||
b, ci, f, ht, wd = xf.shape
|
||||
@ -55,38 +51,36 @@ class DnSmpl(nn.Module):
|
||||
xf = xf.permute(0, 4, 6, 1, 2, 3, 5)
|
||||
xf = xf.reshape(b, 2 * 2 * ci, f, ht // 2, wd // 2)
|
||||
B, C, T, H, W = xf.shape
|
||||
xf = xf.view(B, h.shape[1], self.gs // 2, T, H, W).mean(dim=2)
|
||||
xf = xf.view(B, hf.shape[1], self.gs // 2, T, H, W).mean(dim=2)
|
||||
|
||||
xn = x[:, :, 1:, :, :]
|
||||
b, ci, frms, ht, wd = xn.shape
|
||||
nf = frms // r1
|
||||
xn = xn.reshape(b, ci, nf, r1, ht // 2, 2, wd // 2, 2)
|
||||
xn = xn.permute(0, 3, 5, 7, 1, 2, 4, 6)
|
||||
xn = xn.reshape(b, r1 * 2 * 2 * ci, nf, ht // 2, wd // 2)
|
||||
B, C, T, H, W = xn.shape
|
||||
xn = xn.view(B, h.shape[1], self.gs, T, H, W).mean(dim=2)
|
||||
sc = torch.cat([xf, xn], dim=2)
|
||||
else:
|
||||
b, c, frms, ht, wd = h.shape
|
||||
x = x[:, :, 1:, :, :]
|
||||
|
||||
nf = frms // r1
|
||||
h = h.reshape(b, c, nf, r1, ht // 2, 2, wd // 2, 2)
|
||||
h = h.permute(0, 3, 5, 7, 1, 2, 4, 6)
|
||||
h = h.reshape(b, r1 * 2 * 2 * c, nf, ht // 2, wd // 2)
|
||||
if h.shape[2] == 0:
|
||||
return hf + xf
|
||||
|
||||
b, ci, frms, ht, wd = x.shape
|
||||
nf = frms // r1
|
||||
sc = x.reshape(b, ci, nf, r1, ht // 2, 2, wd // 2, 2)
|
||||
sc = sc.permute(0, 3, 5, 7, 1, 2, 4, 6)
|
||||
sc = sc.reshape(b, r1 * 2 * 2 * ci, nf, ht // 2, wd // 2)
|
||||
B, C, T, H, W = sc.shape
|
||||
sc = sc.view(B, h.shape[1], self.gs, T, H, W).mean(dim=2)
|
||||
b, c, frms, ht, wd = h.shape
|
||||
nf = frms // r1
|
||||
h = h.reshape(b, c, nf, r1, ht // 2, 2, wd // 2, 2)
|
||||
h = h.permute(0, 3, 5, 7, 1, 2, 4, 6)
|
||||
h = h.reshape(b, r1 * 2 * 2 * c, nf, ht // 2, wd // 2)
|
||||
|
||||
return h + sc
|
||||
b, ci, frms, ht, wd = x.shape
|
||||
nf = frms // r1
|
||||
x = x.reshape(b, ci, nf, r1, ht // 2, 2, wd // 2, 2)
|
||||
x = x.permute(0, 3, 5, 7, 1, 2, 4, 6)
|
||||
x = x.reshape(b, r1 * 2 * 2 * ci, nf, ht // 2, wd // 2)
|
||||
B, C, T, H, W = x.shape
|
||||
x = x.view(B, h.shape[1], self.gs, T, H, W).mean(dim=2)
|
||||
|
||||
if self.tds and self.refiner_vae and conv_carry_in is None:
|
||||
h = torch.cat([hf, h], dim=2)
|
||||
x = torch.cat([xf, x], dim=2)
|
||||
|
||||
return h + x
|
||||
|
||||
|
||||
class UpSmpl(nn.Module):
|
||||
def __init__(self, ic, oc, tus=True, refiner_vae=True, op=VideoConv3d):
|
||||
def __init__(self, ic, oc, tus, refiner_vae, op):
|
||||
super().__init__()
|
||||
fct = 2 * 2 * 2 if tus else 1 * 2 * 2
|
||||
self.conv = op(ic, oc * fct, kernel_size=3, stride=1, padding=1)
|
||||
@ -95,11 +89,11 @@ class UpSmpl(nn.Module):
|
||||
self.tus = tus
|
||||
self.rp = fct * oc // ic
|
||||
|
||||
def forward(self, x):
|
||||
def forward(self, x, conv_carry_in=None, conv_carry_out=None):
|
||||
r1 = 2 if self.tus else 1
|
||||
h = self.conv(x)
|
||||
h = conv_carry_causal_3d([x], self.conv, conv_carry_in, conv_carry_out)
|
||||
|
||||
if self.tus and self.refiner_vae:
|
||||
if self.tus and self.refiner_vae and conv_carry_in is None:
|
||||
hf = h[:, :, :1, :, :]
|
||||
b, c, f, ht, wd = hf.shape
|
||||
nc = c // (2 * 2)
|
||||
@ -108,14 +102,7 @@ class UpSmpl(nn.Module):
|
||||
hf = hf.reshape(b, nc, f, ht * 2, wd * 2)
|
||||
hf = hf[:, : hf.shape[1] // 2]
|
||||
|
||||
hn = h[:, :, 1:, :, :]
|
||||
b, c, frms, ht, wd = hn.shape
|
||||
nc = c // (r1 * 2 * 2)
|
||||
hn = hn.reshape(b, r1, 2, 2, nc, frms, ht, wd)
|
||||
hn = hn.permute(0, 4, 5, 1, 6, 2, 7, 3)
|
||||
hn = hn.reshape(b, nc, frms * r1, ht * 2, wd * 2)
|
||||
|
||||
h = torch.cat([hf, hn], dim=2)
|
||||
h = h[:, :, 1:, :, :]
|
||||
|
||||
xf = x[:, :, :1, :, :]
|
||||
b, ci, f, ht, wd = xf.shape
|
||||
@ -126,29 +113,26 @@ class UpSmpl(nn.Module):
|
||||
xf = xf.permute(0, 3, 4, 5, 1, 6, 2)
|
||||
xf = xf.reshape(b, nc, f, ht * 2, wd * 2)
|
||||
|
||||
xn = x[:, :, 1:, :, :]
|
||||
xn = xn.repeat_interleave(repeats=self.rp, dim=1)
|
||||
b, c, frms, ht, wd = xn.shape
|
||||
nc = c // (r1 * 2 * 2)
|
||||
xn = xn.reshape(b, r1, 2, 2, nc, frms, ht, wd)
|
||||
xn = xn.permute(0, 4, 5, 1, 6, 2, 7, 3)
|
||||
xn = xn.reshape(b, nc, frms * r1, ht * 2, wd * 2)
|
||||
sc = torch.cat([xf, xn], dim=2)
|
||||
else:
|
||||
b, c, frms, ht, wd = h.shape
|
||||
nc = c // (r1 * 2 * 2)
|
||||
h = h.reshape(b, r1, 2, 2, nc, frms, ht, wd)
|
||||
h = h.permute(0, 4, 5, 1, 6, 2, 7, 3)
|
||||
h = h.reshape(b, nc, frms * r1, ht * 2, wd * 2)
|
||||
x = x[:, :, 1:, :, :]
|
||||
|
||||
sc = x.repeat_interleave(repeats=self.rp, dim=1)
|
||||
b, c, frms, ht, wd = sc.shape
|
||||
nc = c // (r1 * 2 * 2)
|
||||
sc = sc.reshape(b, r1, 2, 2, nc, frms, ht, wd)
|
||||
sc = sc.permute(0, 4, 5, 1, 6, 2, 7, 3)
|
||||
sc = sc.reshape(b, nc, frms * r1, ht * 2, wd * 2)
|
||||
b, c, frms, ht, wd = h.shape
|
||||
nc = c // (r1 * 2 * 2)
|
||||
h = h.reshape(b, r1, 2, 2, nc, frms, ht, wd)
|
||||
h = h.permute(0, 4, 5, 1, 6, 2, 7, 3)
|
||||
h = h.reshape(b, nc, frms * r1, ht * 2, wd * 2)
|
||||
|
||||
return h + sc
|
||||
x = x.repeat_interleave(repeats=self.rp, dim=1)
|
||||
b, c, frms, ht, wd = x.shape
|
||||
nc = c // (r1 * 2 * 2)
|
||||
x = x.reshape(b, r1, 2, 2, nc, frms, ht, wd)
|
||||
x = x.permute(0, 4, 5, 1, 6, 2, 7, 3)
|
||||
x = x.reshape(b, nc, frms * r1, ht * 2, wd * 2)
|
||||
|
||||
if self.tus and self.refiner_vae and conv_carry_in is None:
|
||||
h = torch.cat([hf, h], dim=2)
|
||||
x = torch.cat([xf, x], dim=2)
|
||||
|
||||
return h + x
|
||||
|
||||
|
||||
class Encoder(nn.Module):
|
||||
@ -162,7 +146,7 @@ class Encoder(nn.Module):
|
||||
|
||||
self.refiner_vae = refiner_vae
|
||||
if self.refiner_vae:
|
||||
conv_op = VideoConv3d
|
||||
conv_op = CarriedConv3d
|
||||
norm_op = RMS_norm
|
||||
else:
|
||||
conv_op = ops.Conv3d
|
||||
@ -190,9 +174,9 @@ class Encoder(nn.Module):
|
||||
self.down.append(stage)
|
||||
|
||||
self.mid = nn.Module()
|
||||
self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op)
|
||||
self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
|
||||
self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=norm_op)
|
||||
self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op)
|
||||
self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
|
||||
|
||||
self.norm_out = norm_op(ch)
|
||||
self.conv_out = conv_op(ch, z_channels << 1, 3, 1, 1)
|
||||
@ -203,31 +187,48 @@ class Encoder(nn.Module):
|
||||
if not self.refiner_vae and x.shape[2] == 1:
|
||||
x = x.expand(-1, -1, self.ffactor_temporal, -1, -1)
|
||||
|
||||
x = self.conv_in(x)
|
||||
if self.refiner_vae:
|
||||
xl = [x[:, :, :1, :, :]]
|
||||
if x.shape[2] > self.ffactor_temporal:
|
||||
xl += torch.split(x[:, :, 1: 1 + ((x.shape[2] - 1) // self.ffactor_temporal) * self.ffactor_temporal, :, :], self.ffactor_temporal * 2, dim=2)
|
||||
x = xl
|
||||
else:
|
||||
x = [x]
|
||||
out = []
|
||||
|
||||
for stage in self.down:
|
||||
for blk in stage.block:
|
||||
x = blk(x)
|
||||
if hasattr(stage, 'downsample'):
|
||||
x = stage.downsample(x)
|
||||
conv_carry_in = None
|
||||
|
||||
x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(x)))
|
||||
for i, x1 in enumerate(x):
|
||||
conv_carry_out = []
|
||||
if i == len(x) - 1:
|
||||
conv_carry_out = None
|
||||
|
||||
x1 = [x1]
|
||||
x1 = conv_carry_causal_3d(x1, self.conv_in, conv_carry_in, conv_carry_out)
|
||||
|
||||
for stage in self.down:
|
||||
for blk in stage.block:
|
||||
x1 = blk(x1, None, conv_carry_in, conv_carry_out)
|
||||
if hasattr(stage, 'downsample'):
|
||||
x1 = stage.downsample(x1, conv_carry_in, conv_carry_out)
|
||||
|
||||
out.append(x1)
|
||||
conv_carry_in = conv_carry_out
|
||||
|
||||
out = torch_cat_if_needed(out, dim=2)
|
||||
|
||||
x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(out)))
|
||||
del out
|
||||
|
||||
b, c, t, h, w = x.shape
|
||||
grp = c // (self.z_channels << 1)
|
||||
skip = x.view(b, c // grp, grp, t, h, w).mean(2)
|
||||
|
||||
out = self.conv_out(F.silu(self.norm_out(x))) + skip
|
||||
out = conv_carry_causal_3d([F.silu(self.norm_out(x))], self.conv_out) + skip
|
||||
|
||||
if self.refiner_vae:
|
||||
out = self.regul(out)[0]
|
||||
|
||||
out = torch.cat((out[:, :, :1], out), dim=2)
|
||||
out = out.permute(0, 2, 1, 3, 4)
|
||||
b, f_times_2, c, h, w = out.shape
|
||||
out = out.reshape(b, f_times_2 // 2, 2 * c, h, w)
|
||||
out = out.permute(0, 2, 1, 3, 4).contiguous()
|
||||
|
||||
return out
|
||||
|
||||
|
||||
@ -242,7 +243,7 @@ class Decoder(nn.Module):
|
||||
|
||||
self.refiner_vae = refiner_vae
|
||||
if self.refiner_vae:
|
||||
conv_op = VideoConv3d
|
||||
conv_op = CarriedConv3d
|
||||
norm_op = RMS_norm
|
||||
else:
|
||||
conv_op = ops.Conv3d
|
||||
@ -252,9 +253,9 @@ class Decoder(nn.Module):
|
||||
self.conv_in = conv_op(z_channels, ch, kernel_size=3, stride=1, padding=1)
|
||||
|
||||
self.mid = nn.Module()
|
||||
self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op)
|
||||
self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
|
||||
self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=norm_op)
|
||||
self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op)
|
||||
self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
|
||||
|
||||
self.up = nn.ModuleList()
|
||||
depth = (ffactor_spatial >> 1).bit_length()
|
||||
@ -278,24 +279,34 @@ class Decoder(nn.Module):
|
||||
self.conv_out = conv_op(ch, out_channels, 3, stride=1, padding=1)
|
||||
|
||||
def forward(self, z):
|
||||
if self.refiner_vae:
|
||||
z = z.permute(0, 2, 1, 3, 4)
|
||||
b, f, c, h, w = z.shape
|
||||
z = z.reshape(b, f, 2, c // 2, h, w)
|
||||
z = z.permute(0, 1, 2, 3, 4, 5).reshape(b, f * 2, c // 2, h, w)
|
||||
z = z.permute(0, 2, 1, 3, 4)
|
||||
z = z[:, :, 1:]
|
||||
|
||||
x = self.conv_in(z) + z.repeat_interleave(self.block_out_channels[0] // self.z_channels, 1)
|
||||
x = conv_carry_causal_3d([z], self.conv_in) + z.repeat_interleave(self.block_out_channels[0] // self.z_channels, 1)
|
||||
x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(x)))
|
||||
|
||||
for stage in self.up:
|
||||
for blk in stage.block:
|
||||
x = blk(x)
|
||||
if hasattr(stage, 'upsample'):
|
||||
x = stage.upsample(x)
|
||||
if self.refiner_vae:
|
||||
x = torch.split(x, 2, dim=2)
|
||||
else:
|
||||
x = [x]
|
||||
out = []
|
||||
|
||||
out = self.conv_out(F.silu(self.norm_out(x)))
|
||||
conv_carry_in = None
|
||||
|
||||
for i, x1 in enumerate(x):
|
||||
conv_carry_out = []
|
||||
if i == len(x) - 1:
|
||||
conv_carry_out = None
|
||||
for stage in self.up:
|
||||
for blk in stage.block:
|
||||
x1 = blk(x1, None, conv_carry_in, conv_carry_out)
|
||||
if hasattr(stage, 'upsample'):
|
||||
x1 = stage.upsample(x1, conv_carry_in, conv_carry_out)
|
||||
|
||||
x1 = [F.silu(self.norm_out(x1))]
|
||||
x1 = conv_carry_causal_3d(x1, self.conv_out, conv_carry_in, conv_carry_out)
|
||||
out.append(x1)
|
||||
conv_carry_in = conv_carry_out
|
||||
del x
|
||||
|
||||
out = torch_cat_if_needed(out, dim=2)
|
||||
|
||||
if not self.refiner_vae:
|
||||
if z.shape[-3] == 1:
|
||||
|
||||
0
comfy/ldm/kandinsky5/__init__.py
Normal file
0
comfy/ldm/kandinsky5/__init__.py
Normal file
445
comfy/ldm/kandinsky5/model.py
Normal file
445
comfy/ldm/kandinsky5/model.py
Normal file
@ -0,0 +1,445 @@
|
||||
import torch
|
||||
from torch import nn
|
||||
import math
|
||||
|
||||
from ..common_dit import pad_to_patch_size
|
||||
from ..modules.attention import optimized_attention
|
||||
from ..flux.math import apply_rope1
|
||||
from ..flux.layers import EmbedND
|
||||
from ... import patcher_extension
|
||||
|
||||
|
||||
def attention(q, k, v, heads, transformer_options=None):
|
||||
if transformer_options is None:
|
||||
transformer_options = {}
|
||||
return optimized_attention(
|
||||
q.transpose(1, 2),
|
||||
k.transpose(1, 2),
|
||||
v.transpose(1, 2),
|
||||
heads=heads,
|
||||
skip_reshape=True,
|
||||
transformer_options=transformer_options
|
||||
)
|
||||
|
||||
|
||||
def apply_scale_shift_norm(norm, x, scale, shift):
|
||||
return torch.addcmul(shift, norm(x), scale + 1.0)
|
||||
|
||||
|
||||
def apply_gate_sum(x, out, gate):
|
||||
return torch.addcmul(x, gate, out)
|
||||
|
||||
|
||||
def get_shift_scale_gate(params):
|
||||
shift, scale, gate = torch.chunk(params, 3, dim=-1)
|
||||
return tuple(x.unsqueeze(1) for x in (shift, scale, gate))
|
||||
|
||||
|
||||
def get_freqs(dim, max_period=10000.0):
|
||||
return torch.exp(-math.log(max_period) * torch.arange(start=0, end=dim, dtype=torch.float32) / dim)
|
||||
|
||||
|
||||
class TimeEmbeddings(nn.Module):
|
||||
def __init__(self, model_dim, time_dim, max_period=10000.0, operation_settings=None):
|
||||
super().__init__()
|
||||
assert model_dim % 2 == 0
|
||||
self.model_dim = model_dim
|
||||
self.max_period = max_period
|
||||
self.register_buffer("freqs", get_freqs(model_dim // 2, max_period), persistent=False)
|
||||
operations = operation_settings.get("operations")
|
||||
self.in_layer = operations.Linear(model_dim, time_dim, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||
self.activation = nn.SiLU()
|
||||
self.out_layer = operations.Linear(time_dim, time_dim, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||
|
||||
def forward(self, timestep, dtype):
|
||||
args = torch.outer(timestep, self.freqs.to(device=timestep.device))
|
||||
time_embed = torch.cat([torch.cos(args), torch.sin(args)], dim=-1).to(dtype)
|
||||
time_embed = self.out_layer(self.activation(self.in_layer(time_embed)))
|
||||
return time_embed
|
||||
|
||||
|
||||
class TextEmbeddings(nn.Module):
|
||||
def __init__(self, text_dim, model_dim, operation_settings=None):
|
||||
super().__init__()
|
||||
operations = operation_settings.get("operations")
|
||||
self.in_layer = operations.Linear(text_dim, model_dim, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||
self.norm = operations.LayerNorm(model_dim, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||
|
||||
def forward(self, text_embed):
|
||||
text_embed = self.in_layer(text_embed)
|
||||
return self.norm(text_embed).type_as(text_embed)
|
||||
|
||||
|
||||
class VisualEmbeddings(nn.Module):
|
||||
def __init__(self, visual_dim, model_dim, patch_size, operation_settings=None):
|
||||
super().__init__()
|
||||
self.patch_size = patch_size
|
||||
operations = operation_settings.get("operations")
|
||||
self.in_layer = operations.Linear(visual_dim, model_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||
|
||||
def forward(self, x):
|
||||
x = x.movedim(1, -1) # B C T H W -> B T H W C
|
||||
B, T, H, W, dim = x.shape
|
||||
pt, ph, pw = self.patch_size
|
||||
|
||||
x = x.view(
|
||||
B,
|
||||
T // pt, pt,
|
||||
H // ph, ph,
|
||||
W // pw, pw,
|
||||
dim,
|
||||
).permute(0, 1, 3, 5, 2, 4, 6, 7).flatten(4, 7)
|
||||
|
||||
return self.in_layer(x)
|
||||
|
||||
|
||||
class Modulation(nn.Module):
|
||||
def __init__(self, time_dim, model_dim, num_params, operation_settings=None):
|
||||
super().__init__()
|
||||
self.activation = nn.SiLU()
|
||||
self.out_layer = operation_settings.get("operations").Linear(time_dim, num_params * model_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||
|
||||
def forward(self, x):
|
||||
return self.out_layer(self.activation(x))
|
||||
|
||||
|
||||
class SelfAttention(nn.Module):
|
||||
def __init__(self, num_channels, head_dim, operation_settings=None):
|
||||
super().__init__()
|
||||
assert num_channels % head_dim == 0
|
||||
self.num_heads = num_channels // head_dim
|
||||
self.head_dim = head_dim
|
||||
|
||||
operations = operation_settings.get("operations")
|
||||
self.to_query = operations.Linear(num_channels, num_channels, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||
self.to_key = operations.Linear(num_channels, num_channels, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||
self.to_value = operations.Linear(num_channels, num_channels, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||
self.query_norm = operations.RMSNorm(head_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||
self.key_norm = operations.RMSNorm(head_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||
|
||||
self.out_layer = operations.Linear(num_channels, num_channels, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||
self.num_chunks = 2
|
||||
|
||||
def _compute_qk(self, x, freqs, proj_fn, norm_fn):
|
||||
result = proj_fn(x).view(*x.shape[:-1], self.num_heads, -1)
|
||||
return apply_rope1(norm_fn(result), freqs)
|
||||
|
||||
def _forward(self, x, freqs, transformer_options=None):
|
||||
if transformer_options is None:
|
||||
transformer_options = {}
|
||||
q = self._compute_qk(x, freqs, self.to_query, self.query_norm)
|
||||
k = self._compute_qk(x, freqs, self.to_key, self.key_norm)
|
||||
v = self.to_value(x).view(*x.shape[:-1], self.num_heads, -1)
|
||||
out = attention(q, k, v, self.num_heads, transformer_options=transformer_options)
|
||||
return self.out_layer(out)
|
||||
|
||||
def _forward_chunked(self, x, freqs, transformer_options=None):
|
||||
if transformer_options is None:
|
||||
transformer_options = {}
|
||||
|
||||
def process_chunks(proj_fn, norm_fn):
|
||||
x_chunks = torch.chunk(x, self.num_chunks, dim=1)
|
||||
freqs_chunks = torch.chunk(freqs, self.num_chunks, dim=1)
|
||||
chunks = []
|
||||
for x_chunk, freqs_chunk in zip(x_chunks, freqs_chunks):
|
||||
chunks.append(self._compute_qk(x_chunk, freqs_chunk, proj_fn, norm_fn))
|
||||
return torch.cat(chunks, dim=1)
|
||||
|
||||
q = process_chunks(self.to_query, self.query_norm)
|
||||
k = process_chunks(self.to_key, self.key_norm)
|
||||
v = self.to_value(x).view(*x.shape[:-1], self.num_heads, -1)
|
||||
out = attention(q, k, v, self.num_heads, transformer_options=transformer_options)
|
||||
return self.out_layer(out)
|
||||
|
||||
def forward(self, x, freqs, transformer_options=None):
|
||||
if transformer_options is None:
|
||||
transformer_options = {}
|
||||
if x.shape[1] > 8192:
|
||||
return self._forward_chunked(x, freqs, transformer_options=transformer_options)
|
||||
else:
|
||||
return self._forward(x, freqs, transformer_options=transformer_options)
|
||||
|
||||
|
||||
class CrossAttention(SelfAttention):
|
||||
def get_qkv(self, x, context):
|
||||
q = self.to_query(x).view(*x.shape[:-1], self.num_heads, -1)
|
||||
k = self.to_key(context).view(*context.shape[:-1], self.num_heads, -1)
|
||||
v = self.to_value(context).view(*context.shape[:-1], self.num_heads, -1)
|
||||
return q, k, v
|
||||
|
||||
def forward(self, x, context, transformer_options=None):
|
||||
if transformer_options is None:
|
||||
transformer_options = {}
|
||||
q, k, v = self.get_qkv(x, context)
|
||||
out = attention(self.query_norm(q), self.key_norm(k), v, self.num_heads, transformer_options=transformer_options)
|
||||
return self.out_layer(out)
|
||||
|
||||
|
||||
class FeedForward(nn.Module):
|
||||
def __init__(self, dim, ff_dim, operation_settings=None):
|
||||
super().__init__()
|
||||
operations = operation_settings.get("operations")
|
||||
self.in_layer = operations.Linear(dim, ff_dim, bias=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||
self.activation = nn.GELU()
|
||||
self.out_layer = operations.Linear(ff_dim, dim, bias=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||
self.num_chunks = 4
|
||||
|
||||
def _forward(self, x):
|
||||
return self.out_layer(self.activation(self.in_layer(x)))
|
||||
|
||||
def _forward_chunked(self, x):
|
||||
chunks = torch.chunk(x, self.num_chunks, dim=1)
|
||||
output_chunks = []
|
||||
for chunk in chunks:
|
||||
output_chunks.append(self._forward(chunk))
|
||||
return torch.cat(output_chunks, dim=1)
|
||||
|
||||
def forward(self, x):
|
||||
if x.shape[1] > 8192:
|
||||
return self._forward_chunked(x)
|
||||
else:
|
||||
return self._forward(x)
|
||||
|
||||
|
||||
class OutLayer(nn.Module):
|
||||
def __init__(self, model_dim, time_dim, visual_dim, patch_size, operation_settings=None):
|
||||
super().__init__()
|
||||
self.patch_size = patch_size
|
||||
self.modulation = Modulation(time_dim, model_dim, 2, operation_settings=operation_settings)
|
||||
operations = operation_settings.get("operations")
|
||||
self.norm = operations.LayerNorm(model_dim, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||
self.out_layer = operations.Linear(model_dim, math.prod(patch_size) * visual_dim, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||
|
||||
def forward(self, visual_embed, time_embed):
|
||||
B, T, H, W, _ = visual_embed.shape
|
||||
shift, scale = torch.chunk(self.modulation(time_embed), 2, dim=-1)
|
||||
scale = scale[:, None, None, None, :]
|
||||
shift = shift[:, None, None, None, :]
|
||||
visual_embed = apply_scale_shift_norm(self.norm, visual_embed, scale, shift)
|
||||
x = self.out_layer(visual_embed)
|
||||
|
||||
out_dim = x.shape[-1] // (self.patch_size[0] * self.patch_size[1] * self.patch_size[2])
|
||||
x = x.view(
|
||||
B, T, H, W,
|
||||
out_dim,
|
||||
self.patch_size[0], self.patch_size[1], self.patch_size[2]
|
||||
)
|
||||
return x.permute(0, 4, 1, 5, 2, 6, 3, 7).flatten(2, 3).flatten(3, 4).flatten(4, 5)
|
||||
|
||||
|
||||
class TransformerEncoderBlock(nn.Module):
|
||||
def __init__(self, model_dim, time_dim, ff_dim, head_dim, operation_settings=None):
|
||||
super().__init__()
|
||||
self.text_modulation = Modulation(time_dim, model_dim, 6, operation_settings=operation_settings)
|
||||
operations = operation_settings.get("operations")
|
||||
|
||||
self.self_attention_norm = operations.LayerNorm(model_dim, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||
self.self_attention = SelfAttention(model_dim, head_dim, operation_settings=operation_settings)
|
||||
|
||||
self.feed_forward_norm = operations.LayerNorm(model_dim, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||
self.feed_forward = FeedForward(model_dim, ff_dim, operation_settings=operation_settings)
|
||||
|
||||
def forward(self, x, time_embed, freqs, transformer_options=None):
|
||||
if transformer_options is None:
|
||||
transformer_options = {}
|
||||
self_attn_params, ff_params = torch.chunk(self.text_modulation(time_embed), 2, dim=-1)
|
||||
shift, scale, gate = get_shift_scale_gate(self_attn_params)
|
||||
out = apply_scale_shift_norm(self.self_attention_norm, x, scale, shift)
|
||||
out = self.self_attention(out, freqs, transformer_options=transformer_options)
|
||||
x = apply_gate_sum(x, out, gate)
|
||||
|
||||
shift, scale, gate = get_shift_scale_gate(ff_params)
|
||||
out = apply_scale_shift_norm(self.feed_forward_norm, x, scale, shift)
|
||||
out = self.feed_forward(out)
|
||||
x = apply_gate_sum(x, out, gate)
|
||||
return x
|
||||
|
||||
|
||||
class TransformerDecoderBlock(nn.Module):
|
||||
def __init__(self, model_dim, time_dim, ff_dim, head_dim, operation_settings=None):
|
||||
super().__init__()
|
||||
self.visual_modulation = Modulation(time_dim, model_dim, 9, operation_settings=operation_settings)
|
||||
|
||||
operations = operation_settings.get("operations")
|
||||
self.self_attention_norm = operations.LayerNorm(model_dim, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||
self.self_attention = SelfAttention(model_dim, head_dim, operation_settings=operation_settings)
|
||||
|
||||
self.cross_attention_norm = operations.LayerNorm(model_dim, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||
self.cross_attention = CrossAttention(model_dim, head_dim, operation_settings=operation_settings)
|
||||
|
||||
self.feed_forward_norm = operations.LayerNorm(model_dim, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||
self.feed_forward = FeedForward(model_dim, ff_dim, operation_settings=operation_settings)
|
||||
|
||||
def forward(self, visual_embed, text_embed, time_embed, freqs, transformer_options=None):
|
||||
if transformer_options is None:
|
||||
transformer_options = {}
|
||||
self_attn_params, cross_attn_params, ff_params = torch.chunk(self.visual_modulation(time_embed), 3, dim=-1)
|
||||
# self attention
|
||||
shift, scale, gate = get_shift_scale_gate(self_attn_params)
|
||||
visual_out = apply_scale_shift_norm(self.self_attention_norm, visual_embed, scale, shift)
|
||||
visual_out = self.self_attention(visual_out, freqs, transformer_options=transformer_options)
|
||||
visual_embed = apply_gate_sum(visual_embed, visual_out, gate)
|
||||
# cross attention
|
||||
shift, scale, gate = get_shift_scale_gate(cross_attn_params)
|
||||
visual_out = apply_scale_shift_norm(self.cross_attention_norm, visual_embed, scale, shift)
|
||||
visual_out = self.cross_attention(visual_out, text_embed, transformer_options=transformer_options)
|
||||
visual_embed = apply_gate_sum(visual_embed, visual_out, gate)
|
||||
# feed forward
|
||||
shift, scale, gate = get_shift_scale_gate(ff_params)
|
||||
visual_out = apply_scale_shift_norm(self.feed_forward_norm, visual_embed, scale, shift)
|
||||
visual_out = self.feed_forward(visual_out)
|
||||
visual_embed = apply_gate_sum(visual_embed, visual_out, gate)
|
||||
return visual_embed
|
||||
|
||||
|
||||
class Kandinsky5(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
in_visual_dim=16, out_visual_dim=16, in_text_dim=3584, in_text_dim2=768, time_dim=512,
|
||||
model_dim=1792, ff_dim=7168, visual_embed_dim=132, patch_size=(1, 2, 2), num_text_blocks=2, num_visual_blocks=32,
|
||||
axes_dims=(16, 24, 24), rope_scale_factor=(1.0, 2.0, 2.0),
|
||||
dtype=None, device=None, operations=None, **kwargs
|
||||
):
|
||||
super().__init__()
|
||||
head_dim = sum(axes_dims)
|
||||
self.rope_scale_factor = rope_scale_factor
|
||||
self.in_visual_dim = in_visual_dim
|
||||
self.model_dim = model_dim
|
||||
self.patch_size = patch_size
|
||||
self.visual_embed_dim = visual_embed_dim
|
||||
self.dtype = dtype
|
||||
self.device = device
|
||||
operation_settings = {"operations": operations, "device": device, "dtype": dtype}
|
||||
|
||||
self.time_embeddings = TimeEmbeddings(model_dim, time_dim, operation_settings=operation_settings)
|
||||
self.text_embeddings = TextEmbeddings(in_text_dim, model_dim, operation_settings=operation_settings)
|
||||
self.pooled_text_embeddings = TextEmbeddings(in_text_dim2, time_dim, operation_settings=operation_settings)
|
||||
self.visual_embeddings = VisualEmbeddings(visual_embed_dim, model_dim, patch_size, operation_settings=operation_settings)
|
||||
|
||||
self.text_transformer_blocks = nn.ModuleList(
|
||||
[TransformerEncoderBlock(model_dim, time_dim, ff_dim, head_dim, operation_settings=operation_settings) for _ in range(num_text_blocks)]
|
||||
)
|
||||
|
||||
self.visual_transformer_blocks = nn.ModuleList(
|
||||
[TransformerDecoderBlock(model_dim, time_dim, ff_dim, head_dim, operation_settings=operation_settings) for _ in range(num_visual_blocks)]
|
||||
)
|
||||
|
||||
self.out_layer = OutLayer(model_dim, time_dim, out_visual_dim, patch_size, operation_settings=operation_settings)
|
||||
|
||||
self.rope_embedder_3d = EmbedND(dim=head_dim, theta=10000.0, axes_dim=axes_dims)
|
||||
self.rope_embedder_1d = EmbedND(dim=head_dim, theta=10000.0, axes_dim=[head_dim])
|
||||
|
||||
def rope_encode_1d(self, seq_len, seq_start=0, steps=None, device=None, dtype=None, transformer_options=None):
|
||||
if transformer_options is None:
|
||||
transformer_options = {}
|
||||
steps = seq_len if steps is None else steps
|
||||
seq_ids = torch.linspace(seq_start, seq_start + (seq_len - 1), steps=steps, device=device, dtype=dtype)
|
||||
seq_ids = seq_ids.reshape(-1, 1).unsqueeze(0) # Shape: (1, steps, 1)
|
||||
freqs = self.rope_embedder_1d(seq_ids).movedim(1, 2)
|
||||
return freqs
|
||||
|
||||
def rope_encode_3d(self, t, h, w, t_start=0, steps_t=None, steps_h=None, steps_w=None, device=None, dtype=None, transformer_options=None):
|
||||
|
||||
if transformer_options is None:
|
||||
transformer_options = {}
|
||||
patch_size = self.patch_size
|
||||
t_len = ((t + (patch_size[0] // 2)) // patch_size[0])
|
||||
h_len = ((h + (patch_size[1] // 2)) // patch_size[1])
|
||||
w_len = ((w + (patch_size[2] // 2)) // patch_size[2])
|
||||
|
||||
if steps_t is None:
|
||||
steps_t = t_len
|
||||
if steps_h is None:
|
||||
steps_h = h_len
|
||||
if steps_w is None:
|
||||
steps_w = w_len
|
||||
|
||||
h_start = 0
|
||||
w_start = 0
|
||||
rope_options = transformer_options.get("rope_options", None)
|
||||
if rope_options is not None:
|
||||
t_len = (t_len - 1.0) * rope_options.get("scale_t", 1.0) + 1.0
|
||||
h_len = (h_len - 1.0) * rope_options.get("scale_y", 1.0) + 1.0
|
||||
w_len = (w_len - 1.0) * rope_options.get("scale_x", 1.0) + 1.0
|
||||
|
||||
t_start += rope_options.get("shift_t", 0.0)
|
||||
h_start += rope_options.get("shift_y", 0.0)
|
||||
w_start += rope_options.get("shift_x", 0.0)
|
||||
else:
|
||||
rope_scale_factor = self.rope_scale_factor
|
||||
if self.model_dim == 4096: # pro video model uses different rope scaling at higher resolutions
|
||||
if h * w >= 14080:
|
||||
rope_scale_factor = (1.0, 3.16, 3.16)
|
||||
|
||||
t_len = (t_len - 1.0) / rope_scale_factor[0] + 1.0
|
||||
h_len = (h_len - 1.0) / rope_scale_factor[1] + 1.0
|
||||
w_len = (w_len - 1.0) / rope_scale_factor[2] + 1.0
|
||||
|
||||
img_ids = torch.zeros((steps_t, steps_h, steps_w, 3), device=device, dtype=dtype)
|
||||
img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(t_start, t_start + (t_len - 1), steps=steps_t, device=device, dtype=dtype).reshape(-1, 1, 1)
|
||||
img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(h_start, h_start + (h_len - 1), steps=steps_h, device=device, dtype=dtype).reshape(1, -1, 1)
|
||||
img_ids[:, :, :, 2] = img_ids[:, :, :, 2] + torch.linspace(w_start, w_start + (w_len - 1), steps=steps_w, device=device, dtype=dtype).reshape(1, 1, -1)
|
||||
img_ids = img_ids.reshape(1, -1, img_ids.shape[-1])
|
||||
|
||||
freqs = self.rope_embedder_3d(img_ids).movedim(1, 2)
|
||||
return freqs
|
||||
|
||||
def forward_orig(self, x, timestep, context, y, freqs, freqs_text, transformer_options=None, **kwargs):
|
||||
if transformer_options is None:
|
||||
transformer_options = {}
|
||||
patches_replace = transformer_options.get("patches_replace", {})
|
||||
context = self.text_embeddings(context)
|
||||
time_embed = self.time_embeddings(timestep, x.dtype) + self.pooled_text_embeddings(y)
|
||||
|
||||
for block in self.text_transformer_blocks:
|
||||
context = block(context, time_embed, freqs_text, transformer_options=transformer_options)
|
||||
|
||||
visual_embed = self.visual_embeddings(x)
|
||||
visual_shape = visual_embed.shape[:-1]
|
||||
visual_embed = visual_embed.flatten(1, -2)
|
||||
|
||||
blocks_replace = patches_replace.get("dit", {})
|
||||
transformer_options["total_blocks"] = len(self.visual_transformer_blocks)
|
||||
transformer_options["block_type"] = "double"
|
||||
for i, block in enumerate(self.visual_transformer_blocks):
|
||||
transformer_options["block_index"] = i
|
||||
if ("double_block", i) in blocks_replace:
|
||||
def block_wrap(args):
|
||||
return block(x=args["x"], context=args["context"], time_embed=args["time_embed"], freqs=args["freqs"], transformer_options=args.get("transformer_options"))
|
||||
|
||||
visual_embed = blocks_replace[("double_block", i)]({"x": visual_embed, "context": context, "time_embed": time_embed, "freqs": freqs, "transformer_options": transformer_options}, {"original_block": block_wrap})["x"]
|
||||
else:
|
||||
visual_embed = block(visual_embed, context, time_embed, freqs=freqs, transformer_options=transformer_options)
|
||||
|
||||
visual_embed = visual_embed.reshape(*visual_shape, -1)
|
||||
return self.out_layer(visual_embed, time_embed)
|
||||
|
||||
def _forward(self, x, timestep, context, y, time_dim_replace=None, transformer_options=None, **kwargs):
|
||||
if transformer_options is None:
|
||||
transformer_options = {}
|
||||
original_dims = x.ndim
|
||||
if original_dims == 4:
|
||||
x = x.unsqueeze(2)
|
||||
bs, c, t_len, h, w = x.shape
|
||||
x = pad_to_patch_size(x, self.patch_size)
|
||||
|
||||
if time_dim_replace is not None:
|
||||
time_dim_replace = pad_to_patch_size(time_dim_replace, self.patch_size)
|
||||
x[:, :time_dim_replace.shape[1], :time_dim_replace.shape[2]] = time_dim_replace
|
||||
|
||||
freqs = self.rope_encode_3d(t_len, h, w, device=x.device, dtype=x.dtype, transformer_options=transformer_options)
|
||||
freqs_text = self.rope_encode_1d(context.shape[1], device=x.device, dtype=x.dtype, transformer_options=transformer_options)
|
||||
|
||||
out = self.forward_orig(x, timestep, context, y, freqs, freqs_text, transformer_options=transformer_options, **kwargs)
|
||||
if original_dims == 4:
|
||||
out = out.squeeze(2)
|
||||
return out
|
||||
|
||||
def forward(self, x, timestep, context, y, time_dim_replace=None, transformer_options=None, **kwargs):
|
||||
if transformer_options is None:
|
||||
transformer_options = {}
|
||||
return patcher_extension.WrapperExecutor.new_class_executor(
|
||||
self._forward,
|
||||
self,
|
||||
patcher_extension.get_all_wrappers(patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
|
||||
).execute(x, timestep, context, y, time_dim_replace=time_dim_replace, transformer_options=transformer_options, **kwargs)
|
||||
@ -1,12 +1,12 @@
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from ..common_dit import rms_norm
|
||||
from einops import rearrange
|
||||
import math
|
||||
from typing import Dict, Optional, Tuple
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from .symmetric_patchifier import SymmetricPatchifier, latent_to_pixel_coords
|
||||
from ..common_dit import rms_norm
|
||||
from ..flux.math import apply_rope1
|
||||
from ..modules.attention import optimized_attention, optimized_attention_masked
|
||||
from ...patcher_extension import WrapperExecutor, get_all_wrappers, WrappersMP
|
||||
|
||||
@ -181,10 +181,11 @@ class AdaLayerNormSingle(nn.Module):
|
||||
added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
|
||||
batch_size: Optional[int] = None,
|
||||
hidden_dtype: Optional[torch.dtype] = None,
|
||||
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]: #, torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
# No modulation happening here.
|
||||
added_cond_kwargs = added_cond_kwargs or {"resolution": None, "aspect_ratio": None}
|
||||
embedded_timestep = self.emb(timestep, **added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_dtype)
|
||||
# todo: whats going on with the signature?
|
||||
return self.linear(self.silu(embedded_timestep)), embedded_timestep
|
||||
|
||||
|
||||
@ -240,20 +241,6 @@ class FeedForward(nn.Module):
|
||||
return self.net(x)
|
||||
|
||||
|
||||
def apply_rotary_emb(input_tensor, freqs_cis): # TODO: remove duplicate funcs and pick the best/fastest one
|
||||
cos_freqs = freqs_cis[0]
|
||||
sin_freqs = freqs_cis[1]
|
||||
|
||||
t_dup = rearrange(input_tensor, "... (d r) -> ... d r", r=2)
|
||||
t1, t2 = t_dup.unbind(dim=-1)
|
||||
t_dup = torch.stack((-t2, t1), dim=-1)
|
||||
input_tensor_rot = rearrange(t_dup, "... d r -> ... (d r)")
|
||||
|
||||
out = input_tensor * cos_freqs + input_tensor_rot * sin_freqs
|
||||
|
||||
return out
|
||||
|
||||
|
||||
class CrossAttention(nn.Module):
|
||||
def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0., attn_precision=None, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
@ -285,8 +272,8 @@ class CrossAttention(nn.Module):
|
||||
k = self.k_norm(k)
|
||||
|
||||
if pe is not None:
|
||||
q = apply_rotary_emb(q, pe)
|
||||
k = apply_rotary_emb(k, pe)
|
||||
q = apply_rope1(q.unsqueeze(1), pe).squeeze(1)
|
||||
k = apply_rope1(k.unsqueeze(1), pe).squeeze(1)
|
||||
|
||||
if mask is None:
|
||||
out = optimized_attention(q, k, v, self.heads, attn_precision=self.attn_precision, transformer_options=transformer_options)
|
||||
@ -312,12 +299,17 @@ class BasicTransformerBlock(nn.Module):
|
||||
transformer_options = {}
|
||||
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.scale_shift_table[None, None].to(device=x.device, dtype=x.dtype) + timestep.reshape(x.shape[0], timestep.shape[1], self.scale_shift_table.shape[0], -1)).unbind(dim=2)
|
||||
|
||||
x += self.attn1(rms_norm(x) * (1 + scale_msa) + shift_msa, pe=pe, transformer_options=transformer_options) * gate_msa
|
||||
attn1_input = rms_norm(x)
|
||||
attn1_input = torch.addcmul(attn1_input, attn1_input, scale_msa).add_(shift_msa)
|
||||
attn1_input = self.attn1(attn1_input, pe=pe, transformer_options=transformer_options)
|
||||
x.addcmul_(attn1_input, gate_msa)
|
||||
del attn1_input
|
||||
|
||||
x += self.attn2(x, context=context, mask=attention_mask, transformer_options=transformer_options)
|
||||
|
||||
y = rms_norm(x) * (1 + scale_mlp) + shift_mlp
|
||||
x += self.ff(y) * gate_mlp
|
||||
y = rms_norm(x)
|
||||
y = torch.addcmul(y, y, scale_mlp).add_(shift_mlp)
|
||||
x.addcmul_(self.ff(y), gate_mlp)
|
||||
|
||||
return x
|
||||
|
||||
@ -336,41 +328,35 @@ def get_fractional_positions(indices_grid, max_pos):
|
||||
def precompute_freqs_cis(indices_grid, dim, out_dtype, theta=10000.0, max_pos=None):
|
||||
if max_pos is None:
|
||||
max_pos = [20, 2048, 2048]
|
||||
dtype = torch.float32 # self.dtype
|
||||
dtype = torch.float32
|
||||
device = indices_grid.device
|
||||
|
||||
# Get fractional positions and compute frequency indices
|
||||
fractional_positions = get_fractional_positions(indices_grid, max_pos)
|
||||
indices = theta ** torch.linspace(0, 1, dim // 6, device=device, dtype=dtype) * math.pi / 2
|
||||
|
||||
start = 1
|
||||
end = theta
|
||||
device = fractional_positions.device
|
||||
# Compute frequencies and apply cos/sin
|
||||
freqs = (indices * (fractional_positions.unsqueeze(-1) * 2 - 1)).transpose(-1, -2).flatten(2)
|
||||
cos_vals = freqs.cos().repeat_interleave(2, dim=-1)
|
||||
sin_vals = freqs.sin().repeat_interleave(2, dim=-1)
|
||||
|
||||
indices = theta ** (
|
||||
torch.linspace(
|
||||
math.log(start, theta),
|
||||
math.log(end, theta),
|
||||
dim // 6,
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
)
|
||||
indices = indices.to(dtype=dtype)
|
||||
|
||||
indices = indices * math.pi / 2
|
||||
|
||||
freqs = (
|
||||
(indices * (fractional_positions.unsqueeze(-1) * 2 - 1))
|
||||
.transpose(-1, -2)
|
||||
.flatten(2)
|
||||
)
|
||||
|
||||
cos_freq = freqs.cos().repeat_interleave(2, dim=-1)
|
||||
sin_freq = freqs.sin().repeat_interleave(2, dim=-1)
|
||||
# Pad if dim is not divisible by 6
|
||||
if dim % 6 != 0:
|
||||
cos_padding = torch.ones_like(cos_freq[:, :, : dim % 6])
|
||||
sin_padding = torch.zeros_like(cos_freq[:, :, : dim % 6])
|
||||
cos_freq = torch.cat([cos_padding, cos_freq], dim=-1)
|
||||
sin_freq = torch.cat([sin_padding, sin_freq], dim=-1)
|
||||
return cos_freq.to(out_dtype), sin_freq.to(out_dtype)
|
||||
padding_size = dim % 6
|
||||
cos_vals = torch.cat([torch.ones_like(cos_vals[:, :, :padding_size]), cos_vals], dim=-1)
|
||||
sin_vals = torch.cat([torch.zeros_like(sin_vals[:, :, :padding_size]), sin_vals], dim=-1)
|
||||
|
||||
# Reshape and extract one value per pair (since repeat_interleave duplicates each value)
|
||||
cos_vals = cos_vals.reshape(*cos_vals.shape[:2], -1, 2)[..., 0].to(out_dtype) # [B, N, dim//2]
|
||||
sin_vals = sin_vals.reshape(*sin_vals.shape[:2], -1, 2)[..., 0].to(out_dtype) # [B, N, dim//2]
|
||||
|
||||
# Build rotation matrix [[cos, -sin], [sin, cos]] and add heads dimension
|
||||
freqs_cis = torch.stack([
|
||||
torch.stack([cos_vals, -sin_vals], dim=-1),
|
||||
torch.stack([sin_vals, cos_vals], dim=-1)
|
||||
], dim=-2).unsqueeze(1) # [B, 1, N, dim//2, 2, 2]
|
||||
|
||||
return freqs_cis
|
||||
|
||||
|
||||
class LTXVModel(torch.nn.Module):
|
||||
@ -515,7 +501,7 @@ class LTXVModel(torch.nn.Module):
|
||||
shift, scale = scale_shift_values[:, :, 0], scale_shift_values[:, :, 1]
|
||||
x = self.norm_out(x)
|
||||
# Modulation
|
||||
x = x * (1 + scale) + shift
|
||||
x = torch.addcmul(x, x, scale).add_(shift)
|
||||
x = self.proj_out(x)
|
||||
|
||||
x = self.patchifier.unpatchify(
|
||||
|
||||
113
comfy/ldm/lumina/controlnet.py
Normal file
113
comfy/ldm/lumina/controlnet.py
Normal file
@ -0,0 +1,113 @@
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from .model import JointTransformerBlock
|
||||
|
||||
class ZImageControlTransformerBlock(JointTransformerBlock):
|
||||
def __init__(
|
||||
self,
|
||||
layer_id: int,
|
||||
dim: int,
|
||||
n_heads: int,
|
||||
n_kv_heads: int,
|
||||
multiple_of: int,
|
||||
ffn_dim_multiplier: float,
|
||||
norm_eps: float,
|
||||
qk_norm: bool,
|
||||
modulation=True,
|
||||
block_id=0,
|
||||
operation_settings=None,
|
||||
):
|
||||
super().__init__(layer_id, dim, n_heads, n_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, qk_norm, modulation, z_image_modulation=True, operation_settings=operation_settings)
|
||||
self.block_id = block_id
|
||||
if block_id == 0:
|
||||
self.before_proj = operation_settings.get("operations").Linear(self.dim, self.dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||
self.after_proj = operation_settings.get("operations").Linear(self.dim, self.dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||
|
||||
def forward(self, c, x, **kwargs):
|
||||
if self.block_id == 0:
|
||||
c = self.before_proj(c) + x
|
||||
c = super().forward(c, **kwargs)
|
||||
c_skip = self.after_proj(c)
|
||||
return c_skip, c
|
||||
|
||||
class ZImage_Control(torch.nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
dim: int = 3840,
|
||||
n_heads: int = 30,
|
||||
n_kv_heads: int = 30,
|
||||
multiple_of: int = 256,
|
||||
ffn_dim_multiplier: float = (8.0 / 3.0),
|
||||
norm_eps: float = 1e-5,
|
||||
qk_norm: bool = True,
|
||||
dtype=None,
|
||||
device=None,
|
||||
operations=None,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__()
|
||||
operation_settings = {"operations": operations, "device": device, "dtype": dtype}
|
||||
|
||||
self.additional_in_dim = 0
|
||||
self.control_in_dim = 16
|
||||
n_refiner_layers = 2
|
||||
self.n_control_layers = 6
|
||||
self.control_layers = nn.ModuleList(
|
||||
[
|
||||
ZImageControlTransformerBlock(
|
||||
i,
|
||||
dim,
|
||||
n_heads,
|
||||
n_kv_heads,
|
||||
multiple_of,
|
||||
ffn_dim_multiplier,
|
||||
norm_eps,
|
||||
qk_norm,
|
||||
block_id=i,
|
||||
operation_settings=operation_settings,
|
||||
)
|
||||
for i in range(self.n_control_layers)
|
||||
]
|
||||
)
|
||||
|
||||
all_x_embedder = {}
|
||||
patch_size = 2
|
||||
f_patch_size = 1
|
||||
x_embedder = operations.Linear(f_patch_size * patch_size * patch_size * self.control_in_dim, dim, bias=True, device=device, dtype=dtype)
|
||||
all_x_embedder[f"{patch_size}-{f_patch_size}"] = x_embedder
|
||||
|
||||
self.control_all_x_embedder = nn.ModuleDict(all_x_embedder)
|
||||
self.control_noise_refiner = nn.ModuleList(
|
||||
[
|
||||
JointTransformerBlock(
|
||||
layer_id,
|
||||
dim,
|
||||
n_heads,
|
||||
n_kv_heads,
|
||||
multiple_of,
|
||||
ffn_dim_multiplier,
|
||||
norm_eps,
|
||||
qk_norm,
|
||||
modulation=True,
|
||||
z_image_modulation=True,
|
||||
operation_settings=operation_settings,
|
||||
)
|
||||
for layer_id in range(n_refiner_layers)
|
||||
]
|
||||
)
|
||||
|
||||
def forward(self, cap_feats, control_context, x_freqs_cis, adaln_input):
|
||||
patch_size = 2
|
||||
f_patch_size = 1
|
||||
pH = pW = patch_size
|
||||
B, C, H, W = control_context.shape
|
||||
control_context = self.control_all_x_embedder[f"{patch_size}-{f_patch_size}"](control_context.view(B, C, H // pH, pH, W // pW, pW).permute(0, 2, 4, 3, 5, 1).flatten(3).flatten(1, 2))
|
||||
|
||||
x_attn_mask = None
|
||||
for layer in self.control_noise_refiner:
|
||||
control_context = layer(control_context, x_attn_mask, x_freqs_cis[:control_context.shape[0], :control_context.shape[1]], adaln_input)
|
||||
return control_context
|
||||
|
||||
def forward_control_block(self, layer_id, control_context, x, x_attn_mask, x_freqs_cis, adaln_input):
|
||||
return self.control_layers[layer_id](control_context, x, x_mask=x_attn_mask, freqs_cis=x_freqs_cis[:control_context.shape[0], :control_context.shape[1]], adaln_input=adaln_input)
|
||||
@ -12,26 +12,34 @@ from ..modules.diffusionmodules.mmdit import TimestepEmbedder
|
||||
from ..modules.attention import optimized_attention_masked
|
||||
from ..flux.layers import EmbedND
|
||||
from ...patcher_extension import WrapperExecutor, get_all_wrappers, WrappersMP
|
||||
from ..flux.math import apply_rope
|
||||
|
||||
|
||||
def modulate(x, scale):
|
||||
return x * (1 + scale.unsqueeze(1))
|
||||
|
||||
|
||||
#############################################################################
|
||||
# Core NextDiT Model #
|
||||
#############################################################################
|
||||
|
||||
def clamp_fp16(x):
|
||||
if x.dtype == torch.float16:
|
||||
return torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504)
|
||||
return x
|
||||
|
||||
|
||||
class JointAttention(nn.Module):
|
||||
"""Multi-head attention module."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
dim: int,
|
||||
n_heads: int,
|
||||
n_kv_heads: Optional[int],
|
||||
qk_norm: bool,
|
||||
operation_settings={},
|
||||
self,
|
||||
dim: int,
|
||||
n_heads: int,
|
||||
n_kv_heads: Optional[int],
|
||||
qk_norm: bool,
|
||||
out_bias: bool = False,
|
||||
operation_settings=None,
|
||||
):
|
||||
"""
|
||||
Initialize the Attention module.
|
||||
@ -43,6 +51,8 @@ class JointAttention(nn.Module):
|
||||
|
||||
"""
|
||||
super().__init__()
|
||||
if operation_settings is None:
|
||||
operation_settings = {}
|
||||
self.n_kv_heads = n_heads if n_kv_heads is None else n_kv_heads
|
||||
self.n_local_heads = n_heads
|
||||
self.n_local_kv_heads = self.n_kv_heads
|
||||
@ -59,7 +69,7 @@ class JointAttention(nn.Module):
|
||||
self.out = operation_settings.get("operations").Linear(
|
||||
n_heads * self.head_dim,
|
||||
dim,
|
||||
bias=False,
|
||||
bias=out_bias,
|
||||
device=operation_settings.get("device"),
|
||||
dtype=operation_settings.get("dtype"),
|
||||
)
|
||||
@ -70,41 +80,12 @@ class JointAttention(nn.Module):
|
||||
else:
|
||||
self.q_norm = self.k_norm = nn.Identity()
|
||||
|
||||
@staticmethod
|
||||
def apply_rotary_emb(
|
||||
x_in: torch.Tensor,
|
||||
freqs_cis: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Apply rotary embeddings to input tensors using the given frequency
|
||||
tensor.
|
||||
|
||||
This function applies rotary embeddings to the given query 'xq' and
|
||||
key 'xk' tensors using the provided frequency tensor 'freqs_cis'. The
|
||||
input tensors are reshaped as complex numbers, and the frequency tensor
|
||||
is reshaped for broadcasting compatibility. The resulting tensors
|
||||
contain rotary embeddings and are returned as real tensors.
|
||||
|
||||
Args:
|
||||
x_in (torch.Tensor): Query or Key tensor to apply rotary embeddings.
|
||||
freqs_cis (torch.Tensor): Precomputed frequency tensor for complex
|
||||
exponentials.
|
||||
|
||||
Returns:
|
||||
Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor
|
||||
and key tensor with rotary embeddings.
|
||||
"""
|
||||
|
||||
t_ = x_in.reshape(*x_in.shape[:-1], -1, 1, 2)
|
||||
t_out = freqs_cis[..., 0] * t_[..., 0] + freqs_cis[..., 1] * t_[..., 1]
|
||||
return t_out.reshape(*x_in.shape)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
x_mask: torch.Tensor,
|
||||
freqs_cis: torch.Tensor,
|
||||
transformer_options={},
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
x_mask: torch.Tensor,
|
||||
freqs_cis: torch.Tensor,
|
||||
transformer_options=None,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
|
||||
@ -116,6 +97,8 @@ class JointAttention(nn.Module):
|
||||
Returns:
|
||||
|
||||
"""
|
||||
if transformer_options is None:
|
||||
transformer_options = {}
|
||||
bsz, seqlen, _ = x.shape
|
||||
|
||||
xq, xk, xv = torch.split(
|
||||
@ -134,8 +117,7 @@ class JointAttention(nn.Module):
|
||||
xq = self.q_norm(xq)
|
||||
xk = self.k_norm(xk)
|
||||
|
||||
xq = JointAttention.apply_rotary_emb(xq, freqs_cis=freqs_cis)
|
||||
xk = JointAttention.apply_rotary_emb(xk, freqs_cis=freqs_cis)
|
||||
xq, xk = apply_rope(xq, xk, freqs_cis)
|
||||
|
||||
n_rep = self.n_local_heads // self.n_local_kv_heads
|
||||
if n_rep >= 1:
|
||||
@ -148,12 +130,12 @@ class JointAttention(nn.Module):
|
||||
|
||||
class FeedForward(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
dim: int,
|
||||
hidden_dim: int,
|
||||
multiple_of: int,
|
||||
ffn_dim_multiplier: Optional[float],
|
||||
operation_settings={},
|
||||
self,
|
||||
dim: int,
|
||||
hidden_dim: int,
|
||||
multiple_of: int,
|
||||
ffn_dim_multiplier: Optional[float],
|
||||
operation_settings=None,
|
||||
):
|
||||
"""
|
||||
Initialize the FeedForward module.
|
||||
@ -169,6 +151,8 @@ class FeedForward(nn.Module):
|
||||
"""
|
||||
super().__init__()
|
||||
# custom dim factor multiplier
|
||||
if operation_settings is None:
|
||||
operation_settings = {}
|
||||
if ffn_dim_multiplier is not None:
|
||||
hidden_dim = int(ffn_dim_multiplier * hidden_dim)
|
||||
hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
|
||||
@ -197,7 +181,7 @@ class FeedForward(nn.Module):
|
||||
|
||||
# @torch.compile
|
||||
def _forward_silu_gating(self, x1, x3):
|
||||
return F.silu(x1) * x3
|
||||
return clamp_fp16(F.silu(x1) * x3)
|
||||
|
||||
def forward(self, x):
|
||||
return self.w2(self._forward_silu_gating(self.w1(x), self.w3(x)))
|
||||
@ -205,17 +189,19 @@ class FeedForward(nn.Module):
|
||||
|
||||
class JointTransformerBlock(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
layer_id: int,
|
||||
dim: int,
|
||||
n_heads: int,
|
||||
n_kv_heads: int,
|
||||
multiple_of: int,
|
||||
ffn_dim_multiplier: float,
|
||||
norm_eps: float,
|
||||
qk_norm: bool,
|
||||
modulation=True,
|
||||
operation_settings={},
|
||||
self,
|
||||
layer_id: int,
|
||||
dim: int,
|
||||
n_heads: int,
|
||||
n_kv_heads: int,
|
||||
multiple_of: int,
|
||||
ffn_dim_multiplier: float,
|
||||
norm_eps: float,
|
||||
qk_norm: bool,
|
||||
modulation=True,
|
||||
z_image_modulation=False,
|
||||
attn_out_bias=False,
|
||||
operation_settings=None,
|
||||
) -> None:
|
||||
"""
|
||||
Initialize a TransformerBlock.
|
||||
@ -233,12 +219,14 @@ class JointTransformerBlock(nn.Module):
|
||||
|
||||
"""
|
||||
super().__init__()
|
||||
if operation_settings is None:
|
||||
operation_settings = {}
|
||||
self.dim = dim
|
||||
self.head_dim = dim // n_heads
|
||||
self.attention = JointAttention(dim, n_heads, n_kv_heads, qk_norm, operation_settings=operation_settings)
|
||||
self.attention = JointAttention(dim, n_heads, n_kv_heads, qk_norm, out_bias=attn_out_bias, operation_settings=operation_settings)
|
||||
self.feed_forward = FeedForward(
|
||||
dim=dim,
|
||||
hidden_dim=4 * dim,
|
||||
hidden_dim=dim,
|
||||
multiple_of=multiple_of,
|
||||
ffn_dim_multiplier=ffn_dim_multiplier,
|
||||
operation_settings=operation_settings,
|
||||
@ -252,24 +240,35 @@ class JointTransformerBlock(nn.Module):
|
||||
|
||||
self.modulation = modulation
|
||||
if modulation:
|
||||
self.adaLN_modulation = nn.Sequential(
|
||||
nn.SiLU(),
|
||||
operation_settings.get("operations").Linear(
|
||||
min(dim, 1024),
|
||||
4 * dim,
|
||||
bias=True,
|
||||
device=operation_settings.get("device"),
|
||||
dtype=operation_settings.get("dtype"),
|
||||
),
|
||||
)
|
||||
if z_image_modulation:
|
||||
self.adaLN_modulation = nn.Sequential(
|
||||
operation_settings.get("operations").Linear(
|
||||
min(dim, 256),
|
||||
4 * dim,
|
||||
bias=True,
|
||||
device=operation_settings.get("device"),
|
||||
dtype=operation_settings.get("dtype"),
|
||||
),
|
||||
)
|
||||
else:
|
||||
self.adaLN_modulation = nn.Sequential(
|
||||
nn.SiLU(),
|
||||
operation_settings.get("operations").Linear(
|
||||
min(dim, 1024),
|
||||
4 * dim,
|
||||
bias=True,
|
||||
device=operation_settings.get("device"),
|
||||
dtype=operation_settings.get("dtype"),
|
||||
),
|
||||
)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
x_mask: torch.Tensor,
|
||||
freqs_cis: torch.Tensor,
|
||||
adaln_input: Optional[torch.Tensor]=None,
|
||||
transformer_options={},
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
x_mask: torch.Tensor,
|
||||
freqs_cis: torch.Tensor,
|
||||
adaln_input: Optional[torch.Tensor] = None,
|
||||
transformer_options=None,
|
||||
):
|
||||
"""
|
||||
Perform a forward pass through the TransformerBlock.
|
||||
@ -283,32 +282,34 @@ class JointTransformerBlock(nn.Module):
|
||||
feedforward layers.
|
||||
|
||||
"""
|
||||
if transformer_options is None:
|
||||
transformer_options = {}
|
||||
if self.modulation:
|
||||
assert adaln_input is not None
|
||||
scale_msa, gate_msa, scale_mlp, gate_mlp = self.adaLN_modulation(adaln_input).chunk(4, dim=1)
|
||||
|
||||
x = x + gate_msa.unsqueeze(1).tanh() * self.attention_norm2(
|
||||
self.attention(
|
||||
clamp_fp16(self.attention(
|
||||
modulate(self.attention_norm1(x), scale_msa),
|
||||
x_mask,
|
||||
freqs_cis,
|
||||
transformer_options=transformer_options,
|
||||
)
|
||||
))
|
||||
)
|
||||
x = x + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2(
|
||||
self.feed_forward(
|
||||
clamp_fp16(self.feed_forward(
|
||||
modulate(self.ffn_norm1(x), scale_mlp),
|
||||
)
|
||||
))
|
||||
)
|
||||
else:
|
||||
assert adaln_input is None
|
||||
x = x + self.attention_norm2(
|
||||
self.attention(
|
||||
clamp_fp16(self.attention(
|
||||
self.attention_norm1(x),
|
||||
x_mask,
|
||||
freqs_cis,
|
||||
transformer_options=transformer_options,
|
||||
)
|
||||
))
|
||||
)
|
||||
x = x + self.ffn_norm2(
|
||||
self.feed_forward(
|
||||
@ -323,8 +324,10 @@ class FinalLayer(nn.Module):
|
||||
The final layer of NextDiT.
|
||||
"""
|
||||
|
||||
def __init__(self, hidden_size, patch_size, out_channels, operation_settings={}):
|
||||
def __init__(self, hidden_size, patch_size, out_channels, z_image_modulation=False, operation_settings=None):
|
||||
super().__init__()
|
||||
if operation_settings is None:
|
||||
operation_settings = {}
|
||||
self.norm_final = operation_settings.get("operations").LayerNorm(
|
||||
hidden_size,
|
||||
elementwise_affine=False,
|
||||
@ -340,10 +343,15 @@ class FinalLayer(nn.Module):
|
||||
dtype=operation_settings.get("dtype"),
|
||||
)
|
||||
|
||||
if z_image_modulation:
|
||||
min_mod = 256
|
||||
else:
|
||||
min_mod = 1024
|
||||
|
||||
self.adaLN_modulation = nn.Sequential(
|
||||
nn.SiLU(),
|
||||
operation_settings.get("operations").Linear(
|
||||
min(hidden_size, 1024),
|
||||
min(hidden_size, min_mod),
|
||||
hidden_size,
|
||||
bias=True,
|
||||
device=operation_settings.get("device"),
|
||||
@ -364,25 +372,30 @@ class NextDiT(nn.Module):
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
patch_size: int = 2,
|
||||
in_channels: int = 4,
|
||||
dim: int = 4096,
|
||||
n_layers: int = 32,
|
||||
n_refiner_layers: int = 2,
|
||||
n_heads: int = 32,
|
||||
n_kv_heads: Optional[int] = None,
|
||||
multiple_of: int = 256,
|
||||
ffn_dim_multiplier: Optional[float] = None,
|
||||
norm_eps: float = 1e-5,
|
||||
qk_norm: bool = False,
|
||||
cap_feat_dim: int = 5120,
|
||||
axes_dims: List[int] = (16, 56, 56),
|
||||
axes_lens: List[int] = (1, 512, 512),
|
||||
image_model=None,
|
||||
device=None,
|
||||
dtype=None,
|
||||
operations=None,
|
||||
self,
|
||||
patch_size: int = 2,
|
||||
in_channels: int = 4,
|
||||
dim: int = 4096,
|
||||
n_layers: int = 32,
|
||||
n_refiner_layers: int = 2,
|
||||
n_heads: int = 32,
|
||||
n_kv_heads: Optional[int] = None,
|
||||
multiple_of: int = 256,
|
||||
ffn_dim_multiplier: float = 4.0,
|
||||
norm_eps: float = 1e-5,
|
||||
qk_norm: bool = False,
|
||||
cap_feat_dim: int = 5120,
|
||||
axes_dims: List[int] = (16, 56, 56),
|
||||
axes_lens: List[int] = (1, 512, 512),
|
||||
rope_theta=10000.0,
|
||||
z_image_modulation=False,
|
||||
time_scale=1.0,
|
||||
pad_tokens_multiple=None,
|
||||
clip_text_dim=None,
|
||||
image_model=None,
|
||||
device=None,
|
||||
dtype=None,
|
||||
operations=None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self.dtype = dtype
|
||||
@ -390,6 +403,8 @@ class NextDiT(nn.Module):
|
||||
self.in_channels = in_channels
|
||||
self.out_channels = in_channels
|
||||
self.patch_size = patch_size
|
||||
self.time_scale = time_scale
|
||||
self.pad_tokens_multiple = pad_tokens_multiple
|
||||
|
||||
self.x_embedder = operation_settings.get("operations").Linear(
|
||||
in_features=patch_size * patch_size * in_channels,
|
||||
@ -411,6 +426,7 @@ class NextDiT(nn.Module):
|
||||
norm_eps,
|
||||
qk_norm,
|
||||
modulation=True,
|
||||
z_image_modulation=z_image_modulation,
|
||||
operation_settings=operation_settings,
|
||||
)
|
||||
for layer_id in range(n_refiner_layers)
|
||||
@ -434,7 +450,7 @@ class NextDiT(nn.Module):
|
||||
]
|
||||
)
|
||||
|
||||
self.t_embedder = TimestepEmbedder(min(dim, 1024), **operation_settings)
|
||||
self.t_embedder = TimestepEmbedder(min(dim, 1024), output_size=256 if z_image_modulation else None, **operation_settings)
|
||||
self.cap_embedder = nn.Sequential(
|
||||
operation_settings.get("operations").RMSNorm(cap_feat_dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")),
|
||||
operation_settings.get("operations").Linear(
|
||||
@ -446,6 +462,31 @@ class NextDiT(nn.Module):
|
||||
),
|
||||
)
|
||||
|
||||
self.clip_text_pooled_proj = None
|
||||
|
||||
if clip_text_dim is not None:
|
||||
self.clip_text_dim = clip_text_dim
|
||||
self.clip_text_pooled_proj = nn.Sequential(
|
||||
operation_settings.get("operations").RMSNorm(clip_text_dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")),
|
||||
operation_settings.get("operations").Linear(
|
||||
clip_text_dim,
|
||||
clip_text_dim,
|
||||
bias=True,
|
||||
device=operation_settings.get("device"),
|
||||
dtype=operation_settings.get("dtype"),
|
||||
),
|
||||
)
|
||||
self.time_text_embed = nn.Sequential(
|
||||
nn.SiLU(),
|
||||
operation_settings.get("operations").Linear(
|
||||
min(dim, 1024) + clip_text_dim,
|
||||
min(dim, 1024),
|
||||
bias=True,
|
||||
device=operation_settings.get("device"),
|
||||
dtype=operation_settings.get("dtype"),
|
||||
),
|
||||
)
|
||||
|
||||
self.layers = nn.ModuleList(
|
||||
[
|
||||
JointTransformerBlock(
|
||||
@ -457,23 +498,29 @@ class NextDiT(nn.Module):
|
||||
ffn_dim_multiplier,
|
||||
norm_eps,
|
||||
qk_norm,
|
||||
z_image_modulation=z_image_modulation,
|
||||
attn_out_bias=False,
|
||||
operation_settings=operation_settings,
|
||||
)
|
||||
for layer_id in range(n_layers)
|
||||
]
|
||||
)
|
||||
self.norm_final = operation_settings.get("operations").RMSNorm(dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||
self.final_layer = FinalLayer(dim, patch_size, self.out_channels, operation_settings=operation_settings)
|
||||
self.final_layer = FinalLayer(dim, patch_size, self.out_channels, z_image_modulation=z_image_modulation, operation_settings=operation_settings)
|
||||
|
||||
if self.pad_tokens_multiple is not None:
|
||||
self.x_pad_token = nn.Parameter(torch.empty((1, dim), device=device, dtype=dtype))
|
||||
self.cap_pad_token = nn.Parameter(torch.empty((1, dim), device=device, dtype=dtype))
|
||||
|
||||
assert (dim // n_heads) == sum(axes_dims)
|
||||
self.axes_dims = axes_dims
|
||||
self.axes_lens = axes_lens
|
||||
self.rope_embedder = EmbedND(dim=dim // n_heads, theta=10000.0, axes_dim=axes_dims)
|
||||
self.rope_embedder = EmbedND(dim=dim // n_heads, theta=rope_theta, axes_dim=axes_dims)
|
||||
self.dim = dim
|
||||
self.n_heads = n_heads
|
||||
|
||||
def unpatchify(
|
||||
self, x: torch.Tensor, img_size: List[Tuple[int, int]], cap_size: List[int], return_tensor=False
|
||||
self, x: torch.Tensor, img_size: List[Tuple[int, int]], cap_size: List[int], return_tensor=False
|
||||
) -> List[torch.Tensor]:
|
||||
"""
|
||||
x: (N, T, patch_size**2 * C)
|
||||
@ -498,101 +545,61 @@ class NextDiT(nn.Module):
|
||||
return imgs
|
||||
|
||||
def patchify_and_embed(
|
||||
self, x: List[torch.Tensor] | torch.Tensor, cap_feats: torch.Tensor, cap_mask: torch.Tensor, t: torch.Tensor, num_tokens, transformer_options={}
|
||||
self, x: List[torch.Tensor] | torch.Tensor, cap_feats: torch.Tensor, cap_mask: torch.Tensor, t: torch.Tensor, num_tokens, transformer_options=None
|
||||
) -> Tuple[torch.Tensor, torch.Tensor, List[Tuple[int, int]], List[int], torch.Tensor]:
|
||||
if transformer_options is None:
|
||||
transformer_options = {}
|
||||
bsz = len(x)
|
||||
pH = pW = self.patch_size
|
||||
device = x[0].device
|
||||
dtype = x[0].dtype
|
||||
|
||||
if cap_mask is not None:
|
||||
l_effective_cap_len = cap_mask.sum(dim=1).tolist()
|
||||
else:
|
||||
l_effective_cap_len = [num_tokens] * bsz
|
||||
if self.pad_tokens_multiple is not None:
|
||||
pad_extra = (-cap_feats.shape[1]) % self.pad_tokens_multiple
|
||||
cap_feats = torch.cat((cap_feats, self.cap_pad_token.to(device=cap_feats.device, dtype=cap_feats.dtype, copy=True).unsqueeze(0).repeat(cap_feats.shape[0], pad_extra, 1)), dim=1)
|
||||
|
||||
if cap_mask is not None and not torch.is_floating_point(cap_mask):
|
||||
cap_mask = (cap_mask - 1).to(dtype) * torch.finfo(dtype).max
|
||||
cap_pos_ids = torch.zeros(bsz, cap_feats.shape[1], 3, dtype=torch.float32, device=device)
|
||||
cap_pos_ids[:, :, 0] = torch.arange(cap_feats.shape[1], dtype=torch.float32, device=device) + 1.0
|
||||
|
||||
img_sizes = [(img.size(1), img.size(2)) for img in x]
|
||||
l_effective_img_len = [(H // pH) * (W // pW) for (H, W) in img_sizes]
|
||||
B, C, H, W = x.shape
|
||||
x = self.x_embedder(x.view(B, C, H // pH, pH, W // pW, pW).permute(0, 2, 4, 3, 5, 1).flatten(3).flatten(1, 2))
|
||||
|
||||
max_seq_len = max(
|
||||
(cap_len+img_len for cap_len, img_len in zip(l_effective_cap_len, l_effective_img_len))
|
||||
)
|
||||
max_cap_len = max(l_effective_cap_len)
|
||||
max_img_len = max(l_effective_img_len)
|
||||
rope_options = transformer_options.get("rope_options", None)
|
||||
h_scale = 1.0
|
||||
w_scale = 1.0
|
||||
h_start = 0
|
||||
w_start = 0
|
||||
if rope_options is not None:
|
||||
h_scale = rope_options.get("scale_y", 1.0)
|
||||
w_scale = rope_options.get("scale_x", 1.0)
|
||||
|
||||
position_ids = torch.zeros(bsz, max_seq_len, 3, dtype=torch.int32, device=device)
|
||||
h_start = rope_options.get("shift_y", 0.0)
|
||||
w_start = rope_options.get("shift_x", 0.0)
|
||||
|
||||
for i in range(bsz):
|
||||
cap_len = l_effective_cap_len[i]
|
||||
img_len = l_effective_img_len[i]
|
||||
H, W = img_sizes[i]
|
||||
H_tokens, W_tokens = H // pH, W // pW
|
||||
assert H_tokens * W_tokens == img_len
|
||||
H_tokens, W_tokens = H // pH, W // pW
|
||||
x_pos_ids = torch.zeros((bsz, x.shape[1], 3), dtype=torch.float32, device=device)
|
||||
x_pos_ids[:, :, 0] = cap_feats.shape[1] + 1
|
||||
x_pos_ids[:, :, 1] = (torch.arange(H_tokens, dtype=torch.float32, device=device) * h_scale + h_start).view(-1, 1).repeat(1, W_tokens).flatten()
|
||||
x_pos_ids[:, :, 2] = (torch.arange(W_tokens, dtype=torch.float32, device=device) * w_scale + w_start).view(1, -1).repeat(H_tokens, 1).flatten()
|
||||
|
||||
position_ids[i, :cap_len, 0] = torch.arange(cap_len, dtype=torch.int32, device=device)
|
||||
position_ids[i, cap_len:cap_len+img_len, 0] = cap_len
|
||||
row_ids = torch.arange(H_tokens, dtype=torch.int32, device=device).view(-1, 1).repeat(1, W_tokens).flatten()
|
||||
col_ids = torch.arange(W_tokens, dtype=torch.int32, device=device).view(1, -1).repeat(H_tokens, 1).flatten()
|
||||
position_ids[i, cap_len:cap_len+img_len, 1] = row_ids
|
||||
position_ids[i, cap_len:cap_len+img_len, 2] = col_ids
|
||||
if self.pad_tokens_multiple is not None:
|
||||
pad_extra = (-x.shape[1]) % self.pad_tokens_multiple
|
||||
x = torch.cat((x, self.x_pad_token.to(device=x.device, dtype=x.dtype, copy=True).unsqueeze(0).repeat(x.shape[0], pad_extra, 1)), dim=1)
|
||||
x_pos_ids = torch.nn.functional.pad(x_pos_ids, (0, 0, 0, pad_extra))
|
||||
|
||||
freqs_cis = self.rope_embedder(position_ids).movedim(1, 2).to(dtype)
|
||||
|
||||
# build freqs_cis for cap and image individually
|
||||
cap_freqs_cis_shape = list(freqs_cis.shape)
|
||||
# cap_freqs_cis_shape[1] = max_cap_len
|
||||
cap_freqs_cis_shape[1] = cap_feats.shape[1]
|
||||
cap_freqs_cis = torch.zeros(*cap_freqs_cis_shape, device=device, dtype=freqs_cis.dtype)
|
||||
|
||||
img_freqs_cis_shape = list(freqs_cis.shape)
|
||||
img_freqs_cis_shape[1] = max_img_len
|
||||
img_freqs_cis = torch.zeros(*img_freqs_cis_shape, device=device, dtype=freqs_cis.dtype)
|
||||
|
||||
for i in range(bsz):
|
||||
cap_len = l_effective_cap_len[i]
|
||||
img_len = l_effective_img_len[i]
|
||||
cap_freqs_cis[i, :cap_len] = freqs_cis[i, :cap_len]
|
||||
img_freqs_cis[i, :img_len] = freqs_cis[i, cap_len:cap_len+img_len]
|
||||
freqs_cis = self.rope_embedder(torch.cat((cap_pos_ids, x_pos_ids), dim=1)).movedim(1, 2)
|
||||
|
||||
# refine context
|
||||
for layer in self.context_refiner:
|
||||
cap_feats = layer(cap_feats, cap_mask, cap_freqs_cis, transformer_options=transformer_options)
|
||||
cap_feats = layer(cap_feats, cap_mask, freqs_cis[:, :cap_pos_ids.shape[1]], transformer_options=transformer_options)
|
||||
|
||||
# refine image
|
||||
flat_x = []
|
||||
for i in range(bsz):
|
||||
img = x[i]
|
||||
C, H, W = img.size()
|
||||
img = img.view(C, H // pH, pH, W // pW, pW).permute(1, 3, 2, 4, 0).flatten(2).flatten(0, 1)
|
||||
flat_x.append(img)
|
||||
x = flat_x
|
||||
padded_img_embed = torch.zeros(bsz, max_img_len, x[0].shape[-1], device=device, dtype=x[0].dtype)
|
||||
padded_img_mask = torch.zeros(bsz, max_img_len, dtype=dtype, device=device)
|
||||
for i in range(bsz):
|
||||
padded_img_embed[i, :l_effective_img_len[i]] = x[i]
|
||||
padded_img_mask[i, l_effective_img_len[i]:] = -torch.finfo(dtype).max
|
||||
|
||||
padded_img_embed = self.x_embedder(padded_img_embed)
|
||||
padded_img_mask = padded_img_mask.unsqueeze(1)
|
||||
padded_img_mask = None
|
||||
for layer in self.noise_refiner:
|
||||
padded_img_embed = layer(padded_img_embed, padded_img_mask, img_freqs_cis, t, transformer_options=transformer_options)
|
||||
|
||||
if cap_mask is not None:
|
||||
mask = torch.zeros(bsz, max_seq_len, dtype=dtype, device=device)
|
||||
mask[:, :max_cap_len] = cap_mask[:, :max_cap_len]
|
||||
else:
|
||||
mask = None
|
||||
|
||||
padded_full_embed = torch.zeros(bsz, max_seq_len, self.dim, device=device, dtype=x[0].dtype)
|
||||
for i in range(bsz):
|
||||
cap_len = l_effective_cap_len[i]
|
||||
img_len = l_effective_img_len[i]
|
||||
|
||||
padded_full_embed[i, :cap_len] = cap_feats[i, :cap_len]
|
||||
padded_full_embed[i, cap_len:cap_len+img_len] = padded_img_embed[i, :img_len]
|
||||
x = layer(x, padded_img_mask, freqs_cis[:, cap_pos_ids.shape[1]:], t, transformer_options=transformer_options)
|
||||
|
||||
padded_full_embed = torch.cat((cap_feats, x), dim=1)
|
||||
mask = None
|
||||
img_sizes = [(H, W)] * bsz
|
||||
l_effective_cap_len = [cap_feats.shape[1]] * bsz
|
||||
return padded_full_embed, mask, img_sizes, l_effective_cap_len, freqs_cis
|
||||
|
||||
def forward(self, x, timesteps, context, num_tokens, attention_mask=None, **kwargs):
|
||||
@ -603,7 +610,9 @@ class NextDiT(nn.Module):
|
||||
).execute(x, timesteps, context, num_tokens, attention_mask, **kwargs)
|
||||
|
||||
# def forward(self, x, t, cap_feats, cap_mask):
|
||||
def _forward(self, x, timesteps, context, num_tokens, attention_mask=None, **kwargs):
|
||||
def _forward(self, x, timesteps, context, num_tokens, attention_mask=None, transformer_options=None, **kwargs):
|
||||
if transformer_options is None:
|
||||
transformer_options = {}
|
||||
t = 1.0 - timesteps
|
||||
cap_feats = context
|
||||
cap_mask = attention_mask
|
||||
@ -615,21 +624,36 @@ class NextDiT(nn.Module):
|
||||
y: (N,) tensor of text tokens/features
|
||||
"""
|
||||
|
||||
t = self.t_embedder(t, dtype=x.dtype) # (N, D)
|
||||
t = self.t_embedder(t * self.time_scale, dtype=x.dtype) # (N, D)
|
||||
adaln_input = t
|
||||
|
||||
cap_feats = self.cap_embedder(cap_feats) # (N, L, D) # todo check if able to batchify w.o. redundant compute
|
||||
|
||||
transformer_options = kwargs.get("transformer_options", {})
|
||||
if self.clip_text_pooled_proj is not None:
|
||||
pooled = kwargs.get("clip_text_pooled", None)
|
||||
if pooled is not None:
|
||||
pooled = self.clip_text_pooled_proj(pooled)
|
||||
else:
|
||||
pooled = torch.zeros((1, self.clip_text_dim), device=x.device, dtype=x.dtype)
|
||||
|
||||
adaln_input = self.time_text_embed(torch.cat((t, pooled), dim=-1))
|
||||
|
||||
patches = transformer_options.get("patches", {})
|
||||
x_is_tensor = isinstance(x, torch.Tensor)
|
||||
x, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, t, num_tokens, transformer_options=transformer_options)
|
||||
freqs_cis = freqs_cis.to(x.device)
|
||||
img, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, t, num_tokens, transformer_options=transformer_options)
|
||||
freqs_cis = freqs_cis.to(img.device)
|
||||
|
||||
for layer in self.layers:
|
||||
x = layer(x, mask, freqs_cis, adaln_input, transformer_options=transformer_options)
|
||||
for i, layer in enumerate(self.layers):
|
||||
img = layer(img, mask, freqs_cis, adaln_input, transformer_options=transformer_options)
|
||||
if "double_block" in patches:
|
||||
for p in patches["double_block"]:
|
||||
out = p({"img": img[:, cap_size[0]:], "txt": img[:, :cap_size[0]], "pe": freqs_cis[:, cap_size[0]:], "vec": adaln_input, "x": x, "block_index": i, "transformer_options": transformer_options})
|
||||
if "img" in out:
|
||||
img[:, cap_size[0]:] = out["img"]
|
||||
if "txt" in out:
|
||||
img[:, :cap_size[0]] = out["txt"]
|
||||
|
||||
x = self.final_layer(x, adaln_input)
|
||||
x = self.unpatchify(x, img_size, cap_size, return_tensor=x_is_tensor)[:,:,:h,:w]
|
||||
|
||||
return -x
|
||||
img = self.final_layer(img, adaln_input)
|
||||
img = self.unpatchify(img, img_size, cap_size, return_tensor=x_is_tensor)[:, :, :h, :w]
|
||||
|
||||
return -img
|
||||
|
||||
@ -1,10 +1,12 @@
|
||||
import logging
|
||||
import math
|
||||
from contextlib import contextmanager
|
||||
from typing import Any, Dict, Tuple, Union, Callable
|
||||
from typing import Any, Dict, Tuple, Union, Callable, Optional
|
||||
|
||||
import torch
|
||||
|
||||
from einops import rearrange
|
||||
from ...model_management import cast_to
|
||||
from ..modules.distributions.distributions import DiagonalGaussianDistribution
|
||||
from ..modules.ema import LitEma
|
||||
from ..util import instantiate_from_config, get_obj_from_str
|
||||
@ -12,6 +14,7 @@ from ... import ops
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DiagonalGaussianRegularizer(torch.nn.Module):
|
||||
def __init__(self, sample: bool = False):
|
||||
super().__init__()
|
||||
@ -20,7 +23,7 @@ class DiagonalGaussianRegularizer(torch.nn.Module):
|
||||
def get_trainable_parameters(self) -> Any:
|
||||
yield from ()
|
||||
|
||||
def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, dict]:
|
||||
def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, Optional[dict]]:
|
||||
posterior = DiagonalGaussianDistribution(z)
|
||||
if self.sample:
|
||||
z = posterior.sample()
|
||||
@ -28,13 +31,15 @@ class DiagonalGaussianRegularizer(torch.nn.Module):
|
||||
z = posterior.mode()
|
||||
return z, None
|
||||
|
||||
|
||||
class EmptyRegularizer(torch.nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, dict]:
|
||||
def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, Optional[dict]]:
|
||||
return z, None
|
||||
|
||||
|
||||
class AbstractAutoencoder(torch.nn.Module):
|
||||
"""
|
||||
This is the base class for all autoencoders, including image autoencoders, image autoencoders with discriminators,
|
||||
@ -181,8 +186,26 @@ class AutoencodingEngineLegacy(AutoencodingEngine):
|
||||
self.post_quant_conv = conv_op(embed_dim, ddconfig["z_channels"], 1)
|
||||
self.embed_dim = embed_dim
|
||||
|
||||
if ddconfig.get("batch_norm_latent", False):
|
||||
self.bn_eps = 1e-4
|
||||
self.bn_momentum = 0.1
|
||||
self.ps = [2, 2]
|
||||
self.bn = torch.nn.BatchNorm2d(math.prod(self.ps) * ddconfig["z_channels"],
|
||||
eps=self.bn_eps,
|
||||
momentum=self.bn_momentum,
|
||||
affine=False,
|
||||
track_running_stats=True,
|
||||
)
|
||||
self.bn.eval()
|
||||
else:
|
||||
self.bn = None
|
||||
|
||||
def get_autoencoder_params(self) -> list:
|
||||
return list(self.parameters())
|
||||
|
||||
def encode(
|
||||
self, x: torch.Tensor, return_reg_log: bool = False
|
||||
self, x: torch.Tensor, return_reg_log: bool = False,
|
||||
unregularized: bool = False,
|
||||
) -> Union[torch.Tensor, Tuple[torch.Tensor, dict]]:
|
||||
if self.max_batch_size is None:
|
||||
z = self.encoder(x)
|
||||
@ -199,11 +222,36 @@ class AutoencodingEngineLegacy(AutoencodingEngine):
|
||||
z = torch.cat(z, 0)
|
||||
|
||||
z, reg_log = self.regularization(z)
|
||||
|
||||
if self.bn is not None:
|
||||
z = rearrange(z,
|
||||
"... c (i pi) (j pj) -> ... (c pi pj) i j",
|
||||
pi=self.ps[0],
|
||||
pj=self.ps[1],
|
||||
)
|
||||
|
||||
z = torch.nn.functional.batch_norm(z,
|
||||
cast_to(self.bn.running_mean, dtype=z.dtype, device=z.device),
|
||||
cast_to(self.bn.running_var, dtype=z.dtype, device=z.device),
|
||||
momentum=self.bn_momentum,
|
||||
eps=self.bn_eps)
|
||||
|
||||
if return_reg_log:
|
||||
return z, reg_log
|
||||
return z
|
||||
|
||||
def decode(self, z: torch.Tensor, **decoder_kwargs) -> torch.Tensor:
|
||||
if self.bn is not None:
|
||||
s = torch.sqrt(cast_to(self.bn.running_var.view(1, -1, 1, 1), dtype=z.dtype, device=z.device) + self.bn_eps)
|
||||
m = cast_to(self.bn.running_mean.view(1, -1, 1, 1), dtype=z.dtype, device=z.device)
|
||||
z = z * s + m
|
||||
z = rearrange(
|
||||
z,
|
||||
"... (c pi pj) i j -> ... c (i pi) (j pj)",
|
||||
pi=self.ps[0],
|
||||
pj=self.ps[1],
|
||||
)
|
||||
|
||||
if self.max_batch_size is None:
|
||||
dec = self.post_quant_conv(z)
|
||||
dec = self.decoder(dec, **decoder_kwargs)
|
||||
|
||||
@ -557,6 +557,7 @@ def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_resha
|
||||
|
||||
@wrap_attn
|
||||
def attention_sage(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False, **kwargs):
|
||||
exception_fallback = False
|
||||
if skip_reshape:
|
||||
b, _, _, dim_head = q.shape
|
||||
tensor_layout = "HND"
|
||||
@ -581,6 +582,8 @@ def attention_sage(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=
|
||||
out = sageattn(q, k, v, attn_mask=mask, is_causal=False, tensor_layout=tensor_layout)
|
||||
except Exception as e:
|
||||
logger.error("Error running sage attention: {}, using pytorch attention instead.".format(e))
|
||||
exception_fallback = True
|
||||
if exception_fallback:
|
||||
if tensor_layout == "NHD":
|
||||
q, k, v = map(
|
||||
lambda t: t.transpose(1, 2),
|
||||
|
||||
@ -213,12 +213,14 @@ class TimestepEmbedder(nn.Module):
|
||||
Embeds scalar timesteps into vector representations.
|
||||
"""
|
||||
|
||||
def __init__(self, hidden_size, frequency_embedding_size=256, dtype=None, device=None, operations=None):
|
||||
def __init__(self, hidden_size, frequency_embedding_size=256, output_size=None, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
if output_size is None:
|
||||
output_size = hidden_size
|
||||
self.mlp = nn.Sequential(
|
||||
operations.Linear(frequency_embedding_size, hidden_size, bias=True, dtype=dtype, device=device),
|
||||
nn.SiLU(),
|
||||
operations.Linear(hidden_size, hidden_size, bias=True, dtype=dtype, device=device),
|
||||
operations.Linear(hidden_size, output_size, bias=True, dtype=dtype, device=device),
|
||||
)
|
||||
self.frequency_embedding_size = frequency_embedding_size
|
||||
|
||||
|
||||
@ -18,6 +18,13 @@ if model_management.xformers_enabled_vae():
|
||||
import xformers.ops # pylint: disable=import-error
|
||||
|
||||
|
||||
def torch_cat_if_needed(xl, dim):
|
||||
if len(xl) > 1:
|
||||
return torch.cat(xl, dim)
|
||||
else:
|
||||
return xl[0]
|
||||
|
||||
|
||||
def get_timestep_embedding(timesteps, embedding_dim):
|
||||
"""
|
||||
This matches the implementation in Denoising Diffusion Probabilistic Models:
|
||||
@ -41,13 +48,43 @@ def get_timestep_embedding(timesteps, embedding_dim):
|
||||
|
||||
def nonlinearity(x):
|
||||
# swish
|
||||
return torch.nn.functional.silu(x)
|
||||
return torch.nn.functional.silu(x)
|
||||
|
||||
|
||||
def Normalize(in_channels, num_groups=32):
|
||||
return ops.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True)
|
||||
|
||||
|
||||
class CarriedConv3d(nn.Module):
|
||||
def __init__(self, n_channels, out_channels, kernel_size, stride=1, dilation=1, padding=0, **kwargs):
|
||||
super().__init__()
|
||||
self.conv = ops.Conv3d(n_channels, out_channels, kernel_size, stride=stride, dilation=dilation, **kwargs)
|
||||
|
||||
def forward(self, x):
|
||||
return self.conv(x)
|
||||
|
||||
|
||||
def conv_carry_causal_3d(xl, op, conv_carry_in=None, conv_carry_out=None):
|
||||
x = xl[0]
|
||||
xl.clear()
|
||||
|
||||
if isinstance(op, CarriedConv3d):
|
||||
if conv_carry_in is None:
|
||||
x = torch.nn.functional.pad(x, (1, 1, 1, 1, 2, 0), mode='replicate')
|
||||
else:
|
||||
carry_len = conv_carry_in[0].shape[2]
|
||||
x = torch.nn.functional.pad(x, (1, 1, 1, 1, 2 - carry_len, 0), mode='replicate')
|
||||
x = torch.cat([conv_carry_in.pop(0), x], dim=2)
|
||||
|
||||
if conv_carry_out is not None:
|
||||
to_push = x[:, :, -2:, :, :].clone()
|
||||
conv_carry_out.append(to_push)
|
||||
|
||||
out = op(x)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
class VideoConv3d(nn.Module):
|
||||
def __init__(self, n_channels, out_channels, kernel_size, stride=1, dilation=1, padding_mode='replicate', padding=1, **kwargs):
|
||||
super().__init__()
|
||||
@ -96,29 +133,24 @@ class Upsample(nn.Module):
|
||||
stride=1,
|
||||
padding=1)
|
||||
|
||||
def forward(self, x):
|
||||
def forward(self, x, conv_carry_in=None, conv_carry_out=None):
|
||||
scale_factor = self.scale_factor
|
||||
if isinstance(scale_factor, (int, float)):
|
||||
scale_factor = (scale_factor,) * (x.ndim - 2)
|
||||
|
||||
if x.ndim == 5 and scale_factor[0] > 1.0:
|
||||
t = x.shape[2]
|
||||
if t > 1:
|
||||
a, b = x.split((1, t - 1), dim=2)
|
||||
del x
|
||||
b = interpolate_up(b, scale_factor)
|
||||
else:
|
||||
a = x
|
||||
|
||||
a = interpolate_up(a.squeeze(2), scale_factor=scale_factor[1:]).unsqueeze(2)
|
||||
if t > 1:
|
||||
x = torch.cat((a, b), dim=2)
|
||||
else:
|
||||
x = a
|
||||
results = []
|
||||
if conv_carry_in is None:
|
||||
first = x[:, :, :1, :, :]
|
||||
results.append(interpolate_up(first.squeeze(2), scale_factor=scale_factor[1:]).unsqueeze(2))
|
||||
x = x[:, :, 1:, :, :]
|
||||
if x.shape[2] > 0:
|
||||
results.append(interpolate_up(x, scale_factor))
|
||||
x = torch_cat_if_needed(results, dim=2)
|
||||
else:
|
||||
x = interpolate_up(x, scale_factor)
|
||||
if self.with_conv:
|
||||
x = self.conv(x)
|
||||
x = conv_carry_causal_3d([x], self.conv, conv_carry_in, conv_carry_out)
|
||||
return x
|
||||
|
||||
|
||||
@ -134,17 +166,20 @@ class Downsample(nn.Module):
|
||||
stride=stride,
|
||||
padding=0)
|
||||
|
||||
def forward(self, x):
|
||||
def forward(self, x, conv_carry_in=None, conv_carry_out=None):
|
||||
if self.with_conv:
|
||||
if x.ndim == 4:
|
||||
if isinstance(self.conv, CarriedConv3d):
|
||||
x = conv_carry_causal_3d([x], self.conv, conv_carry_in, conv_carry_out)
|
||||
elif x.ndim == 4:
|
||||
pad = (0, 1, 0, 1)
|
||||
mode = "constant"
|
||||
x = torch.nn.functional.pad(x, pad, mode=mode, value=0)
|
||||
x = self.conv(x)
|
||||
elif x.ndim == 5:
|
||||
pad = (1, 1, 1, 1, 2, 0)
|
||||
mode = "replicate"
|
||||
x = torch.nn.functional.pad(x, pad, mode=mode)
|
||||
x = self.conv(x)
|
||||
x = self.conv(x)
|
||||
else:
|
||||
x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
|
||||
return x
|
||||
@ -190,23 +225,23 @@ class ResnetBlock(nn.Module):
|
||||
stride=1,
|
||||
padding=0)
|
||||
|
||||
def forward(self, x, temb=None):
|
||||
def forward(self, x, temb=None, conv_carry_in=None, conv_carry_out=None):
|
||||
h = x
|
||||
h = self.norm1(h)
|
||||
h = self.swish(h)
|
||||
h = self.conv1(h)
|
||||
h = [self.swish(h)]
|
||||
h = conv_carry_causal_3d(h, self.conv1, conv_carry_in=conv_carry_in, conv_carry_out=conv_carry_out)
|
||||
|
||||
if temb is not None:
|
||||
h = h + self.temb_proj(self.swish(temb))[:, :, None, None]
|
||||
|
||||
h = self.norm2(h)
|
||||
h = self.swish(h)
|
||||
h = self.dropout(h)
|
||||
h = self.conv2(h)
|
||||
h = [self.dropout(h)]
|
||||
h = conv_carry_causal_3d(h, self.conv2, conv_carry_in=conv_carry_in, conv_carry_out=conv_carry_out)
|
||||
|
||||
if self.in_channels != self.out_channels:
|
||||
if self.use_conv_shortcut:
|
||||
x = self.conv_shortcut(x)
|
||||
x = conv_carry_causal_3d([x], self.conv_shortcut, conv_carry_in=conv_carry_in, conv_carry_out=conv_carry_out)
|
||||
else:
|
||||
x = self.nin_shortcut(x)
|
||||
|
||||
@ -290,6 +325,7 @@ def pytorch_attention(q, k, v):
|
||||
orig_shape = q.shape
|
||||
B = orig_shape[0]
|
||||
C = orig_shape[1]
|
||||
oom_fallback = False
|
||||
q, k, v = map(
|
||||
lambda t: t.view(B, 1, C, -1).transpose(2, 3).contiguous(),
|
||||
(q, k, v),
|
||||
@ -300,6 +336,8 @@ def pytorch_attention(q, k, v):
|
||||
out = out.transpose(2, 3).reshape(orig_shape)
|
||||
except model_management.OOM_EXCEPTION:
|
||||
logger.warning("scaled_dot_product_attention OOMed: switched to slice attention")
|
||||
oom_fallback = True
|
||||
if oom_fallback:
|
||||
out = slice_attention(q.view(B, -1, C), k.view(B, -1, C).transpose(1, 2), v.view(B, -1, C).transpose(1, 2)).reshape(orig_shape)
|
||||
return out
|
||||
|
||||
@ -529,9 +567,14 @@ class Encoder(nn.Module):
|
||||
self.num_res_blocks = num_res_blocks
|
||||
self.resolution = resolution
|
||||
self.in_channels = in_channels
|
||||
self.carried = False
|
||||
|
||||
if conv3d:
|
||||
conv_op = VideoConv3d
|
||||
if not attn_resolutions:
|
||||
conv_op = CarriedConv3d
|
||||
self.carried = True
|
||||
else:
|
||||
conv_op = VideoConv3d
|
||||
mid_attn_conv_op = ops.Conv3d
|
||||
else:
|
||||
conv_op = ops.Conv2d
|
||||
@ -544,6 +587,7 @@ class Encoder(nn.Module):
|
||||
stride=1,
|
||||
padding=1)
|
||||
|
||||
self.time_compress = 1
|
||||
curr_res = resolution
|
||||
in_ch_mult = (1,) + tuple(ch_mult)
|
||||
self.in_ch_mult = in_ch_mult
|
||||
@ -570,10 +614,15 @@ class Encoder(nn.Module):
|
||||
if time_compress is not None:
|
||||
if (self.num_resolutions - 1 - i_level) > math.log2(time_compress):
|
||||
stride = (1, 2, 2)
|
||||
else:
|
||||
self.time_compress *= 2
|
||||
down.downsample = Downsample(block_in, resamp_with_conv, stride=stride, conv_op=conv_op)
|
||||
curr_res = curr_res // 2
|
||||
self.down.append(down)
|
||||
|
||||
if time_compress is not None:
|
||||
self.time_compress = time_compress
|
||||
|
||||
# middle
|
||||
self.mid = nn.Module()
|
||||
self.mid.block_1 = ResnetBlock(in_channels=block_in,
|
||||
@ -599,15 +648,42 @@ class Encoder(nn.Module):
|
||||
def forward(self, x):
|
||||
# timestep embedding
|
||||
temb = None
|
||||
# downsampling
|
||||
h = self.conv_in(x)
|
||||
for i_level in range(self.num_resolutions):
|
||||
for i_block in range(self.num_res_blocks):
|
||||
h = self.down[i_level].block[i_block](h, temb)
|
||||
if len(self.down[i_level].attn) > 0:
|
||||
h = self.down[i_level].attn[i_block](h)
|
||||
if i_level != self.num_resolutions - 1:
|
||||
h = self.down[i_level].downsample(h)
|
||||
|
||||
if self.carried:
|
||||
xl = [x[:, :, :1, :, :]]
|
||||
if x.shape[2] > self.time_compress:
|
||||
tc = self.time_compress
|
||||
xl += torch.split(x[:, :, 1: 1 + ((x.shape[2] - 1) // tc) * tc, :, :], tc * 2, dim=2)
|
||||
x = xl
|
||||
else:
|
||||
x = [x]
|
||||
out = []
|
||||
|
||||
conv_carry_in = None
|
||||
|
||||
for i, x1 in enumerate(x):
|
||||
conv_carry_out = []
|
||||
if i == len(x) - 1:
|
||||
conv_carry_out = None
|
||||
|
||||
# downsampling
|
||||
x1 = [x1]
|
||||
h1 = conv_carry_causal_3d(x1, self.conv_in, conv_carry_in, conv_carry_out)
|
||||
|
||||
for i_level in range(self.num_resolutions):
|
||||
for i_block in range(self.num_res_blocks):
|
||||
h1 = self.down[i_level].block[i_block](h1, temb, conv_carry_in, conv_carry_out)
|
||||
if len(self.down[i_level].attn) > 0:
|
||||
assert i == 0 # carried should not happen if attn exists
|
||||
h1 = self.down[i_level].attn[i_block](h1)
|
||||
if i_level != self.num_resolutions - 1:
|
||||
h1 = self.down[i_level].downsample(h1, conv_carry_in, conv_carry_out)
|
||||
|
||||
out.append(h1)
|
||||
conv_carry_in = conv_carry_out
|
||||
|
||||
h = torch_cat_if_needed(out, dim=2)
|
||||
del out
|
||||
|
||||
# middle
|
||||
h = self.mid.block_1(h, temb)
|
||||
@ -616,15 +692,15 @@ class Encoder(nn.Module):
|
||||
|
||||
# end
|
||||
h = self.norm_out(h)
|
||||
h = nonlinearity(h)
|
||||
h = self.conv_out(h)
|
||||
h = [nonlinearity(h)]
|
||||
h = conv_carry_causal_3d(h, self.conv_out)
|
||||
return h
|
||||
|
||||
|
||||
class Decoder(nn.Module):
|
||||
def __init__(self, *, ch, out_ch, ch_mult=(1, 2, 4, 8), num_res_blocks,
|
||||
attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
|
||||
resolution, z_channels, give_pre_end=False, tanh_out=False, use_linear_attn=False,
|
||||
resolution, z_channels, tanh_out=False, use_linear_attn=False,
|
||||
conv_out_op=ops.Conv2d,
|
||||
resnet_op=ResnetBlock,
|
||||
attn_op=AttnBlock,
|
||||
@ -638,12 +714,18 @@ class Decoder(nn.Module):
|
||||
self.num_res_blocks = num_res_blocks
|
||||
self.resolution = resolution
|
||||
self.in_channels = in_channels
|
||||
self.give_pre_end = give_pre_end
|
||||
self.tanh_out = tanh_out
|
||||
self.carried = False
|
||||
|
||||
if conv3d:
|
||||
conv_op = VideoConv3d
|
||||
conv_out_op = VideoConv3d
|
||||
if not attn_resolutions and resnet_op == ResnetBlock:
|
||||
conv_op = CarriedConv3d
|
||||
conv_out_op = CarriedConv3d
|
||||
self.carried = True
|
||||
else:
|
||||
conv_op = VideoConv3d
|
||||
conv_out_op = VideoConv3d
|
||||
|
||||
mid_attn_conv_op = ops.Conv3d
|
||||
else:
|
||||
conv_op = ops.Conv2d
|
||||
@ -718,29 +800,43 @@ class Decoder(nn.Module):
|
||||
temb = None
|
||||
|
||||
# z to block_in
|
||||
h = self.conv_in(z)
|
||||
h = conv_carry_causal_3d([z], self.conv_in)
|
||||
|
||||
# middle
|
||||
h = self.mid.block_1(h, temb, **kwargs)
|
||||
h = self.mid.attn_1(h, **kwargs)
|
||||
h = self.mid.block_2(h, temb, **kwargs)
|
||||
|
||||
if self.carried:
|
||||
h = torch.split(h, 2, dim=2)
|
||||
else:
|
||||
h = [h]
|
||||
out = []
|
||||
|
||||
conv_carry_in = None
|
||||
|
||||
# upsampling
|
||||
for i_level in reversed(range(self.num_resolutions)):
|
||||
for i_block in range(self.num_res_blocks + 1):
|
||||
h = self.up[i_level].block[i_block](h, temb, **kwargs)
|
||||
if len(self.up[i_level].attn) > 0:
|
||||
h = self.up[i_level].attn[i_block](h, **kwargs)
|
||||
if i_level != 0:
|
||||
h = self.up[i_level].upsample(h)
|
||||
for i, h1 in enumerate(h):
|
||||
conv_carry_out = []
|
||||
if i == len(h) - 1:
|
||||
conv_carry_out = None
|
||||
for i_level in reversed(range(self.num_resolutions)):
|
||||
for i_block in range(self.num_res_blocks + 1):
|
||||
h1 = self.up[i_level].block[i_block](h1, temb, conv_carry_in, conv_carry_out, **kwargs)
|
||||
if len(self.up[i_level].attn) > 0:
|
||||
assert i == 0 # carried should not happen if attn exists
|
||||
h1 = self.up[i_level].attn[i_block](h1, **kwargs)
|
||||
if i_level != 0:
|
||||
h1 = self.up[i_level].upsample(h1, conv_carry_in, conv_carry_out)
|
||||
|
||||
# end
|
||||
if self.give_pre_end:
|
||||
return h
|
||||
h1 = self.norm_out(h1)
|
||||
h1 = [nonlinearity(h1)]
|
||||
h1 = conv_carry_causal_3d(h1, self.conv_out, conv_carry_in, conv_carry_out)
|
||||
if self.tanh_out:
|
||||
h1 = torch.tanh(h1)
|
||||
out.append(h1)
|
||||
conv_carry_in = conv_carry_out
|
||||
|
||||
h = self.norm_out(h)
|
||||
h = nonlinearity(h)
|
||||
h = self.conv_out(h, **kwargs)
|
||||
if self.tanh_out:
|
||||
h = torch.tanh(h)
|
||||
return h
|
||||
out = torch_cat_if_needed(out, dim=2)
|
||||
|
||||
return out
|
||||
|
||||
@ -44,7 +44,7 @@ class QwenImageControlNetModel(QwenImageTransformer2DModel):
|
||||
txt_start = round(max(((x.shape[-1] + (self.patch_size // 2)) // self.patch_size) // 2, ((x.shape[-2] + (self.patch_size // 2)) // self.patch_size) // 2))
|
||||
txt_ids = torch.arange(txt_start, txt_start + context.shape[1], device=x.device).reshape(1, -1, 1).repeat(x.shape[0], 1, 3)
|
||||
ids = torch.cat((txt_ids, img_ids), dim=1)
|
||||
image_rotary_emb = self.pe_embedder(ids).squeeze(1).unsqueeze(2).to(x.dtype)
|
||||
image_rotary_emb = self.pe_embedder(ids).to(x.dtype).contiguous()
|
||||
del ids, txt_ids, img_ids
|
||||
|
||||
hidden_states = self.img_in(hidden_states) + self.controlnet_x_embedder(hint)
|
||||
|
||||
@ -10,6 +10,7 @@ from ..flux.layers import EmbedND
|
||||
from ..lightricks.model import TimestepEmbedding, Timesteps
|
||||
from ..modules.attention import optimized_attention_masked
|
||||
from ...patcher_extension import WrapperExecutor, get_all_wrappers, WrappersMP
|
||||
from ..flux.math import apply_rope1
|
||||
|
||||
|
||||
class GELU(nn.Module):
|
||||
@ -137,33 +138,34 @@ class Attention(nn.Module):
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
if transformer_options is None:
|
||||
transformer_options = {}
|
||||
batch_size = hidden_states.shape[0]
|
||||
seq_img = hidden_states.shape[1]
|
||||
seq_txt = encoder_hidden_states.shape[1]
|
||||
|
||||
img_query = self.to_q(hidden_states).unflatten(-1, (self.heads, -1))
|
||||
img_key = self.to_k(hidden_states).unflatten(-1, (self.heads, -1))
|
||||
img_value = self.to_v(hidden_states).unflatten(-1, (self.heads, -1))
|
||||
# Project and reshape to BHND format (batch, heads, seq, dim)
|
||||
img_query = self.to_q(hidden_states).view(batch_size, seq_img, self.heads, -1).transpose(1, 2).contiguous()
|
||||
img_key = self.to_k(hidden_states).view(batch_size, seq_img, self.heads, -1).transpose(1, 2).contiguous()
|
||||
img_value = self.to_v(hidden_states).view(batch_size, seq_img, self.heads, -1).transpose(1, 2)
|
||||
|
||||
txt_query = self.add_q_proj(encoder_hidden_states).unflatten(-1, (self.heads, -1))
|
||||
txt_key = self.add_k_proj(encoder_hidden_states).unflatten(-1, (self.heads, -1))
|
||||
txt_value = self.add_v_proj(encoder_hidden_states).unflatten(-1, (self.heads, -1))
|
||||
txt_query = self.add_q_proj(encoder_hidden_states).view(batch_size, seq_txt, self.heads, -1).transpose(1, 2).contiguous()
|
||||
txt_key = self.add_k_proj(encoder_hidden_states).view(batch_size, seq_txt, self.heads, -1).transpose(1, 2).contiguous()
|
||||
txt_value = self.add_v_proj(encoder_hidden_states).view(batch_size, seq_txt, self.heads, -1).transpose(1, 2)
|
||||
|
||||
img_query = self.norm_q(img_query)
|
||||
img_key = self.norm_k(img_key)
|
||||
txt_query = self.norm_added_q(txt_query)
|
||||
txt_key = self.norm_added_k(txt_key)
|
||||
|
||||
joint_query = torch.cat([txt_query, img_query], dim=1)
|
||||
joint_key = torch.cat([txt_key, img_key], dim=1)
|
||||
joint_value = torch.cat([txt_value, img_value], dim=1)
|
||||
joint_query = torch.cat([txt_query, img_query], dim=2)
|
||||
joint_key = torch.cat([txt_key, img_key], dim=2)
|
||||
joint_value = torch.cat([txt_value, img_value], dim=2)
|
||||
|
||||
joint_query = apply_rotary_emb(joint_query, image_rotary_emb)
|
||||
joint_key = apply_rotary_emb(joint_key, image_rotary_emb)
|
||||
joint_query = apply_rope1(joint_query, image_rotary_emb)
|
||||
joint_key = apply_rope1(joint_key, image_rotary_emb)
|
||||
|
||||
joint_query = joint_query.flatten(start_dim=2)
|
||||
joint_key = joint_key.flatten(start_dim=2)
|
||||
joint_value = joint_value.flatten(start_dim=2)
|
||||
|
||||
joint_hidden_states = optimized_attention_masked(joint_query, joint_key, joint_value, self.heads, attention_mask, transformer_options=transformer_options)
|
||||
joint_hidden_states = optimized_attention_masked(joint_query, joint_key, joint_value, self.heads,
|
||||
attention_mask, transformer_options=transformer_options,
|
||||
skip_reshape=True)
|
||||
|
||||
txt_attn_output = joint_hidden_states[:, :seq_txt, :]
|
||||
img_attn_output = joint_hidden_states[:, seq_txt:, :]
|
||||
@ -239,10 +241,10 @@ class QwenImageTransformerBlock(nn.Module):
|
||||
img_mod1, img_mod2 = img_mod_params.chunk(2, dim=-1)
|
||||
txt_mod1, txt_mod2 = txt_mod_params.chunk(2, dim=-1)
|
||||
|
||||
img_normed = self.img_norm1(hidden_states)
|
||||
img_modulated, img_gate1 = self._modulate(img_normed, img_mod1)
|
||||
txt_normed = self.txt_norm1(encoder_hidden_states)
|
||||
txt_modulated, txt_gate1 = self._modulate(txt_normed, txt_mod1)
|
||||
img_modulated, img_gate1 = self._modulate(self.img_norm1(hidden_states), img_mod1)
|
||||
del img_mod1
|
||||
txt_modulated, txt_gate1 = self._modulate(self.txt_norm1(encoder_hidden_states), txt_mod1)
|
||||
del txt_mod1
|
||||
|
||||
img_attn_output, txt_attn_output = self.attn(
|
||||
hidden_states=img_modulated,
|
||||
@ -251,16 +253,20 @@ class QwenImageTransformerBlock(nn.Module):
|
||||
image_rotary_emb=image_rotary_emb,
|
||||
transformer_options=transformer_options,
|
||||
)
|
||||
del img_modulated
|
||||
del txt_modulated
|
||||
|
||||
hidden_states = hidden_states + img_gate1 * img_attn_output
|
||||
encoder_hidden_states = encoder_hidden_states + txt_gate1 * txt_attn_output
|
||||
del img_attn_output
|
||||
del txt_attn_output
|
||||
del img_gate1
|
||||
del txt_gate1
|
||||
|
||||
img_normed2 = self.img_norm2(hidden_states)
|
||||
img_modulated2, img_gate2 = self._modulate(img_normed2, img_mod2)
|
||||
img_modulated2, img_gate2 = self._modulate(self.img_norm2(hidden_states), img_mod2)
|
||||
hidden_states = torch.addcmul(hidden_states, img_gate2, self.img_mlp(img_modulated2))
|
||||
|
||||
txt_normed2 = self.txt_norm2(encoder_hidden_states)
|
||||
txt_modulated2, txt_gate2 = self._modulate(txt_normed2, txt_mod2)
|
||||
txt_modulated2, txt_gate2 = self._modulate(self.txt_norm2(encoder_hidden_states), txt_mod2)
|
||||
encoder_hidden_states = torch.addcmul(encoder_hidden_states, txt_gate2, self.txt_mlp(txt_modulated2))
|
||||
|
||||
return encoder_hidden_states, hidden_states
|
||||
@ -421,7 +427,7 @@ class QwenImageTransformer2DModel(nn.Module):
|
||||
txt_start = round(max(((x.shape[-1] + (self.patch_size // 2)) // self.patch_size) // 2, ((x.shape[-2] + (self.patch_size // 2)) // self.patch_size) // 2))
|
||||
txt_ids = torch.arange(txt_start, txt_start + context.shape[1], device=x.device).reshape(1, -1, 1).repeat(x.shape[0], 1, 3)
|
||||
ids = torch.cat((txt_ids, img_ids), dim=1)
|
||||
image_rotary_emb = self.pe_embedder(ids).squeeze(1).unsqueeze(2).to(x.dtype)
|
||||
image_rotary_emb = self.pe_embedder(ids).to(x.dtype).contiguous()
|
||||
del ids, txt_ids, img_ids
|
||||
|
||||
hidden_states = self.img_in(hidden_states)
|
||||
@ -441,7 +447,10 @@ class QwenImageTransformer2DModel(nn.Module):
|
||||
patches = transformer_options.get("patches", {})
|
||||
blocks_replace = patches_replace.get("dit", {})
|
||||
|
||||
transformer_options["total_blocks"] = len(self.transformer_blocks)
|
||||
transformer_options["block_type"] = "double"
|
||||
for i, block in enumerate(self.transformer_blocks):
|
||||
transformer_options["block_index"] = i
|
||||
if ("double_block", i) in blocks_replace:
|
||||
def block_wrap(args):
|
||||
out = {}
|
||||
|
||||
@ -246,6 +246,7 @@ class WanAttentionBlock(nn.Module):
|
||||
# assert e[0].dtype == torch.float32
|
||||
|
||||
# self-attention
|
||||
x = x.contiguous() # otherwise implicit in LayerNorm
|
||||
y = self.self_attn(
|
||||
torch.addcmul(repeat_e(e[0], x), self.norm1(x), 1 + repeat_e(e[1], x)),
|
||||
freqs, transformer_options=transformer_options)
|
||||
@ -615,7 +616,7 @@ class WanModel(torch.nn.Module):
|
||||
x = self.unpatchify(x, grid_sizes)
|
||||
return x
|
||||
|
||||
def rope_encode(self, t, h, w, t_start=0, steps_t=None, steps_h=None, steps_w=None, device=None, dtype=None):
|
||||
def rope_encode(self, t, h, w, t_start=0, steps_t=None, steps_h=None, steps_w=None, device=None, dtype=None, transformer_options={}):
|
||||
patch_size = self.patch_size
|
||||
t_len = ((t + (patch_size[0] // 2)) // patch_size[0])
|
||||
h_len = ((h + (patch_size[1] // 2)) // patch_size[1])
|
||||
@ -628,10 +629,22 @@ class WanModel(torch.nn.Module):
|
||||
if steps_w is None:
|
||||
steps_w = w_len
|
||||
|
||||
h_start = 0
|
||||
w_start = 0
|
||||
rope_options = transformer_options.get("rope_options", None)
|
||||
if rope_options is not None:
|
||||
t_len = (t_len - 1.0) * rope_options.get("scale_t", 1.0) + 1.0
|
||||
h_len = (h_len - 1.0) * rope_options.get("scale_y", 1.0) + 1.0
|
||||
w_len = (w_len - 1.0) * rope_options.get("scale_x", 1.0) + 1.0
|
||||
|
||||
t_start += rope_options.get("shift_t", 0.0)
|
||||
h_start += rope_options.get("shift_y", 0.0)
|
||||
w_start += rope_options.get("shift_x", 0.0)
|
||||
|
||||
img_ids = torch.zeros((steps_t, steps_h, steps_w, 3), device=device, dtype=dtype)
|
||||
img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(t_start, t_start + (t_len - 1), steps=steps_t, device=device, dtype=dtype).reshape(-1, 1, 1)
|
||||
img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(0, h_len - 1, steps=steps_h, device=device, dtype=dtype).reshape(1, -1, 1)
|
||||
img_ids[:, :, :, 2] = img_ids[:, :, :, 2] + torch.linspace(0, w_len - 1, steps=steps_w, device=device, dtype=dtype).reshape(1, 1, -1)
|
||||
img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(h_start, h_start + (h_len - 1), steps=steps_h, device=device, dtype=dtype).reshape(1, -1, 1)
|
||||
img_ids[:, :, :, 2] = img_ids[:, :, :, 2] + torch.linspace(w_start, w_start + (w_len - 1), steps=steps_w, device=device, dtype=dtype).reshape(1, 1, -1)
|
||||
img_ids = img_ids.reshape(1, -1, img_ids.shape[-1])
|
||||
|
||||
freqs = self.rope_embedder(img_ids).movedim(1, 2)
|
||||
@ -661,7 +674,7 @@ class WanModel(torch.nn.Module):
|
||||
if self.ref_conv is not None and "reference_latent" in kwargs:
|
||||
t_len += 1
|
||||
|
||||
freqs = self.rope_encode(t_len, h, w, device=x.device, dtype=x.dtype)
|
||||
freqs = self.rope_encode(t_len, h, w, device=x.device, dtype=x.dtype, transformer_options=transformer_options)
|
||||
return self.forward_orig(x, timestep, context, clip_fea=clip_fea, freqs=freqs, transformer_options=transformer_options, **kwargs)[:, :, :t, :h, :w]
|
||||
|
||||
def unpatchify(self, x, grid_sizes):
|
||||
|
||||
@ -326,6 +326,23 @@ def model_lora_keys_unet(model, key_map=None):
|
||||
key_map["transformer.{}".format(key_lora)] = k
|
||||
key_map["lycoris_{}".format(key_lora.replace(".", "_"))] = k # SimpleTuner lycoris format
|
||||
|
||||
if isinstance(model, model_base.Lumina2):
|
||||
diffusers_keys = utils.z_image_to_diffusers(model.model_config.unet_config, output_prefix="diffusion_model.")
|
||||
for k in diffusers_keys:
|
||||
if k.endswith(".weight"):
|
||||
to = diffusers_keys[k]
|
||||
key_lora = k[:-len(".weight")]
|
||||
key_map["diffusion_model.{}".format(key_lora)] = to
|
||||
key_map["transformer.{}".format(key_lora)] = to
|
||||
key_map["lycoris_{}".format(key_lora.replace(".", "_"))] = to
|
||||
|
||||
if isinstance(model, model_base.Kandinsky5):
|
||||
for k in sdk:
|
||||
if k.startswith("diffusion_model.") and k.endswith(".weight"):
|
||||
key_lora = k[len("diffusion_model."):-len(".weight")]
|
||||
key_map["{}".format(key_lora)] = k
|
||||
key_map["transformer.{}".format(key_lora)] = k
|
||||
|
||||
return key_map
|
||||
|
||||
|
||||
|
||||
@ -53,6 +53,7 @@ from .ldm.modules.encoders.noise_aug_modules import CLIPEmbeddingNoiseAugmentati
|
||||
from .ldm.chroma_radiance import model as chroma_radiance
|
||||
from .ldm.omnigen.omnigen2 import OmniGen2Transformer2DModel
|
||||
from .ldm.pixart.pixartms import PixArtMS
|
||||
from .ldm.kandinsky5 import model as kadinsky5_model
|
||||
from .ldm.qwen_image.model import QwenImageTransformer2DModel
|
||||
from .ldm.wan.model import WanModel, VaceWanModel, CameraWanModel, WanModel_S2V, HumoWanModel
|
||||
from .ldm.wan.model_animate import AnimateWanModel
|
||||
@ -149,7 +150,7 @@ class BaseModel(torch.nn.Module):
|
||||
if not unet_config.get("disable_unet_model_creation", False):
|
||||
if model_config.custom_operations is None:
|
||||
fp8 = model_config.optimizations.get("fp8", False)
|
||||
operations = ops.pick_operations(unet_config.get("dtype", None), self.manual_cast_dtype, fp8_optimizations=fp8, scaled_fp8=model_config.scaled_fp8)
|
||||
operations = ops.pick_operations(unet_config.get("dtype", None), self.manual_cast_dtype, fp8_optimizations=fp8, model_config=model_config)
|
||||
else:
|
||||
operations = model_config.custom_operations
|
||||
self.operations = operations
|
||||
@ -216,8 +217,14 @@ class BaseModel(torch.nn.Module):
|
||||
extra_conds[o] = extra
|
||||
|
||||
t = self.process_timestep(t, x=x, **extra_conds)
|
||||
model_output = self.diffusion_model(xc, t, context=context, control=control, transformer_options=transformer_options, **extra_conds).float()
|
||||
return self.model_sampling.calculate_denoised(sigma, model_output, x)
|
||||
if "latent_shapes" in extra_conds:
|
||||
xc = utils.unpack_latents(xc, extra_conds.pop("latent_shapes"))
|
||||
|
||||
model_output = self.diffusion_model(xc, t, context=context, control=control, transformer_options=transformer_options, **extra_conds)
|
||||
if len(model_output) > 1 and not torch.is_tensor(model_output):
|
||||
model_output, _ = utils.pack_latents(model_output)
|
||||
|
||||
return self.model_sampling.calculate_denoised(sigma, model_output.float(), x)
|
||||
|
||||
def process_timestep(self, timestep, **kwargs):
|
||||
return timestep
|
||||
@ -343,10 +350,6 @@ class BaseModel(torch.nn.Module):
|
||||
extra_sds.append(self.model_config.process_clip_vision_state_dict_for_saving(clip_vision_state_dict))
|
||||
|
||||
unet_state_dict = self.diffusion_model.state_dict()
|
||||
|
||||
if self.model_config.scaled_fp8 is not None:
|
||||
unet_state_dict["scaled_fp8"] = torch.tensor([], dtype=self.model_config.scaled_fp8)
|
||||
|
||||
unet_state_dict = self.model_config.process_unet_state_dict_for_saving(unet_state_dict)
|
||||
|
||||
if self.model_type == ModelType.V_PREDICTION:
|
||||
@ -921,12 +924,13 @@ class Flux(BaseModel):
|
||||
attention_mask = kwargs.get("attention_mask", None)
|
||||
if attention_mask is not None:
|
||||
shape = kwargs["noise"].shape
|
||||
mask_ref_size = kwargs["attention_mask_img_shape"]
|
||||
# the model will pad to the patch size, and then divide
|
||||
# essentially dividing and rounding up
|
||||
(h_tok, w_tok) = (math.ceil(shape[2] / self.diffusion_model.patch_size), math.ceil(shape[3] / self.diffusion_model.patch_size))
|
||||
attention_mask = utils.upscale_dit_mask(attention_mask, mask_ref_size, (h_tok, w_tok))
|
||||
out['attention_mask'] = conds.CONDRegular(attention_mask)
|
||||
mask_ref_size = kwargs.get("attention_mask_img_shape", None)
|
||||
if mask_ref_size is not None:
|
||||
# the model will pad to the patch size, and then divide
|
||||
# essentially dividing and rounding up
|
||||
(h_tok, w_tok) = (math.ceil(shape[2] / self.diffusion_model.patch_size), math.ceil(shape[3] / self.diffusion_model.patch_size))
|
||||
attention_mask = utils.upscale_dit_mask(attention_mask, mask_ref_size, (h_tok, w_tok))
|
||||
out['attention_mask'] = conds.CONDRegular(attention_mask)
|
||||
|
||||
guidance = kwargs.get("guidance", 3.5)
|
||||
if guidance is not None:
|
||||
@ -948,7 +952,19 @@ class Flux(BaseModel):
|
||||
out = {}
|
||||
ref_latents = kwargs.get("reference_latents", None)
|
||||
if ref_latents is not None:
|
||||
out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()), ref_latents)) // 16])
|
||||
out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()[2:]), ref_latents))])
|
||||
return out
|
||||
|
||||
|
||||
class Flux2(Flux):
|
||||
def extra_conds(self, **kwargs):
|
||||
out = super().extra_conds(**kwargs)
|
||||
cross_attn = kwargs.get("cross_attn", None)
|
||||
if cross_attn is not None:
|
||||
target_text_len = 512
|
||||
if cross_attn.shape[1] < target_text_len:
|
||||
cross_attn = torch.nn.functional.pad(cross_attn, (0, 0, target_text_len - cross_attn.shape[1], 0))
|
||||
out['c_crossattn'] = conds.CONDRegular(cross_attn)
|
||||
return out
|
||||
|
||||
|
||||
@ -1135,6 +1151,12 @@ class Lumina2(BaseModel):
|
||||
cross_attn = kwargs.get("cross_attn", None)
|
||||
if cross_attn is not None:
|
||||
out['c_crossattn'] = conds.CONDRegular(cross_attn)
|
||||
if 'num_tokens' not in out:
|
||||
out['num_tokens'] = conds.CONDConstant(cross_attn.shape[1])
|
||||
|
||||
clip_text_pooled = kwargs["pooled_output"] # Newbie
|
||||
if clip_text_pooled is not None:
|
||||
out['clip_text_pooled'] = conds.CONDRegular(clip_text_pooled)
|
||||
return out
|
||||
|
||||
|
||||
@ -1580,3 +1602,144 @@ class HunyuanImage21Refiner(HunyuanImage21):
|
||||
out = super().extra_conds(**kwargs)
|
||||
out['disable_time_r'] = conds.CONDConstant(True)
|
||||
return out
|
||||
|
||||
|
||||
class HunyuanVideo15(HunyuanVideo):
|
||||
def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
|
||||
super().__init__(model_config, model_type, device=device)
|
||||
|
||||
def concat_cond(self, **kwargs):
|
||||
noise = kwargs.get("noise", None)
|
||||
extra_channels = self.diffusion_model.img_in.proj.weight.shape[1] - noise.shape[1] - 1 # noise 32 img cond 32 + mask 1
|
||||
if extra_channels == 0:
|
||||
return None
|
||||
|
||||
image = kwargs.get("concat_latent_image", None)
|
||||
device = kwargs["device"]
|
||||
|
||||
if image is None:
|
||||
shape_image = list(noise.shape)
|
||||
shape_image[1] = extra_channels
|
||||
image = torch.zeros(shape_image, dtype=noise.dtype, layout=noise.layout, device=noise.device)
|
||||
else:
|
||||
latent_dim = self.latent_format.latent_channels
|
||||
image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")
|
||||
for i in range(0, image.shape[1], latent_dim):
|
||||
image[:, i: i + latent_dim] = self.process_latent_in(image[:, i: i + latent_dim])
|
||||
image = utils.resize_to_batch_size(image, noise.shape[0])
|
||||
|
||||
mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
|
||||
if mask is None:
|
||||
mask = torch.zeros_like(noise)[:, :1]
|
||||
else:
|
||||
mask = 1.0 - mask
|
||||
mask = utils.common_upscale(mask.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")
|
||||
if mask.shape[-3] < noise.shape[-3]:
|
||||
mask = torch.nn.functional.pad(mask, (0, 0, 0, 0, 0, noise.shape[-3] - mask.shape[-3]), mode='constant', value=0)
|
||||
mask = utils.resize_to_batch_size(mask, noise.shape[0])
|
||||
|
||||
return torch.cat((image, mask), dim=1)
|
||||
|
||||
def extra_conds(self, **kwargs):
|
||||
out = super().extra_conds(**kwargs)
|
||||
attention_mask = kwargs.get("attention_mask", None)
|
||||
if attention_mask is not None:
|
||||
if torch.numel(attention_mask) != attention_mask.sum():
|
||||
out['attention_mask'] = conds.CONDRegular(attention_mask)
|
||||
cross_attn = kwargs.get("cross_attn", None)
|
||||
if cross_attn is not None:
|
||||
out['c_crossattn'] = conds.CONDRegular(cross_attn)
|
||||
|
||||
conditioning_byt5small = kwargs.get("conditioning_byt5small", None)
|
||||
if conditioning_byt5small is not None:
|
||||
out['txt_byt5'] = conds.CONDRegular(conditioning_byt5small)
|
||||
|
||||
guidance = kwargs.get("guidance", 6.0)
|
||||
if guidance is not None:
|
||||
out['guidance'] = conds.CONDRegular(torch.FloatTensor([guidance]))
|
||||
|
||||
clip_vision_output = kwargs.get("clip_vision_output", None)
|
||||
if clip_vision_output is not None:
|
||||
out['clip_fea'] = conds.CONDRegular(clip_vision_output.last_hidden_state)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
class HunyuanVideo15_SR_Distilled(HunyuanVideo15):
|
||||
def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
|
||||
super().__init__(model_config, model_type, device=device)
|
||||
|
||||
def concat_cond(self, **kwargs):
|
||||
noise = kwargs.get("noise", None)
|
||||
image = kwargs.get("concat_latent_image", None)
|
||||
noise_augmentation = kwargs.get("noise_augmentation", 0.0)
|
||||
device = kwargs["device"]
|
||||
|
||||
if image is None:
|
||||
image = torch.zeros([noise.shape[0], noise.shape[1] * 2 + 2, noise.shape[-3], noise.shape[-2], noise.shape[-1]], device=model_management.intermediate_device())
|
||||
else:
|
||||
image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")
|
||||
# image = self.process_latent_in(image) # scaling wasn't applied in reference code
|
||||
image = utils.resize_to_batch_size(image, noise.shape[0])
|
||||
lq_image_slice = slice(noise.shape[1] + 1, 2 * noise.shape[1] + 1)
|
||||
if noise_augmentation > 0:
|
||||
generator = torch.Generator(device="cpu")
|
||||
generator.manual_seed(kwargs.get("seed", 0) - 10)
|
||||
noise = torch.randn(image[:, lq_image_slice].shape, generator=generator, dtype=image.dtype, device="cpu").to(image.device)
|
||||
image[:, lq_image_slice] = noise_augmentation * noise + min(1.0 - noise_augmentation, 0.75) * image[:, lq_image_slice]
|
||||
else:
|
||||
image[:, lq_image_slice] = 0.75 * image[:, lq_image_slice]
|
||||
return image
|
||||
|
||||
def extra_conds(self, **kwargs):
|
||||
out = super().extra_conds(**kwargs)
|
||||
out['disable_time_r'] = conds.CONDConstant(False)
|
||||
return out
|
||||
|
||||
|
||||
class Kandinsky5(BaseModel):
|
||||
def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
|
||||
super().__init__(model_config, model_type, device=device, unet_model=kadinsky5_model.Kandinsky5)
|
||||
|
||||
def encode_adm(self, **kwargs):
|
||||
return kwargs["pooled_output"]
|
||||
|
||||
def concat_cond(self, **kwargs):
|
||||
noise = kwargs.get("noise", None)
|
||||
device = kwargs["device"]
|
||||
image = torch.zeros_like(noise)
|
||||
|
||||
mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
|
||||
if mask is None:
|
||||
mask = torch.zeros_like(noise)[:, :1]
|
||||
else:
|
||||
mask = 1.0 - mask
|
||||
mask = utils.common_upscale(mask.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")
|
||||
if mask.shape[-3] < noise.shape[-3]:
|
||||
mask = torch.nn.functional.pad(mask, (0, 0, 0, 0, 0, noise.shape[-3] - mask.shape[-3]), mode='constant', value=0)
|
||||
mask = utils.resize_to_batch_size(mask, noise.shape[0])
|
||||
|
||||
return torch.cat((image, mask), dim=1)
|
||||
|
||||
def extra_conds(self, **kwargs):
|
||||
out = super().extra_conds(**kwargs)
|
||||
attention_mask = kwargs.get("attention_mask", None)
|
||||
if attention_mask is not None:
|
||||
out['attention_mask'] = conds.CONDRegular(attention_mask)
|
||||
cross_attn = kwargs.get("cross_attn", None)
|
||||
if cross_attn is not None:
|
||||
out['c_crossattn'] = conds.CONDRegular(cross_attn)
|
||||
|
||||
time_dim_replace = kwargs.get("time_dim_replace", None)
|
||||
if time_dim_replace is not None:
|
||||
out['time_dim_replace'] = conds.CONDRegular(self.process_latent_in(time_dim_replace))
|
||||
|
||||
return out
|
||||
|
||||
|
||||
class Kandinsky5Image(Kandinsky5):
|
||||
def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
|
||||
super().__init__(model_config, model_type, device=device)
|
||||
|
||||
def concat_cond(self, **kwargs):
|
||||
return None
|
||||
|
||||
@ -8,6 +8,7 @@ import torch
|
||||
from . import supported_models, utils
|
||||
from . import supported_models_base
|
||||
from .gguf import GGMLOps
|
||||
from .utils import detect_layer_quantization
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -180,30 +181,73 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
|
||||
|
||||
guidance_keys = list(filter(lambda a: a.startswith("{}guidance_in.".format(key_prefix)), state_dict_keys))
|
||||
dit_config["guidance_embed"] = len(guidance_keys) > 0
|
||||
|
||||
# HunyuanVideo 1.5
|
||||
if '{}cond_type_embedding.weight'.format(key_prefix) in state_dict_keys:
|
||||
dit_config["use_cond_type_embedding"] = True
|
||||
else:
|
||||
dit_config["use_cond_type_embedding"] = False
|
||||
if '{}vision_in.proj.0.weight'.format(key_prefix) in state_dict_keys:
|
||||
dit_config["vision_in_dim"] = state_dict['{}vision_in.proj.0.weight'.format(key_prefix)].shape[0]
|
||||
dit_config["meanflow_sum"] = True
|
||||
else:
|
||||
dit_config["vision_in_dim"] = None
|
||||
dit_config["meanflow_sum"] = False
|
||||
return dit_config
|
||||
|
||||
if '{}double_blocks.0.img_attn.norm.key_norm.scale'.format(key_prefix) in state_dict_keys and ('{}img_in.weight'.format(key_prefix) in state_dict_keys or f"{key_prefix}distilled_guidance_layer.norms.0.scale" in state_dict_keys): # Flux, Chroma or Chroma Radiance (has no img_in.weight)
|
||||
dit_config = {}
|
||||
dit_config["image_model"] = "flux"
|
||||
if '{}double_stream_modulation_img.lin.weight'.format(key_prefix) in state_dict_keys:
|
||||
dit_config["image_model"] = "flux2"
|
||||
dit_config["axes_dim"] = [32, 32, 32, 32]
|
||||
dit_config["num_heads"] = 48
|
||||
dit_config["mlp_ratio"] = 3.0
|
||||
dit_config["theta"] = 2000
|
||||
dit_config["out_channels"] = 128
|
||||
dit_config["global_modulation"] = True
|
||||
dit_config["mlp_silu_act"] = True
|
||||
dit_config["qkv_bias"] = False
|
||||
dit_config["ops_bias"] = False
|
||||
dit_config["default_ref_method"] = "index"
|
||||
dit_config["ref_index_scale"] = 10.0
|
||||
dit_config["txt_ids_dims"] = [3]
|
||||
patch_size = 1
|
||||
else:
|
||||
dit_config["image_model"] = "flux"
|
||||
dit_config["axes_dim"] = [16, 56, 56]
|
||||
dit_config["num_heads"] = 24
|
||||
dit_config["mlp_ratio"] = 4.0
|
||||
dit_config["theta"] = 10000
|
||||
dit_config["out_channels"] = 16
|
||||
dit_config["qkv_bias"] = True
|
||||
dit_config["txt_ids_dims"] = []
|
||||
patch_size = 2
|
||||
|
||||
dit_config["in_channels"] = 16
|
||||
patch_size = 2
|
||||
dit_config["hidden_size"] = 3072
|
||||
dit_config["context_in_dim"] = 4096
|
||||
|
||||
dit_config["patch_size"] = patch_size
|
||||
in_key = "{}img_in.weight".format(key_prefix)
|
||||
if in_key in state_dict_keys:
|
||||
dit_config["in_channels"] = state_dict[in_key].shape[1] // (patch_size * patch_size)
|
||||
dit_config["out_channels"] = 16
|
||||
w = state_dict[in_key]
|
||||
dit_config["in_channels"] = w.shape[1] // (patch_size * patch_size)
|
||||
dit_config["hidden_size"] = w.shape[0]
|
||||
|
||||
txt_in_key = "{}txt_in.weight".format(key_prefix)
|
||||
if txt_in_key in state_dict_keys:
|
||||
w = state_dict[txt_in_key]
|
||||
dit_config["context_in_dim"] = w.shape[1]
|
||||
dit_config["hidden_size"] = w.shape[0]
|
||||
|
||||
vec_in_key = '{}vector_in.in_layer.weight'.format(key_prefix)
|
||||
if vec_in_key in state_dict_keys:
|
||||
dit_config["vec_in_dim"] = state_dict[vec_in_key].shape[1]
|
||||
dit_config["context_in_dim"] = 4096
|
||||
dit_config["hidden_size"] = 3072
|
||||
dit_config["mlp_ratio"] = 4.0
|
||||
dit_config["num_heads"] = 24
|
||||
else:
|
||||
dit_config["vec_in_dim"] = None
|
||||
|
||||
dit_config["depth"] = count_blocks(state_dict_keys, '{}double_blocks.'.format(key_prefix) + '{}.')
|
||||
dit_config["depth_single_blocks"] = count_blocks(state_dict_keys, '{}single_blocks.'.format(key_prefix) + '{}.')
|
||||
dit_config["axes_dim"] = [16, 56, 56]
|
||||
dit_config["theta"] = 10000
|
||||
dit_config["qkv_bias"] = True
|
||||
if '{}distilled_guidance_layer.0.norms.0.scale'.format(key_prefix) in state_dict_keys or '{}distilled_guidance_layer.norms.0.scale'.format(key_prefix) in state_dict_keys: # Chroma
|
||||
dit_config["image_model"] = "chroma"
|
||||
dit_config["in_channels"] = 64
|
||||
@ -224,8 +268,17 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
|
||||
dit_config["nerf_tile_size"] = 512
|
||||
dit_config["nerf_final_head_type"] = "conv" if f"{key_prefix}nerf_final_layer_conv.norm.scale" in state_dict_keys else "linear"
|
||||
dit_config["nerf_embedder_dtype"] = torch.float32
|
||||
if "__x0__" in state_dict_keys: # x0 pred
|
||||
dit_config["use_x0"] = True
|
||||
else:
|
||||
dit_config["use_x0"] = False
|
||||
else:
|
||||
dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys
|
||||
dit_config["yak_mlp"] = '{}double_blocks.0.img_mlp.gate_proj.weight'.format(key_prefix) in state_dict_keys
|
||||
dit_config["txt_norm"] = "{}txt_norm.scale".format(key_prefix) in state_dict_keys
|
||||
if dit_config["yak_mlp"] and dit_config["txt_norm"]: # Ovis model
|
||||
dit_config["txt_ids_dims"] = [1, 2]
|
||||
|
||||
return dit_config
|
||||
|
||||
if '{}t5_yproj.weight'.format(key_prefix) in state_dict_keys: # Genmo mochi preview
|
||||
@ -372,14 +425,34 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
|
||||
dit_config["image_model"] = "lumina2"
|
||||
dit_config["patch_size"] = 2
|
||||
dit_config["in_channels"] = 16
|
||||
dit_config["dim"] = 2304
|
||||
dit_config["cap_feat_dim"] = state_dict['{}cap_embedder.1.weight'.format(key_prefix)].shape[1]
|
||||
w = state_dict['{}cap_embedder.1.weight'.format(key_prefix)]
|
||||
dit_config["dim"] = w.shape[0]
|
||||
dit_config["cap_feat_dim"] = w.shape[1]
|
||||
dit_config["n_layers"] = count_blocks(state_dict_keys, '{}layers.'.format(key_prefix) + '{}.')
|
||||
dit_config["n_heads"] = 24
|
||||
dit_config["n_kv_heads"] = 8
|
||||
dit_config["qk_norm"] = True
|
||||
dit_config["axes_dims"] = [32, 32, 32]
|
||||
dit_config["axes_lens"] = [300, 512, 512]
|
||||
|
||||
if dit_config["dim"] == 2304: # Original Lumina 2
|
||||
dit_config["n_heads"] = 24
|
||||
dit_config["n_kv_heads"] = 8
|
||||
dit_config["axes_dims"] = [32, 32, 32]
|
||||
dit_config["axes_lens"] = [300, 512, 512]
|
||||
dit_config["rope_theta"] = 10000.0
|
||||
dit_config["ffn_dim_multiplier"] = 4.0
|
||||
ctd_weight = state_dict.get('{}clip_text_pooled_proj.0.weight'.format(key_prefix), None)
|
||||
if ctd_weight is not None:
|
||||
dit_config["clip_text_dim"] = ctd_weight.shape[0]
|
||||
elif dit_config["dim"] == 3840: # Z image
|
||||
dit_config["n_heads"] = 30
|
||||
dit_config["n_kv_heads"] = 30
|
||||
dit_config["axes_dims"] = [32, 48, 48]
|
||||
dit_config["axes_lens"] = [1536, 512, 512]
|
||||
dit_config["rope_theta"] = 256.0
|
||||
dit_config["ffn_dim_multiplier"] = (8.0 / 3.0)
|
||||
dit_config["z_image_modulation"] = True
|
||||
dit_config["time_scale"] = 1000.0
|
||||
if '{}cap_pad_token'.format(key_prefix) in state_dict_keys:
|
||||
dit_config["pad_tokens_multiple"] = 32
|
||||
|
||||
return dit_config
|
||||
|
||||
if '{}head.modulation'.format(key_prefix) in state_dict_keys: # Wan 2.1
|
||||
@ -556,6 +629,24 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
|
||||
dit_config["num_layers"] = count_blocks(state_dict_keys, '{}transformer_blocks.'.format(key_prefix) + '{}.')
|
||||
return dit_config
|
||||
|
||||
if '{}visual_transformer_blocks.0.cross_attention.key_norm.weight'.format(key_prefix) in state_dict_keys: # Kandinsky 5
|
||||
dit_config = {}
|
||||
model_dim = state_dict['{}visual_embeddings.in_layer.bias'.format(key_prefix)].shape[0]
|
||||
dit_config["model_dim"] = model_dim
|
||||
if model_dim in [4096, 2560]: # pro video and lite image
|
||||
dit_config["axes_dims"] = (32, 48, 48)
|
||||
if model_dim == 2560: # lite image
|
||||
dit_config["rope_scale_factor"] = (1.0, 1.0, 1.0)
|
||||
elif model_dim == 1792: # lite video
|
||||
dit_config["axes_dims"] = (16, 24, 24)
|
||||
dit_config["time_dim"] = state_dict['{}time_embeddings.in_layer.bias'.format(key_prefix)].shape[0]
|
||||
dit_config["image_model"] = "kandinsky5"
|
||||
dit_config["ff_dim"] = state_dict['{}visual_transformer_blocks.0.feed_forward.in_layer.weight'.format(key_prefix)].shape[0]
|
||||
dit_config["visual_embed_dim"] = state_dict['{}visual_embeddings.in_layer.weight'.format(key_prefix)].shape[1]
|
||||
dit_config["num_text_blocks"] = count_blocks(state_dict_keys, '{}text_transformer_blocks.'.format(key_prefix) + '{}.')
|
||||
dit_config["num_visual_blocks"] = count_blocks(state_dict_keys, '{}visual_transformer_blocks.'.format(key_prefix) + '{}.')
|
||||
return dit_config
|
||||
|
||||
if '{}input_blocks.0.0.weight'.format(key_prefix) not in state_dict_keys:
|
||||
return None
|
||||
|
||||
@ -699,16 +790,11 @@ def model_config_from_unet(state_dict, unet_key_prefix, use_base_if_no_match=Fal
|
||||
if model_config is None and use_base_if_no_match:
|
||||
model_config = supported_models_base.BASE(unet_config)
|
||||
|
||||
scaled_fp8_key = "{}scaled_fp8".format(unet_key_prefix)
|
||||
if scaled_fp8_key in state_dict:
|
||||
scaled_fp8_weight = state_dict.pop(scaled_fp8_key)
|
||||
model_config.scaled_fp8 = scaled_fp8_weight.dtype
|
||||
if model_config.scaled_fp8 == torch.float32:
|
||||
model_config.scaled_fp8 = torch.float8_e4m3fn
|
||||
if scaled_fp8_weight.nelement() == 2:
|
||||
model_config.optimizations["fp8"] = False
|
||||
else:
|
||||
model_config.optimizations["fp8"] = True
|
||||
# Detect per-layer quantization (mixed precision)
|
||||
quant_config = detect_layer_quantization(state_dict, unet_key_prefix)
|
||||
if quant_config:
|
||||
model_config.quant_config = quant_config
|
||||
logger.debug("Detected mixed precision quantization")
|
||||
|
||||
if metadata is not None and "format" in metadata and metadata["format"] == "gguf":
|
||||
model_config.custom_operations = GGMLOps
|
||||
|
||||
@ -14,6 +14,7 @@ from pathlib import Path
|
||||
from typing import List, Optional, Final, Set
|
||||
|
||||
import requests
|
||||
import requests_cache
|
||||
import tqdm
|
||||
from huggingface_hub import dump_environment_info, hf_hub_download, scan_cache_dir, snapshot_download, HfFileSystem, CacheNotFound
|
||||
from huggingface_hub.utils import GatedRepoError, LocalEntryNotFoundError
|
||||
@ -137,37 +138,39 @@ def get_or_download(folder_name: str, filename: str, known_files: Optional[List[
|
||||
path = None
|
||||
|
||||
cache_hit = False
|
||||
try:
|
||||
# always retrieve this from the cache if it already exists there
|
||||
path = hf_hub_download(repo_id=known_file.repo_id,
|
||||
filename=known_file.filename,
|
||||
repo_type=known_file.repo_type,
|
||||
revision=known_file.revision,
|
||||
local_files_only=True,
|
||||
local_dir=hf_destination_dir if args.force_hf_local_dir_mode else None,
|
||||
)
|
||||
logger.debug(f"hf_hub_download cache hit for {known_file.repo_id}/{known_file.filename}")
|
||||
cache_hit = True
|
||||
except LocalEntryNotFoundError:
|
||||
hf_hub_download_kwargs = dict(repo_id=known_file.repo_id,
|
||||
filename=known_file.filename,
|
||||
repo_type=known_file.repo_type,
|
||||
revision=known_file.revision,
|
||||
local_files_only=True,
|
||||
local_dir=hf_destination_dir if args.force_hf_local_dir_mode else None,
|
||||
token=True,
|
||||
)
|
||||
|
||||
with requests_cache.disabled():
|
||||
try:
|
||||
logger.debug(f"{folder_name}/{filename} is being downloaded from {known_file.repo_id}/{known_file.filename} candidate_str_match={candidate_str_match} candidate_filename_match={candidate_filename_match} candidate_alternate_filenames_match={candidate_alternate_filenames_match} candidate_save_filename_match={candidate_save_filename_match}")
|
||||
path = hf_hub_download(repo_id=known_file.repo_id,
|
||||
filename=known_file.filename,
|
||||
repo_type=known_file.repo_type,
|
||||
revision=known_file.revision,
|
||||
local_dir=hf_destination_dir if args.force_hf_local_dir_mode else None,
|
||||
)
|
||||
except requests.exceptions.HTTPError as exc_info:
|
||||
if exc_info.response.status_code == 401:
|
||||
raise GatedRepoError(f"{known_file.repo_id}/{known_file.filename}", response=exc_info.response)
|
||||
except IOError as exc_info:
|
||||
logger.error(f"cannot reach huggingface {known_file.repo_id}/{known_file.filename}", exc_info=exc_info)
|
||||
except Exception as exc_info:
|
||||
logger.error(f"an exception occurred while downloading {known_file.repo_id}/{known_file.filename}", exc_info=exc_info)
|
||||
dump_environment_info()
|
||||
for key, value in os.environ.items():
|
||||
if key.startswith("HF_XET"):
|
||||
print(f"{key}={value}", file=sys.stderr)
|
||||
# always retrieve this from the cache if it already exists there
|
||||
path = hf_hub_download(**hf_hub_download_kwargs)
|
||||
logger.debug(f"hf_hub_download cache hit for {known_file.repo_id}/{known_file.filename}")
|
||||
cache_hit = True
|
||||
except LocalEntryNotFoundError:
|
||||
try:
|
||||
logger.debug(f"{folder_name}/{filename} is being downloaded from {known_file.repo_id}/{known_file.filename} candidate_str_match={candidate_str_match} candidate_filename_match={candidate_filename_match} candidate_alternate_filenames_match={candidate_alternate_filenames_match} candidate_save_filename_match={candidate_save_filename_match}")
|
||||
hf_hub_download_kwargs.pop("local_files_only")
|
||||
path = hf_hub_download(**hf_hub_download_kwargs)
|
||||
except requests.exceptions.HTTPError as exc_info:
|
||||
if exc_info.response.status_code == 401:
|
||||
raise GatedRepoError(f"{known_file.repo_id}/{known_file.filename}", response=exc_info.response)
|
||||
except IOError as exc_info:
|
||||
logger.error(f"cannot reach huggingface {known_file.repo_id}/{known_file.filename}", exc_info=exc_info)
|
||||
except Exception as exc_info:
|
||||
logger.error(f"an exception occurred while downloading {known_file.repo_id}/{known_file.filename}. hf_hub_download kwargs={hf_hub_download_kwargs}", exc_info=exc_info)
|
||||
dump_environment_info()
|
||||
for key, value in os.environ.items():
|
||||
if key.startswith("HF_"):
|
||||
if key == "HF_TOKEN":
|
||||
value = "*****"
|
||||
print(f"{key}={value}", file=sys.stderr)
|
||||
|
||||
if path is not None and known_file.convert_to_16_bit and file_size is not None and file_size != 0:
|
||||
tensors = {}
|
||||
@ -546,6 +549,7 @@ KNOWN_APPROX_VAES: Final[KnownDownloadables] = KnownDownloadables([
|
||||
UrlFile("https://raw.githubusercontent.com/madebyollin/taesd/main/taesd3_decoder.pth", show_in_ui=False),
|
||||
UrlFile("https://raw.githubusercontent.com/madebyollin/taesd/main/taef1_encoder.pth", show_in_ui=False),
|
||||
UrlFile("https://raw.githubusercontent.com/madebyollin/taesd/main/taef1_decoder.pth", show_in_ui=False),
|
||||
# todo: update this with the video VAEs
|
||||
], folder_name="vae_approx")
|
||||
|
||||
KNOWN_VAES: Final[KnownDownloadables] = KnownDownloadables([
|
||||
@ -560,6 +564,13 @@ KNOWN_VAES: Final[KnownDownloadables] = KnownDownloadables([
|
||||
HuggingFile("Comfy-Org/Wan_2.1_ComfyUI_repackaged", "split_files/vae/wan_2.1_vae.safetensors"),
|
||||
HuggingFile("Comfy-Org/Wan_2.2_ComfyUI_Repackaged", "split_files/vae/wan2.2_vae.safetensors"),
|
||||
HuggingFile("Comfy-Org/Qwen-Image_ComfyUI", "split_files/vae/qwen_image_vae.safetensors"),
|
||||
# Flux 2
|
||||
HuggingFile("Comfy-Org/flux2-dev", "split_files/vae/flux2-vae.safetensors"),
|
||||
# Z Image Turbo
|
||||
HuggingFile("Comfy-Org/z_image_turbo", "split_files/vae/ae.safetensors", save_with_filename="z_image_turbo_vae.safetensors"),
|
||||
# Hunyuan Image
|
||||
HuggingFile("Comfy-Org/HunyuanImage_2.1_ComfyUI", "split_files/vae/hunyuan_image_2.1_vae_fp16.safetensors"),
|
||||
HuggingFile("Comfy-Org/HunyuanImage_2.1_ComfyUI", "split_files/vae/hunyuan_image_refiner_vae_fp16.safetensors"),
|
||||
], folder_name="vae")
|
||||
|
||||
KNOWN_HUGGINGFACE_MODEL_REPOS: Final[Set[str]] = {
|
||||
@ -644,8 +655,18 @@ KNOWN_UNET_MODELS: Final[KnownDownloadables] = KnownDownloadables([
|
||||
HuggingFile("Comfy-Org/Qwen-Image-Edit_ComfyUI", "split_files/diffusion_models/qwen_image_edit_2509_fp8_e4m3fn.safetensors"),
|
||||
HuggingFile("Comfy-Org/Qwen-Image-Edit_ComfyUI", "split_files/diffusion_models/qwen_image_edit_bf16.safetensors"),
|
||||
HuggingFile("Comfy-Org/Qwen-Image-Edit_ComfyUI", "split_files/diffusion_models/qwen_image_edit_fp8_e4m3fn.safetensors"),
|
||||
# Flux 2
|
||||
HuggingFile("Comfy-Org/flux2-dev", "split_files/diffusion_models/flux2_dev_fp8mixed.safetensors"),
|
||||
# Z Image Turbo
|
||||
HuggingFile("Comfy-Org/z_image_turbo", "split_files/diffusion_models/z_image_turbo_bf16.safetensors"),
|
||||
# Omnigen 2
|
||||
HuggingFile("Comfy-Org/Omnigen2_ComfyUI_repackaged", "split_files/diffusion_models/omnigen2_fp16.safetensors"),
|
||||
# Hunyuan Image
|
||||
HuggingFile("Comfy-Org/HunyuanImage_2.1_ComfyUI", "split_files/diffusion_models/hunyuanimage2.1_bf16.safetensors"),
|
||||
HuggingFile("Comfy-Org/HunyuanImage_2.1_ComfyUI", "split_files/diffusion_models/hunyuanimage2.1_refiner_bf16.safetensors"),
|
||||
# Ovis
|
||||
HuggingFile("Comfy-Org/Ovis-Image", "split_files/diffusion_models/ovis_image_bf16.safetensors"),
|
||||
], folder_names=["diffusion_models", "unet"])
|
||||
|
||||
KNOWN_CLIP_MODELS: Final[KnownDownloadables] = KnownDownloadables([
|
||||
# todo: is this correct?
|
||||
HuggingFile("comfyanonymous/flux_text_encoders", "t5xxl_fp16.safetensors"),
|
||||
@ -668,6 +689,16 @@ KNOWN_CLIP_MODELS: Final[KnownDownloadables] = KnownDownloadables([
|
||||
HuggingFile("Comfy-Org/HiDream-I1_ComfyUI", "split_files/text_encoders/llama_3.1_8b_instruct_fp8_scaled.safetensors"),
|
||||
HuggingFile("Comfy-Org/Qwen-Image_ComfyUI", "split_files/text_encoders/qwen_2.5_vl_7b.safetensors"),
|
||||
HuggingFile("Comfy-Org/Qwen-Image_ComfyUI", "split_files/text_encoders/qwen_2.5_vl_7b_fp8_scaled.safetensors"),
|
||||
# Flux 2
|
||||
HuggingFile("Comfy-Org/flux2-dev", "split_files/text_encoders/mistral_3_small_flux2_fp8.safetensors"),
|
||||
HuggingFile("Comfy-Org/flux2-dev", "split_files/text_encoders/mistral_3_small_flux2_bf16.safetensors"),
|
||||
# Z Image Turbo
|
||||
HuggingFile("Comfy-Org/z_image_turbo", "split_files/text_encoders/qwen_3_4b.safetensors"),
|
||||
# Omnigen 2
|
||||
HuggingFile("Comfy-Org/Omnigen2_ComfyUI_repackaged", "split_files/text_encoders/qwen_2.5_vl_fp16.safetensors"),
|
||||
# Hunyuan Image
|
||||
HuggingFile("Comfy-Org/HunyuanImage_2.1_ComfyUI", "split_files/text_encoders/byt5_small_glyphxl_fp16.safetensors"),
|
||||
HuggingFile("Comfy-Org/HunyuanImage_2.1_ComfyUI", "split_files/text_encoders/qwen_2.5_vl_7b.safetensors"),
|
||||
], folder_names=["clip", "text_encoders"])
|
||||
|
||||
KNOWN_STYLE_MODELS: Final[KnownDownloadables] = KnownDownloadables([
|
||||
|
||||
@ -1,21 +1,18 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import collections
|
||||
import dataclasses
|
||||
import functools
|
||||
from os.path import split
|
||||
from pathlib import PurePosixPath
|
||||
from typing import Optional, List, Sequence, Union, Iterable, Protocol
|
||||
from typing import Optional, List, Sequence, Union, Iterable
|
||||
|
||||
from can_ada import parse, URL # pylint: disable=no-name-in-module
|
||||
from typing_extensions import TypedDict, NotRequired, runtime_checkable
|
||||
from typing_extensions import TypedDict, NotRequired
|
||||
|
||||
from .component_model.executor_types import ComboOptions
|
||||
from .component_model.files import canonicalize_path
|
||||
|
||||
|
||||
|
||||
|
||||
@dataclasses.dataclass(frozen=True)
|
||||
class UrlFile:
|
||||
_url: str
|
||||
|
||||
@ -114,6 +114,7 @@ if args.deterministic:
|
||||
|
||||
directml_device = None
|
||||
if args.directml is not None:
|
||||
logger.warning("WARNING: torch-directml barely works, is very slow, has not been updated in over 1 year and might be removed soon, please don't use it, there are better options.")
|
||||
import torch_directml # pylint: disable=import-error
|
||||
|
||||
device_index = args.directml
|
||||
@ -380,15 +381,20 @@ except:
|
||||
pass
|
||||
|
||||
SUPPORT_FP8_OPS = args.supports_fp8_compute
|
||||
|
||||
AMD_RDNA2_AND_OLDER_ARCH = ["gfx1030", "gfx1031", "gfx1010", "gfx1011", "gfx1012", "gfx906", "gfx900", "gfx803"]
|
||||
|
||||
try:
|
||||
if is_amd():
|
||||
torch.backends.cudnn.enabled = False # Seems to improve things a lot on AMD
|
||||
logger.info("Set: torch.backends.cudnn.enabled = False for better AMD performance.")
|
||||
arch = torch.cuda.get_device_properties(get_torch_device()).gcnArchName
|
||||
if not (any((a in arch) for a in AMD_RDNA2_AND_OLDER_ARCH)):
|
||||
torch.backends.cudnn.enabled = False # Seems to improve things a lot on AMD
|
||||
logger.info("Set: torch.backends.cudnn.enabled = False for better AMD performance.")
|
||||
try:
|
||||
rocm_version = tuple(map(int, str(torch.version.hip).split(".")[:2]))
|
||||
except:
|
||||
rocm_version = (6, -1)
|
||||
arch = torch.cuda.get_device_properties(get_torch_device()).gcnArchName
|
||||
|
||||
logger.debug("AMD arch: {}".format(arch))
|
||||
logger.debug("ROCm version: {}".format(rocm_version))
|
||||
if args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
|
||||
@ -557,6 +563,7 @@ class LoadedModel:
|
||||
if use_more_vram == 0:
|
||||
use_more_vram = 1e32
|
||||
self.model_use_more_vram(use_more_vram, force_patch_weights=force_patch_weights)
|
||||
|
||||
real_model = self.model.model
|
||||
|
||||
if is_intel_xpu() and not args.disable_ipex_optimize and 'ipex' in globals() and real_model is not None:
|
||||
@ -811,8 +818,11 @@ def _load_models_gpu(models: Sequence[ModelManageable], memory_required: int = 0
|
||||
loaded_memory = loaded_model.model_loaded_memory()
|
||||
current_free_mem = get_free_memory(torch_dev) + loaded_memory
|
||||
|
||||
lowvram_model_memory = max(128 * 1024 * 1024, (current_free_mem - minimum_memory_required), min(current_free_mem * MIN_WEIGHT_MEMORY_RATIO, current_free_mem - minimum_inference_memory()))
|
||||
lowvram_model_memory = max(0.1, lowvram_model_memory - loaded_memory)
|
||||
lowvram_model_memory = max(0, (current_free_mem - minimum_memory_required), min(current_free_mem * MIN_WEIGHT_MEMORY_RATIO, current_free_mem - minimum_inference_memory()))
|
||||
lowvram_model_memory = lowvram_model_memory - loaded_memory
|
||||
|
||||
if lowvram_model_memory == 0:
|
||||
lowvram_model_memory = 0.1
|
||||
|
||||
if vram_set_state == VRAMState.NO_VRAM:
|
||||
lowvram_model_memory = 0.1
|
||||
@ -1149,13 +1159,6 @@ def device_supports_non_blocking(device):
|
||||
return True
|
||||
|
||||
|
||||
def device_should_use_non_blocking(device):
|
||||
if not device_supports_non_blocking(device):
|
||||
return False
|
||||
return False
|
||||
# return True #TODO: figure out why this causes memory issues on Nvidia and possibly others
|
||||
|
||||
|
||||
def force_channels_last():
|
||||
if args.force_channels_last:
|
||||
return True
|
||||
@ -1165,57 +1168,77 @@ def force_channels_last():
|
||||
|
||||
|
||||
STREAMS = {}
|
||||
NUM_STREAMS = 1
|
||||
if args.async_offload:
|
||||
NUM_STREAMS = 2
|
||||
NUM_STREAMS = 0
|
||||
if args.async_offload is not None:
|
||||
NUM_STREAMS = args.async_offload
|
||||
else:
|
||||
# Enable by default on Nvidia
|
||||
if is_nvidia():
|
||||
NUM_STREAMS = 2
|
||||
|
||||
if args.disable_async_offload:
|
||||
NUM_STREAMS = 0
|
||||
|
||||
if NUM_STREAMS > 0:
|
||||
logger.debug("Using async weight offloading with {} streams".format(NUM_STREAMS))
|
||||
|
||||
|
||||
def current_stream(device):
|
||||
if device is None:
|
||||
return None
|
||||
if is_device_cuda(device):
|
||||
return torch.cuda.current_stream()
|
||||
elif is_device_xpu(device):
|
||||
return torch.xpu.current_stream()
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
stream_counters = {}
|
||||
|
||||
|
||||
def get_offload_stream(device):
|
||||
stream_counter = stream_counters.get(device, 0)
|
||||
if NUM_STREAMS <= 1:
|
||||
if NUM_STREAMS == 0:
|
||||
return None
|
||||
|
||||
if torch.compiler.is_compiling():
|
||||
return None
|
||||
|
||||
if device in STREAMS:
|
||||
ss = STREAMS[device]
|
||||
s = ss[stream_counter]
|
||||
# Sync the oldest stream in the queue with the current
|
||||
ss[stream_counter].wait_stream(current_stream(device))
|
||||
stream_counter = (stream_counter + 1) % len(ss)
|
||||
if is_device_cuda(device):
|
||||
ss[stream_counter].wait_stream(torch.cuda.current_stream())
|
||||
elif is_device_xpu(device):
|
||||
ss[stream_counter].wait_stream(torch.xpu.current_stream())
|
||||
stream_counters[device] = stream_counter
|
||||
return s
|
||||
return ss[stream_counter]
|
||||
elif is_device_cuda(device):
|
||||
ss = []
|
||||
for k in range(NUM_STREAMS):
|
||||
ss.append(torch.cuda.Stream(device=device, priority=0))
|
||||
s1 = torch.cuda.Stream(device=device, priority=0)
|
||||
s1.as_context = torch.cuda.stream
|
||||
ss.append(s1)
|
||||
STREAMS[device] = ss
|
||||
s = ss[stream_counter]
|
||||
stream_counter = (stream_counter + 1) % len(ss)
|
||||
stream_counters[device] = stream_counter
|
||||
return s
|
||||
elif is_device_xpu(device):
|
||||
ss = []
|
||||
for k in range(NUM_STREAMS):
|
||||
ss.append(torch.xpu.Stream(device=device, priority=0))
|
||||
s1 = torch.xpu.Stream(device=device, priority=0)
|
||||
s1.as_context = torch.xpu.stream
|
||||
ss.append(s1)
|
||||
STREAMS[device] = ss
|
||||
s = ss[stream_counter]
|
||||
stream_counter = (stream_counter + 1) % len(ss)
|
||||
stream_counters[device] = stream_counter
|
||||
return s
|
||||
return None
|
||||
|
||||
|
||||
def sync_stream(device, stream):
|
||||
if stream is None:
|
||||
if stream is None or current_stream(device) is None:
|
||||
return
|
||||
if is_device_cuda(device):
|
||||
torch.cuda.current_stream().wait_stream(stream)
|
||||
elif is_device_xpu(device):
|
||||
torch.xpu.current_stream().wait_stream(stream)
|
||||
current_stream(device).wait_stream(stream)
|
||||
|
||||
|
||||
def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, stream=None):
|
||||
@ -1224,12 +1247,18 @@ def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, str
|
||||
if dtype is None or weight.dtype == dtype:
|
||||
return weight
|
||||
if stream is not None:
|
||||
with stream:
|
||||
wf_context = stream
|
||||
if hasattr(wf_context, "as_context"):
|
||||
wf_context = wf_context.as_context(stream)
|
||||
with wf_context:
|
||||
return weight.to(dtype=dtype, copy=copy)
|
||||
return weight.to(dtype=dtype, copy=copy)
|
||||
|
||||
if stream is not None:
|
||||
with stream:
|
||||
wf_context = stream
|
||||
if hasattr(wf_context, "as_context"):
|
||||
wf_context = wf_context.as_context(stream)
|
||||
with wf_context:
|
||||
r = torch.empty_like(weight, dtype=dtype, device=device)
|
||||
r.copy_(weight, non_blocking=non_blocking)
|
||||
else:
|
||||
@ -1243,6 +1272,85 @@ def cast_to_device(tensor, device, dtype, copy=False):
|
||||
return cast_to(tensor, dtype=dtype, device=device, non_blocking=non_blocking, copy=copy)
|
||||
|
||||
|
||||
PINNED_MEMORY = {}
|
||||
TOTAL_PINNED_MEMORY = 0
|
||||
MAX_PINNED_MEMORY = -1
|
||||
if not args.disable_pinned_memory:
|
||||
if is_nvidia() or is_amd():
|
||||
if WINDOWS:
|
||||
MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.45 # Windows limit is apparently 50%
|
||||
else:
|
||||
MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.95
|
||||
logger.debug("Enabled pinned memory {}".format(MAX_PINNED_MEMORY // (1024 * 1024)))
|
||||
|
||||
PINNING_ALLOWED_TYPES = set(["Parameter", "QuantizedTensor"])
|
||||
|
||||
|
||||
def pin_memory(tensor):
|
||||
global TOTAL_PINNED_MEMORY
|
||||
if MAX_PINNED_MEMORY <= 0:
|
||||
return False
|
||||
|
||||
if type(tensor).__name__ not in PINNING_ALLOWED_TYPES:
|
||||
return False
|
||||
|
||||
if not is_device_cpu(tensor.device):
|
||||
return False
|
||||
|
||||
if tensor.is_pinned():
|
||||
# NOTE: Cuda does detect when a tensor is already pinned and would
|
||||
# error below, but there are proven cases where this also queues an error
|
||||
# on the GPU async. So dont trust the CUDA API and guard here
|
||||
return False
|
||||
|
||||
if not tensor.is_contiguous():
|
||||
return False
|
||||
|
||||
size = tensor.numel() * tensor.element_size()
|
||||
if (TOTAL_PINNED_MEMORY + size) > MAX_PINNED_MEMORY:
|
||||
return False
|
||||
|
||||
ptr = tensor.data_ptr()
|
||||
if ptr == 0:
|
||||
return False
|
||||
|
||||
if torch.cuda.cudart().cudaHostRegister(ptr, size, 1) == 0:
|
||||
PINNED_MEMORY[ptr] = size
|
||||
TOTAL_PINNED_MEMORY += size
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def unpin_memory(tensor):
|
||||
global TOTAL_PINNED_MEMORY
|
||||
if MAX_PINNED_MEMORY <= 0:
|
||||
return False
|
||||
|
||||
if not is_device_cpu(tensor.device):
|
||||
return False
|
||||
|
||||
ptr = tensor.data_ptr()
|
||||
size = tensor.numel() * tensor.element_size()
|
||||
|
||||
size_stored = PINNED_MEMORY.get(ptr, None)
|
||||
if size_stored is None:
|
||||
logger.warning("Tried to unpin tensor not pinned by ComfyUI")
|
||||
return False
|
||||
|
||||
if size != size_stored:
|
||||
logger.warning("Size of pinned tensor changed")
|
||||
return False
|
||||
|
||||
if torch.cuda.cudart().cudaHostUnregister(ptr) == 0:
|
||||
TOTAL_PINNED_MEMORY -= PINNED_MEMORY.pop(ptr)
|
||||
if len(PINNED_MEMORY) == 0:
|
||||
TOTAL_PINNED_MEMORY = 0
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def sage_attention_enabled():
|
||||
return args.use_sage_attention
|
||||
|
||||
@ -1531,7 +1639,7 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma
|
||||
|
||||
if is_amd():
|
||||
arch = torch.cuda.get_device_properties(device).gcnArchName
|
||||
if any((a in arch) for a in ["gfx1030", "gfx1031", "gfx1010", "gfx1011", "gfx1012", "gfx906", "gfx900", "gfx803"]): # RDNA2 and older don't support bf16
|
||||
if any((a in arch) for a in AMD_RDNA2_AND_OLDER_ARCH): # RDNA2 and older don't support bf16
|
||||
if manual_cast:
|
||||
return True
|
||||
return False
|
||||
@ -1607,6 +1715,23 @@ def extended_fp16_support():
|
||||
return True
|
||||
|
||||
|
||||
LORA_COMPUTE_DTYPES = {}
|
||||
|
||||
|
||||
def lora_compute_dtype(device):
|
||||
dtype = LORA_COMPUTE_DTYPES.get(device, None)
|
||||
if dtype is not None:
|
||||
return dtype
|
||||
|
||||
if should_use_fp16(device):
|
||||
dtype = torch.float16
|
||||
else:
|
||||
dtype = torch.float32
|
||||
|
||||
LORA_COMPUTE_DTYPES[device] = dtype
|
||||
return dtype
|
||||
|
||||
|
||||
def soft_empty_cache(force=False):
|
||||
with model_management_lock:
|
||||
_soft_empty_cache(force=force)
|
||||
|
||||
@ -3,7 +3,8 @@ from __future__ import annotations
|
||||
import copy
|
||||
import dataclasses
|
||||
from abc import ABCMeta, abstractmethod
|
||||
from typing import Protocol, Optional, TypeVar, runtime_checkable, Callable, Any, NamedTuple, TYPE_CHECKING
|
||||
import weakref
|
||||
from typing import Any, Callable, Protocol, runtime_checkable, Optional, TypeVar, NamedTuple, TYPE_CHECKING
|
||||
|
||||
import torch
|
||||
import torch.nn
|
||||
@ -26,8 +27,8 @@ class DeviceSettable(Protocol):
|
||||
|
||||
@runtime_checkable
|
||||
class HooksSupport(Protocol):
|
||||
wrappers: dict[str, dict[str, list[Callable]]]
|
||||
callbacks: dict[str, dict[str, list[Callable]]]
|
||||
wrappers: dict[str, list[Callable]]
|
||||
callbacks: dict[str, list[Callable]]
|
||||
hook_mode: "EnumHookMode"
|
||||
|
||||
def prepare_hook_patches_current_keyframe(self, t, hook_group, model_options): ...
|
||||
@ -52,6 +53,8 @@ class HooksSupport(Protocol):
|
||||
|
||||
def add_wrapper_with_key(self, wrapper_type: str, key: str, wrapper: Callable): ...
|
||||
|
||||
def remove_wrappers_with_key(self, wrapper_type: str, key: str) -> list: ...
|
||||
|
||||
|
||||
class HooksSupportStub(HooksSupport, metaclass=ABCMeta):
|
||||
def prepare_hook_patches_current_keyframe(self, t, hook_group, model_options):
|
||||
@ -79,7 +82,7 @@ class HooksSupportStub(HooksSupport, metaclass=ABCMeta):
|
||||
return
|
||||
|
||||
@property
|
||||
def wrappers(self):
|
||||
def wrappers(self) -> dict:
|
||||
if not hasattr(self, "_wrappers"):
|
||||
setattr(self, "_wrappers", {})
|
||||
return getattr(self, "_wrappers")
|
||||
@ -128,6 +131,11 @@ class HooksSupportStub(HooksSupport, metaclass=ABCMeta):
|
||||
w = self.wrappers.setdefault(wrapper_type, {}).setdefault(key, [])
|
||||
w.append(wrapper)
|
||||
|
||||
def remove_wrappers_with_key(self, wrapper_type: str, key: str) -> list:
|
||||
w = self.wrappers.get(wrapper_type, {}).get(key, [])
|
||||
del self.wrappers[wrapper_type][key]
|
||||
return w
|
||||
|
||||
|
||||
@runtime_checkable
|
||||
class TrainingSupport(Protocol):
|
||||
@ -344,14 +352,19 @@ class ModelManageableStub(HooksSupportStub, TrainingSupportStub, ModelManageable
|
||||
return copy.copy(self)
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class MemoryMeasurements:
|
||||
model: torch.nn.Module | DeviceSettable
|
||||
model_loaded_weight_memory: int = 0
|
||||
lowvram_patch_counter: int = 0
|
||||
model_lowvram: bool = False
|
||||
current_weight_patches_uuid: Any = None
|
||||
_device: torch.device | None = None
|
||||
def __init__(self, model):
|
||||
self.model_loaded_weight_memory: int = 0
|
||||
self.lowvram_patch_counter: int = 0
|
||||
self.model_lowvram: bool = False
|
||||
self.current_weight_patches_uuid: Any = None
|
||||
self._device: torch.device | None = None
|
||||
self.model_offload_buffer_memory = None
|
||||
self._model_ref = weakref.ref(model)
|
||||
|
||||
@property
|
||||
def model(self):
|
||||
return self._model_ref()
|
||||
|
||||
@property
|
||||
def device(self) -> torch.device:
|
||||
@ -402,6 +415,7 @@ class ModelOptions(TypedDict, total=False):
|
||||
|
||||
|
||||
class LoadingListItem(NamedTuple):
|
||||
module_offload_mem: int
|
||||
module_size: int
|
||||
name: str
|
||||
module: torch.nn.Module
|
||||
|
||||
@ -40,10 +40,13 @@ from .component_model.deprecation import _deprecate_method
|
||||
from .float import stochastic_rounding
|
||||
from .gguf import move_patch_to_device, is_torch_compatible, is_quantized, GGMLOps
|
||||
from .hooks import EnumHookMode, _HookRef, HookGroup, EnumHookType, WeightHook, create_transformer_options_from_hooks
|
||||
from .lora import calculate_weight
|
||||
from .lora_types import PatchDict, PatchDictKey, PatchTuple, PatchWeightTuple, ModelPatchesDictValue, PatchSupport
|
||||
from .model_base import BaseModel
|
||||
from .model_management import lora_compute_dtype
|
||||
from .model_management_types import ModelManageable, MemoryMeasurements, ModelOptions, LatentFormatT, LoadingListItem, TrainingSupport, HooksSupport
|
||||
from .patcher_extension import CallbacksMP, WrappersMP, PatcherInjection
|
||||
from .quant_ops import QuantizedTensor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -144,27 +147,27 @@ class LowVramPatch:
|
||||
def __init__(self, key, patches, convert_func=None, set_func=None):
|
||||
self.key = key
|
||||
self.patches = patches
|
||||
self.convert_func = convert_func
|
||||
self.convert_func = convert_func # TODO: remove
|
||||
self.set_func = set_func
|
||||
|
||||
def __call__(self, weight):
|
||||
intermediate_dtype = weight.dtype
|
||||
if self.convert_func is not None:
|
||||
weight = self.convert_func(weight.to(dtype=torch.float32, copy=True), inplace=True)
|
||||
return calculate_weight(self.patches[self.key], weight, self.key, intermediate_dtype=weight.dtype)
|
||||
|
||||
if intermediate_dtype not in [torch.float32, torch.float16, torch.bfloat16]: # intermediate_dtype has to be one that is supported in math ops
|
||||
intermediate_dtype = torch.float32
|
||||
out = lora.calculate_weight(self.patches[self.key], weight.to(intermediate_dtype), self.key, intermediate_dtype=intermediate_dtype)
|
||||
if self.set_func is None:
|
||||
return stochastic_rounding(out, weight.dtype, seed=string_to_seed(self.key))
|
||||
else:
|
||||
return self.set_func(out, seed=string_to_seed(self.key), return_weight=True)
|
||||
|
||||
out = lora.calculate_weight(self.patches[self.key], weight, self.key, intermediate_dtype=intermediate_dtype)
|
||||
if self.set_func is not None:
|
||||
return self.set_func(out, seed=string_to_seed(self.key), return_weight=True).to(dtype=intermediate_dtype)
|
||||
else:
|
||||
return out
|
||||
LOWVRAM_PATCH_ESTIMATE_MATH_FACTOR = 2
|
||||
|
||||
|
||||
def low_vram_patch_estimate_vram(model, key):
|
||||
weight, set_func, convert_func = get_key_weight(model, key)
|
||||
if weight is None:
|
||||
return 0
|
||||
model_dtype = getattr(model, "manual_cast_dtype", torch.float32)
|
||||
if model_dtype is None:
|
||||
model_dtype = weight.dtype
|
||||
|
||||
# todo: model_dtype is an instance of Elipses, so you really shouldn't do this
|
||||
return weight.numel() * model_dtype.itemsize * LOWVRAM_PATCH_ESTIMATE_MATH_FACTOR
|
||||
|
||||
|
||||
def get_key_weight(model, key):
|
||||
set_func = None
|
||||
@ -253,7 +256,6 @@ class ModelPatcher(ModelManageable, PatchSupport):
|
||||
self.object_patches_backup = {}
|
||||
self.weight_wrapper_patches = {}
|
||||
self._model_options: ModelOptions = {"transformer_options": {}}
|
||||
self.model_size()
|
||||
self.load_device = load_device
|
||||
self.offload_device = offload_device
|
||||
self.weight_inplace_update = weight_inplace_update
|
||||
@ -262,6 +264,7 @@ class ModelPatcher(ModelManageable, PatchSupport):
|
||||
self.patches_uuid: uuid.UUID = uuid.uuid4()
|
||||
self.ckpt_name = ckpt_name
|
||||
self._memory_measurements = MemoryMeasurements(self.model)
|
||||
self.pinned = set()
|
||||
self.attachments: dict[str] = {}
|
||||
self.additional_models: dict[str, list[ModelPatcher]] = {}
|
||||
self.callbacks: dict[str, dict[str, list[Callable]]] = CallbacksMP.init_callbacks()
|
||||
@ -322,18 +325,29 @@ class ModelPatcher(ModelManageable, PatchSupport):
|
||||
def lowvram_patch_counter(self):
|
||||
return self._memory_measurements.lowvram_patch_counter
|
||||
|
||||
@property
|
||||
def model_offload_buffer_memory(self) -> int:
|
||||
return self._memory_measurements.model_offload_buffer_memory
|
||||
|
||||
@model_offload_buffer_memory.setter
|
||||
def model_offload_buffer_memory(self, value):
|
||||
self._memory_measurements.model_offload_buffer_memory = value
|
||||
|
||||
def model_size(self):
|
||||
if self.size > 0:
|
||||
return self.size
|
||||
self.size = model_management.module_size(self.model)
|
||||
return self.size
|
||||
|
||||
def get_ram_usage(self):
|
||||
return self.model_size()
|
||||
|
||||
def loaded_size(self):
|
||||
return self._memory_measurements.model_loaded_weight_memory
|
||||
|
||||
def clone(self):
|
||||
n = self.__class__(self.model, self.load_device, self.offload_device, self.size, weight_inplace_update=self.weight_inplace_update)
|
||||
n._memory_measurements = self._memory_measurements
|
||||
def clone(self) -> "ModelPatcher":
|
||||
n = self.__class__(self.model, self.load_device, self.offload_device, self.model_size(), weight_inplace_update=self.weight_inplace_update)
|
||||
n._memory_measurements = copy.copy(self._memory_measurements)
|
||||
n.ckpt_name = self.ckpt_name
|
||||
n.patches = {}
|
||||
for k in self.patches:
|
||||
@ -346,6 +360,7 @@ class ModelPatcher(ModelManageable, PatchSupport):
|
||||
n.backup = self.backup
|
||||
n.object_patches_backup = self.object_patches_backup
|
||||
n._parent = self
|
||||
n.pinned = self.pinned
|
||||
|
||||
n.force_cast_weights = self.force_cast_weights
|
||||
|
||||
@ -430,8 +445,11 @@ class ModelPatcher(ModelManageable, PatchSupport):
|
||||
return True
|
||||
|
||||
def memory_required(self, input_shape) -> int:
|
||||
assert isinstance(self.model, BaseModel)
|
||||
return self.model.memory_required(input_shape=input_shape)
|
||||
if isinstance(self.model, BaseModel):
|
||||
return self.model.memory_required(input_shape=input_shape)
|
||||
else:
|
||||
# todo: some other heuristic to determine memory required
|
||||
raise ValueError("unexpected call to memory required on object that doesn't have a BaseModel but is using ModelPatcher")
|
||||
|
||||
def set_model_sampler_cfg_function(self, sampler_cfg_function, disable_cfg1_optimization=False):
|
||||
if len(inspect.signature(sampler_cfg_function).parameters) == 3:
|
||||
@ -504,6 +522,18 @@ class ModelPatcher(ModelManageable, PatchSupport):
|
||||
def set_model_post_input_patch(self, patch):
|
||||
self.set_model_patch(patch, "post_input")
|
||||
|
||||
def set_model_rope_options(self, scale_x, shift_x, scale_y, shift_y, scale_t, shift_t, **kwargs):
|
||||
rope_options = self.model_options["transformer_options"].get("rope_options", {})
|
||||
rope_options["scale_x"] = scale_x
|
||||
rope_options["scale_y"] = scale_y
|
||||
rope_options["scale_t"] = scale_t
|
||||
|
||||
rope_options["shift_x"] = shift_x
|
||||
rope_options["shift_y"] = shift_y
|
||||
rope_options["shift_t"] = shift_t
|
||||
|
||||
self.model_options["transformer_options"]["rope_options"] = rope_options
|
||||
|
||||
def add_object_patch(self, name, obj):
|
||||
self.object_patches[name] = obj
|
||||
|
||||
@ -680,16 +710,19 @@ class ModelPatcher(ModelManageable, PatchSupport):
|
||||
utils.copy_to_param(self.model, key, out_weight)
|
||||
else:
|
||||
utils.set_attr_param(self.model, key, out_weight)
|
||||
|
||||
if self.gguf.patch_on_device:
|
||||
return
|
||||
# end gguf
|
||||
|
||||
if key not in self.backup:
|
||||
self.backup[key] = collections.namedtuple('Dimension', ['weight', 'inplace_update'])(weight.to(device=self.offload_device, copy=inplace_update), inplace_update)
|
||||
|
||||
temp_dtype = lora_compute_dtype(device_to)
|
||||
if device_to is not None:
|
||||
temp_weight = model_management.cast_to_device(weight, device_to, torch.float32, copy=True)
|
||||
temp_weight = model_management.cast_to_device(weight, device_to, temp_dtype, copy=True)
|
||||
else:
|
||||
temp_weight = weight.to(torch.float32, copy=True)
|
||||
temp_weight = weight.to(temp_dtype, copy=True)
|
||||
if convert_func is not None:
|
||||
temp_weight = convert_func(temp_weight, inplace=True)
|
||||
|
||||
@ -703,6 +736,27 @@ class ModelPatcher(ModelManageable, PatchSupport):
|
||||
else:
|
||||
set_func(out_weight, inplace_update=inplace_update, seed=string_to_seed(key))
|
||||
|
||||
def pin_weight_to_device(self, key):
|
||||
if self.gguf.loaded_from_gguf and key not in self.patches:
|
||||
weight = utils.get_attr(self.model, key)
|
||||
if is_quantized(weight):
|
||||
# weight.detach_mmap()
|
||||
return
|
||||
|
||||
weight, set_func, convert_func = get_key_weight(self.model, key)
|
||||
if model_management.pin_memory(weight):
|
||||
self.pinned.add(key)
|
||||
|
||||
def unpin_weight(self, key):
|
||||
if key in self.pinned:
|
||||
weight, set_func, convert_func = get_key_weight(self.model, key)
|
||||
model_management.unpin_memory(weight)
|
||||
self.pinned.remove(key)
|
||||
|
||||
def unpin_all_weights(self):
|
||||
for key in list(self.pinned):
|
||||
self.unpin_weight(key)
|
||||
|
||||
def _load_list(self) -> list[LoadingListItem]:
|
||||
loading = []
|
||||
for n, m in self.model.named_modules():
|
||||
@ -715,37 +769,68 @@ class ModelPatcher(ModelManageable, PatchSupport):
|
||||
skip = True # skip random weights in non leaf modules
|
||||
break
|
||||
if not skip and (hasattr(m, "comfy_cast_weights") or len(params) > 0):
|
||||
loading.append(LoadingListItem(model_management.module_size(m), n, m, params))
|
||||
module_mem = model_management.module_size(m)
|
||||
module_offload_mem = module_mem
|
||||
if hasattr(m, "comfy_cast_weights"):
|
||||
def check_module_offload_mem(key):
|
||||
if key in self.patches:
|
||||
return low_vram_patch_estimate_vram(self.model, key)
|
||||
model_dtype = getattr(self.model, "manual_cast_dtype", None)
|
||||
weight, _, _ = get_key_weight(self.model, key)
|
||||
if model_dtype is None or weight is None:
|
||||
return 0
|
||||
if (weight.dtype != model_dtype or isinstance(weight, QuantizedTensor)):
|
||||
return weight.numel() * model_dtype.itemsize
|
||||
return 0
|
||||
|
||||
module_offload_mem += check_module_offload_mem("{}.weight".format(n))
|
||||
module_offload_mem += check_module_offload_mem("{}.bias".format(n))
|
||||
loading.append(LoadingListItem(module_offload_mem, module_mem, n, m, params))
|
||||
return loading
|
||||
|
||||
def load(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False, full_load=False):
|
||||
if self.gguf.loaded_from_gguf:
|
||||
force_patch_weights = True
|
||||
|
||||
if self.gguf.loaded_from_gguf and not self.gguf.mmap_released:
|
||||
for n, m in self.model.named_modules():
|
||||
if hasattr(m, "weight"):
|
||||
if is_quantized(m.weight):
|
||||
pass
|
||||
# m.weight.detach_mmap()
|
||||
self.gguf.mmap_released = True
|
||||
|
||||
with self.use_ejected():
|
||||
self.unpatch_hooks()
|
||||
mem_counter = 0
|
||||
patch_counter = 0
|
||||
lowvram_counter = 0
|
||||
lowvram_mem_counter = 0
|
||||
loading = self._load_list()
|
||||
|
||||
load_completely: list[LoadingListItem] = []
|
||||
offloaded: list[LoadingListItem] = []
|
||||
offload_buffer = 0
|
||||
loading.sort(reverse=True)
|
||||
for x in loading:
|
||||
n = x.name
|
||||
m = x.module
|
||||
params = x.params
|
||||
module_mem = x.module_size
|
||||
for i, x in enumerate(loading):
|
||||
module_offload_mem, module_mem, n, m, params = x
|
||||
|
||||
lowvram_weight = False
|
||||
|
||||
potential_offload = max(offload_buffer, module_offload_mem + sum([x1[1] for x1 in loading[i + 1:i + 1 + model_management.NUM_STREAMS]]))
|
||||
lowvram_fits = mem_counter + module_mem + potential_offload < lowvram_model_memory
|
||||
|
||||
weight_key = "{}.weight".format(n)
|
||||
bias_key = "{}.bias".format(n)
|
||||
|
||||
if not full_load and hasattr(m, "comfy_cast_weights"):
|
||||
if mem_counter + module_mem >= lowvram_model_memory:
|
||||
if self.gguf.loaded_from_gguf and self.load_device == self.offload_device:
|
||||
lowvram_fits = True
|
||||
if not lowvram_fits:
|
||||
offload_buffer = potential_offload
|
||||
lowvram_weight = True
|
||||
lowvram_counter += 1
|
||||
lowvram_mem_counter += module_mem
|
||||
if hasattr(m, "prev_comfy_cast_weights"): # Already lowvramed
|
||||
continue
|
||||
|
||||
@ -771,13 +856,16 @@ class ModelPatcher(ModelManageable, PatchSupport):
|
||||
patch_counter += 1
|
||||
|
||||
cast_weight = True
|
||||
offloaded.append(LoadingListItem(0, module_mem, n, m, params))
|
||||
else:
|
||||
if hasattr(m, "comfy_cast_weights"):
|
||||
wipe_lowvram_weight(m)
|
||||
|
||||
if full_load or mem_counter + module_mem < lowvram_model_memory:
|
||||
if full_load or lowvram_fits:
|
||||
mem_counter += module_mem
|
||||
load_completely.append(LoadingListItem(module_mem, n, m, params))
|
||||
load_completely.append(LoadingListItem(0, module_mem, n, m, params))
|
||||
else:
|
||||
offload_buffer = potential_offload
|
||||
|
||||
if cast_weight and hasattr(m, "comfy_cast_weights"):
|
||||
m.prev_comfy_cast_weights = m.comfy_cast_weights
|
||||
@ -802,7 +890,11 @@ class ModelPatcher(ModelManageable, PatchSupport):
|
||||
continue
|
||||
|
||||
for param in params:
|
||||
self.patch_weight_to_device("{}.{}".format(n, param), device_to=device_to)
|
||||
key = "{}.{}".format(n, param)
|
||||
self.unpin_weight(key)
|
||||
self.patch_weight_to_device(key, device_to=device_to)
|
||||
if model_management.is_device_cuda(device_to):
|
||||
torch.cuda.synchronize()
|
||||
|
||||
models_loaded_regularly.append("name={} module={}".format(n, m))
|
||||
m.comfy_patched_weights = True
|
||||
@ -810,11 +902,21 @@ class ModelPatcher(ModelManageable, PatchSupport):
|
||||
for x in load_completely:
|
||||
x.module.to(device_to)
|
||||
|
||||
for x in offloaded:
|
||||
n = x.name
|
||||
params = x.params
|
||||
for param in params:
|
||||
self.pin_weight_to_device("{}.{}".format(n, param))
|
||||
|
||||
if lowvram_counter > 0:
|
||||
logger.debug(f"loaded partially lowvram_model_memory={lowvram_model_memory / (1024 * 1024):.1f}MB mem_counter={mem_counter / (1024 * 1024):.1f}MB patch_counter={patch_counter}")
|
||||
if hasattr(self.model, "model_lovram"):
|
||||
self.model.model_lowvram = True
|
||||
self._memory_measurements.model_lowvram = True
|
||||
else:
|
||||
logger.debug(f"loaded completely lowvram_model_memory={lowvram_model_memory / (1024 * 1024):.1f}MB mem_counter={mem_counter / (1024 * 1024):.1f}MB full_load={full_load}")
|
||||
if hasattr(self.model, "model_lovram"):
|
||||
self.model.model_lowvram = False
|
||||
self._memory_measurements.model_lowvram = False
|
||||
if full_load:
|
||||
self.model.to(device_to)
|
||||
@ -843,9 +945,9 @@ class ModelPatcher(ModelManageable, PatchSupport):
|
||||
self.gguf.mmap_released = True
|
||||
|
||||
self._memory_measurements.lowvram_patch_counter += patch_counter
|
||||
|
||||
self.model_device = device_to
|
||||
self._memory_measurements.model_loaded_weight_memory = mem_counter
|
||||
self._memory_measurements.model_offload_buffer_memory = offload_buffer
|
||||
self._memory_measurements.current_weight_patches_uuid = self.patches_uuid
|
||||
|
||||
for callback in self.get_all_callbacks(CallbacksMP.ON_LOAD):
|
||||
@ -881,6 +983,7 @@ class ModelPatcher(ModelManageable, PatchSupport):
|
||||
p.patches = []
|
||||
if unpatch_weights:
|
||||
self.unpatch_hooks()
|
||||
self.unpin_all_weights()
|
||||
if self._memory_measurements.model_lowvram:
|
||||
for m in self.model.modules():
|
||||
move_weight_functions(m, device_to)
|
||||
@ -907,6 +1010,7 @@ class ModelPatcher(ModelManageable, PatchSupport):
|
||||
self.model.to(device_to)
|
||||
self.model_device = device_to
|
||||
self._memory_measurements.model_loaded_weight_memory = 0
|
||||
self._memory_measurements.model_offload_buffer_memory = 0
|
||||
|
||||
for m in self.model.modules():
|
||||
if hasattr(m, "comfy_patched_weights"):
|
||||
@ -918,7 +1022,7 @@ class ModelPatcher(ModelManageable, PatchSupport):
|
||||
|
||||
self.object_patches_backup.clear()
|
||||
|
||||
def partially_unload(self, device_to, memory_to_free=0):
|
||||
def partially_unload(self, device_to, memory_to_free=0, force_patch_weights=False):
|
||||
freed_layers: list[str] = []
|
||||
with self.use_ejected():
|
||||
hooks_unpatched = False
|
||||
@ -926,13 +1030,19 @@ class ModelPatcher(ModelManageable, PatchSupport):
|
||||
patch_counter = 0
|
||||
unload_list = self._load_list()
|
||||
unload_list.sort()
|
||||
|
||||
offload_buffer = self._memory_measurements.model_offload_buffer_memory
|
||||
offload_weight_factor = 0
|
||||
if len(unload_list) > 0:
|
||||
NS = model_management.NUM_STREAMS
|
||||
offload_weight_factor = [min(offload_buffer / (NS + 1), unload_list[0][1])] * NS
|
||||
|
||||
for unload in unload_list:
|
||||
if memory_to_free < memory_freed:
|
||||
if memory_to_free + offload_buffer - self._memory_measurements.model_offload_buffer_memory < memory_freed:
|
||||
break
|
||||
module_mem = unload[0]
|
||||
n = unload[1]
|
||||
m = unload[2]
|
||||
params = unload[3]
|
||||
module_offload_mem, module_mem, n, m, params = unload
|
||||
|
||||
potential_offload = module_offload_mem + sum(offload_weight_factor)
|
||||
|
||||
lowvram_possible = hasattr(m, "comfy_cast_weights")
|
||||
if hasattr(m, "comfy_patched_weights") and m.comfy_patched_weights == True:
|
||||
@ -963,27 +1073,40 @@ class ModelPatcher(ModelManageable, PatchSupport):
|
||||
module_mem += move_weight_functions(m, device_to)
|
||||
if lowvram_possible:
|
||||
if weight_key in self.patches:
|
||||
_, set_func, convert_func = get_key_weight(self.model, weight_key)
|
||||
m.weight_function.append(LowVramPatch(weight_key, self.patches, convert_func, set_func))
|
||||
patch_counter += 1
|
||||
if force_patch_weights:
|
||||
self.patch_weight_to_device(weight_key)
|
||||
else:
|
||||
_, set_func, convert_func = get_key_weight(self.model, weight_key)
|
||||
m.weight_function.append(LowVramPatch(weight_key, self.patches, convert_func, set_func))
|
||||
patch_counter += 1
|
||||
if bias_key in self.patches:
|
||||
_, set_func, convert_func = get_key_weight(self.model, bias_key)
|
||||
m.bias_function.append(LowVramPatch(bias_key, self.patches, convert_func, set_func))
|
||||
patch_counter += 1
|
||||
if force_patch_weights:
|
||||
self.patch_weight_to_device(bias_key)
|
||||
else:
|
||||
_, set_func, convert_func = get_key_weight(self.model, bias_key)
|
||||
m.bias_function.append(LowVramPatch(bias_key, self.patches, convert_func, set_func))
|
||||
patch_counter += 1
|
||||
cast_weight = True
|
||||
|
||||
if cast_weight:
|
||||
if cast_weight and hasattr(m, "comfy_cast_weights"):
|
||||
m.prev_comfy_cast_weights = m.comfy_cast_weights
|
||||
m.comfy_cast_weights = True
|
||||
m.comfy_patched_weights = False
|
||||
memory_freed += module_mem
|
||||
freed_layers.append(n)
|
||||
offload_buffer = max(offload_buffer, potential_offload)
|
||||
offload_weight_factor.append(module_mem)
|
||||
offload_weight_factor.pop(0)
|
||||
|
||||
logger.debug("freed {}".format(natsorted(freed_layers)))
|
||||
for param in params:
|
||||
self.pin_weight_to_device("{}.{}".format(n, param))
|
||||
|
||||
logger.debug(f"Freed {natsorted(freed_layers)}")
|
||||
|
||||
self._memory_measurements.model_lowvram = True
|
||||
self._memory_measurements.lowvram_patch_counter += patch_counter
|
||||
self._memory_measurements.model_loaded_weight_memory -= memory_freed
|
||||
self._memory_measurements.model_offload_buffer_memory = offload_buffer
|
||||
return memory_freed
|
||||
|
||||
def partially_load(self, device_to, extra_memory=0, force_patch_weights=False) -> int:
|
||||
@ -996,6 +1119,9 @@ class ModelPatcher(ModelManageable, PatchSupport):
|
||||
extra_memory += (used - self._memory_measurements.model_loaded_weight_memory)
|
||||
|
||||
self.patch_model(load_weights=False)
|
||||
if extra_memory < 0 and not unpatch_weights:
|
||||
self.partially_unload(self.offload_device, -extra_memory, force_patch_weights=force_patch_weights)
|
||||
return 0
|
||||
full_load = False
|
||||
if not self._memory_measurements.model_lowvram and self._memory_measurements.model_loaded_weight_memory > 0:
|
||||
return 0
|
||||
@ -1080,9 +1206,11 @@ class ModelPatcher(ModelManageable, PatchSupport):
|
||||
w.append(wrapper)
|
||||
|
||||
def remove_wrappers_with_key(self, wrapper_type: str, key: str):
|
||||
wrappers_removed = []
|
||||
w = self.wrappers.get(wrapper_type, {})
|
||||
if key in w:
|
||||
w.pop(key)
|
||||
wrappers_removed.append(w.pop(key))
|
||||
return wrappers_removed
|
||||
|
||||
def get_wrappers(self, wrapper_type: str, key: str):
|
||||
return self.wrappers.get(wrapper_type, {}).get(key, [])
|
||||
@ -1399,4 +1527,5 @@ class ModelPatcher(ModelManageable, PatchSupport):
|
||||
self.clear_cached_hook_weights()
|
||||
|
||||
def __del__(self):
|
||||
self.unpin_all_weights()
|
||||
self.detach(unpatch_all=False)
|
||||
|
||||
91
comfy/nested_tensor.py
Normal file
91
comfy/nested_tensor.py
Normal file
@ -0,0 +1,91 @@
|
||||
import torch
|
||||
|
||||
class NestedTensor:
|
||||
def __init__(self, tensors):
|
||||
self.tensors = list(tensors)
|
||||
self.is_nested = True
|
||||
|
||||
def _copy(self):
|
||||
return NestedTensor(self.tensors)
|
||||
|
||||
def apply_operation(self, other, operation):
|
||||
o = self._copy()
|
||||
if isinstance(other, NestedTensor):
|
||||
for i, t in enumerate(o.tensors):
|
||||
o.tensors[i] = operation(t, other.tensors[i])
|
||||
else:
|
||||
for i, t in enumerate(o.tensors):
|
||||
o.tensors[i] = operation(t, other)
|
||||
return o
|
||||
|
||||
def __add__(self, b):
|
||||
return self.apply_operation(b, lambda x, y: x + y)
|
||||
|
||||
def __sub__(self, b):
|
||||
return self.apply_operation(b, lambda x, y: x - y)
|
||||
|
||||
def __mul__(self, b):
|
||||
return self.apply_operation(b, lambda x, y: x * y)
|
||||
|
||||
# def __itruediv__(self, b):
|
||||
# return self.apply_operation(b, lambda x, y: x / y)
|
||||
|
||||
def __truediv__(self, b):
|
||||
return self.apply_operation(b, lambda x, y: x / y)
|
||||
|
||||
def __getitem__(self, *args, **kwargs):
|
||||
return self.apply_operation(None, lambda x, y: x.__getitem__(*args, **kwargs))
|
||||
|
||||
def unbind(self):
|
||||
return self.tensors
|
||||
|
||||
def to(self, *args, **kwargs):
|
||||
o = self._copy()
|
||||
for i, t in enumerate(o.tensors):
|
||||
o.tensors[i] = t.to(*args, **kwargs)
|
||||
return o
|
||||
|
||||
def new_ones(self, *args, **kwargs):
|
||||
return self.tensors[0].new_ones(*args, **kwargs)
|
||||
|
||||
def float(self):
|
||||
return self.to(dtype=torch.float)
|
||||
|
||||
def chunk(self, *args, **kwargs):
|
||||
return self.apply_operation(None, lambda x, y: x.chunk(*args, **kwargs))
|
||||
|
||||
def size(self):
|
||||
return self.tensors[0].size()
|
||||
|
||||
@property
|
||||
def shape(self):
|
||||
return self.tensors[0].shape
|
||||
|
||||
@property
|
||||
def ndim(self):
|
||||
dims = 0
|
||||
for t in self.tensors:
|
||||
dims = max(t.ndim, dims)
|
||||
return dims
|
||||
|
||||
@property
|
||||
def device(self):
|
||||
return self.tensors[0].device
|
||||
|
||||
@property
|
||||
def dtype(self):
|
||||
return self.tensors[0].dtype
|
||||
|
||||
@property
|
||||
def layout(self):
|
||||
return self.tensors[0].layout
|
||||
|
||||
|
||||
def cat_nested(tensors, *args, **kwargs):
|
||||
cated_tensors = []
|
||||
for i in range(len(tensors[0].tensors)):
|
||||
tens = []
|
||||
for j in range(len(tensors)):
|
||||
tens.append(tensors[j].tensors[i])
|
||||
cated_tensors.append(torch.cat(tens, *args, **kwargs))
|
||||
return NestedTensor(cated_tensors)
|
||||
@ -4,6 +4,7 @@ import pathlib
|
||||
import requests_cache
|
||||
from contextlib import contextmanager
|
||||
|
||||
from .cli_args import args
|
||||
|
||||
@contextmanager
|
||||
def use_requests_caching(
|
||||
@ -35,5 +36,9 @@ def use_requests_caching(
|
||||
kwargs.setdefault('use_cache_dir', not path_provided)
|
||||
kwargs.setdefault('cache_control', cache_control)
|
||||
|
||||
if args.disable_requests_caching:
|
||||
yield
|
||||
return
|
||||
|
||||
with requests_cache.enabled(cache_name, **kwargs):
|
||||
yield
|
||||
|
||||
@ -745,8 +745,10 @@ class LoraLoaderModelOnly(LoraLoader):
|
||||
|
||||
|
||||
class VAELoader:
|
||||
video_taes = ["taehv", "lighttaew2_2", "lighttaew2_1", "lighttaehy1_5"]
|
||||
image_taes = ["taesd", "taesdxl", "taesd3", "taef1"]
|
||||
@staticmethod
|
||||
def vae_list():
|
||||
def vae_list(s=None):
|
||||
vaes = get_filename_list_with_downloadable("vae", KNOWN_VAES)
|
||||
approx_vaes = get_filename_list_with_downloadable("vae_approx", KNOWN_APPROX_VAES)
|
||||
sdxl_taesd_enc = False
|
||||
@ -775,6 +777,11 @@ class VAELoader:
|
||||
f1_taesd_dec = True
|
||||
elif v.startswith("taef1_decoder."):
|
||||
f1_taesd_enc = True
|
||||
else:
|
||||
for tae in VAELoader.video_taes:
|
||||
if v.startswith(tae):
|
||||
vaes.append(v)
|
||||
|
||||
if sd1_taesd_dec and sd1_taesd_enc:
|
||||
vaes.append("taesd")
|
||||
if sdxl_taesd_dec and sdxl_taesd_enc:
|
||||
@ -818,8 +825,7 @@ class VAELoader:
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": {"vae_name": (s.vae_list(),)}}
|
||||
|
||||
return {"required": {"vae_name": (s.vae_list(s),)}}
|
||||
RETURN_TYPES = ("VAE",)
|
||||
FUNCTION = "load_vae"
|
||||
|
||||
@ -831,10 +837,13 @@ class VAELoader:
|
||||
if vae_name == "pixel_space":
|
||||
sd_ = {}
|
||||
sd_["pixel_space_vae"] = torch.tensor(1.0)
|
||||
elif vae_name in ["taesd", "taesdxl", "taesd3", "taef1"]:
|
||||
elif vae_name in self.image_taes:
|
||||
sd_ = self.load_taesd(vae_name)
|
||||
else:
|
||||
vae_path = get_full_path_or_raise("vae", vae_name, KNOWN_VAES)
|
||||
if os.path.splitext(vae_name)[0] in self.video_taes:
|
||||
vae_path = folder_paths.get_full_path_or_raise("vae_approx", vae_name)
|
||||
else:
|
||||
vae_path = get_full_path_or_raise("vae", vae_name, KNOWN_VAES)
|
||||
sd_, metadata = utils.load_torch_file(vae_path, return_metadata=True)
|
||||
vae = sd.VAE(sd=sd_, metadata=metadata, ckpt_name=vae_name)
|
||||
vae.throw_exception_if_invalid()
|
||||
@ -1016,7 +1025,7 @@ class CLIPLoader:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": {"clip_name": (get_filename_list_with_downloadable("text_encoders", KNOWN_CLIP_MODELS),),
|
||||
"type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image"],),
|
||||
"type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis"],),
|
||||
},
|
||||
"optional": {
|
||||
"device": (["default", "cpu"], {"advanced": True}),
|
||||
@ -1046,7 +1055,7 @@ class DualCLIPLoader:
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": {"clip_name1": (get_filename_list_with_downloadable("text_encoders"),), "clip_name2": (
|
||||
get_filename_list_with_downloadable("text_encoders"),),
|
||||
"type": (["sdxl", "sd3", "flux", "hunyuan_video", "hidream", "hunyuan_image"],),
|
||||
"type": (["sdxl", "sd3", "flux", "hunyuan_video", "hidream", "hunyuan_image", "hunyuan_video_15", "kandinsky5", "kandinsky5_image"],),
|
||||
},
|
||||
"optional": {
|
||||
"device": (["default", "cpu"], {"advanced": True}),
|
||||
@ -2003,6 +2012,11 @@ class ImageBatch:
|
||||
CATEGORY = "image"
|
||||
|
||||
def batch(self, image1, image2):
|
||||
if image1.shape[-1] != image2.shape[-1]:
|
||||
if image1.shape[-1] > image2.shape[-1]:
|
||||
image2 = torch.nn.functional.pad(image2, (0,1), mode='constant', value=1.0)
|
||||
else:
|
||||
image1 = torch.nn.functional.pad(image1, (0,1), mode='constant', value=1.0)
|
||||
if image1.shape[1:] != image2.shape[1:]:
|
||||
image2 = utils.common_upscale(image2.movedim(-1, 1), image1.shape[2], image1.shape[1], "bilinear", "center").movedim(1, -1)
|
||||
s = torch.cat((image1, image2), dim=0)
|
||||
|
||||
457
comfy/ops.py
457
comfy/ops.py
@ -17,20 +17,22 @@
|
||||
"""
|
||||
import contextlib
|
||||
import logging
|
||||
import torch
|
||||
from torch import Tensor
|
||||
from typing import Optional, Type, Union
|
||||
|
||||
import torch
|
||||
from torch import Tensor
|
||||
|
||||
from . import model_management, rmsnorm
|
||||
from .interruption import throw_exception_if_processing_interrupted
|
||||
from .cli_args import args, PerformanceFeature
|
||||
from .execution_context import current_execution_context
|
||||
from .float import stochastic_rounding
|
||||
from .interruption import throw_exception_if_processing_interrupted
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_RUN_EVERY_OP_ENABLED = model_management.torch_version_numeric >= (2, 5)
|
||||
|
||||
import json
|
||||
|
||||
|
||||
def run_every_op():
|
||||
global _RUN_EVERY_OP_ENABLED
|
||||
@ -82,7 +84,8 @@ except Exception as exc_info:
|
||||
NVIDIA_MEMORY_CONV_BUG_WORKAROUND = False
|
||||
try:
|
||||
if model_management.is_nvidia():
|
||||
if torch.backends.cudnn.version() >= 91002 and model_management.torch_version_numeric >= (2, 9) and model_management.torch_version_numeric <= (2, 10):
|
||||
cudnn_version = torch.backends.cudnn.version()
|
||||
if (cudnn_version >= 91002 and cudnn_version < 91500) and model_management.torch_version_numeric >= (2, 9) and model_management.torch_version_numeric <= (2, 10):
|
||||
# TODO: change upper bound version once it's fixed'
|
||||
NVIDIA_MEMORY_CONV_BUG_WORKAROUND = True
|
||||
logger.debug("working around nvidia conv3d memory bug.")
|
||||
@ -96,41 +99,74 @@ def cast_to_input(weight, input, non_blocking=False, copy=True):
|
||||
return model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy)
|
||||
|
||||
|
||||
@torch.compiler.disable()
|
||||
def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None):
|
||||
def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, offloadable=False):
|
||||
# NOTE: offloadable=False is a a legacy and if you are a custom node author reading this please pass
|
||||
# offloadable=True and call uncast_bias_weight() after your last usage of the weight/bias. This
|
||||
# will add async-offload support to your cast and improve performance.
|
||||
if input is not None:
|
||||
if dtype is None:
|
||||
dtype = input.dtype
|
||||
if isinstance(input, QuantizedTensor):
|
||||
dtype = input._layout_params["orig_dtype"]
|
||||
else:
|
||||
dtype = input.dtype
|
||||
if bias_dtype is None:
|
||||
bias_dtype = dtype
|
||||
if device is None:
|
||||
device = input.device
|
||||
|
||||
offload_stream = model_management.get_offload_stream(device)
|
||||
if offload_stream is not None:
|
||||
wf_context = offload_stream
|
||||
if offloadable and (device != s.weight.device or
|
||||
(s.bias is not None and device != s.bias.device)):
|
||||
offload_stream = model_management.get_offload_stream(device)
|
||||
else:
|
||||
wf_context = contextlib.nullcontext()
|
||||
offload_stream = None
|
||||
|
||||
non_blocking = model_management.device_supports_non_blocking(device)
|
||||
|
||||
weight_has_function = len(s.weight_function) > 0
|
||||
bias_has_function = len(s.bias_function) > 0
|
||||
|
||||
weight = model_management.cast_to(s.weight, None, device, non_blocking=non_blocking, copy=weight_has_function, stream=offload_stream)
|
||||
|
||||
bias = None
|
||||
non_blocking = True if torch.jit.is_tracing() or torch.jit.is_scripting() else model_management.device_supports_non_blocking(device)
|
||||
if s.bias is not None:
|
||||
has_function = len(s.bias_function) > 0
|
||||
bias = model_management.cast_to(s.bias, bias_dtype, device, non_blocking=non_blocking, copy=has_function, stream=offload_stream)
|
||||
if has_function:
|
||||
with wf_context:
|
||||
for f in s.bias_function:
|
||||
bias = f(bias)
|
||||
|
||||
has_function = len(s.weight_function) > 0
|
||||
weight = model_management.cast_to(s.weight, dtype, device, non_blocking=non_blocking, copy=has_function, stream=offload_stream)
|
||||
if has_function:
|
||||
with wf_context:
|
||||
for f in s.weight_function:
|
||||
weight = f(weight)
|
||||
bias = model_management.cast_to(s.bias, bias_dtype, device, non_blocking=non_blocking, copy=bias_has_function, stream=offload_stream)
|
||||
|
||||
model_management.sync_stream(device, offload_stream)
|
||||
return weight, bias
|
||||
|
||||
bias_a = bias
|
||||
weight_a = weight
|
||||
|
||||
if s.bias is not None:
|
||||
for f in s.bias_function:
|
||||
bias = f(bias)
|
||||
|
||||
if weight_has_function or weight.dtype != dtype:
|
||||
weight = weight.to(dtype=dtype)
|
||||
if isinstance(weight, QuantizedTensor):
|
||||
weight = weight.dequantize()
|
||||
for f in s.weight_function:
|
||||
weight = f(weight)
|
||||
|
||||
if offloadable:
|
||||
return weight, bias, (offload_stream, weight_a, bias_a)
|
||||
else:
|
||||
# Legacy function signature
|
||||
return weight, bias
|
||||
|
||||
|
||||
def uncast_bias_weight(s, weight, bias, offload_stream):
|
||||
if offload_stream is None:
|
||||
return
|
||||
os, weight_a, bias_a = offload_stream
|
||||
if os is None:
|
||||
return
|
||||
if weight_a is not None:
|
||||
device = weight_a.device
|
||||
else:
|
||||
if bias_a is None:
|
||||
return
|
||||
device = bias_a.device
|
||||
os.wait_stream(model_management.current_stream(device))
|
||||
|
||||
|
||||
class SkipInit:
|
||||
@ -191,8 +227,10 @@ class disable_weight_init:
|
||||
return None
|
||||
|
||||
def forward_comfy_cast_weights(self, input):
|
||||
weight, bias = cast_bias_weight(self, input)
|
||||
return torch.nn.functional.linear(input, weight, bias)
|
||||
weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True)
|
||||
x = torch.nn.functional.linear(input, weight, bias)
|
||||
uncast_bias_weight(self, weight, bias, offload_stream)
|
||||
return x
|
||||
|
||||
def forward(self, *args, **kwargs):
|
||||
run_every_op()
|
||||
@ -206,8 +244,10 @@ class disable_weight_init:
|
||||
return None
|
||||
|
||||
def forward_comfy_cast_weights(self, input):
|
||||
weight, bias = cast_bias_weight(self, input)
|
||||
return self._conv_forward(input, weight, bias)
|
||||
weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True)
|
||||
x = self._conv_forward(input, weight, bias)
|
||||
uncast_bias_weight(self, weight, bias, offload_stream)
|
||||
return x
|
||||
|
||||
def forward(self, *args, **kwargs):
|
||||
run_every_op()
|
||||
@ -221,8 +261,10 @@ class disable_weight_init:
|
||||
return None
|
||||
|
||||
def forward_comfy_cast_weights(self, input):
|
||||
weight, bias = cast_bias_weight(self, input)
|
||||
return self._conv_forward(input, weight, bias)
|
||||
weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True)
|
||||
x = self._conv_forward(input, weight, bias)
|
||||
uncast_bias_weight(self, weight, bias, offload_stream)
|
||||
return x
|
||||
|
||||
def forward(self, *args, **kwargs):
|
||||
run_every_op()
|
||||
@ -245,8 +287,10 @@ class disable_weight_init:
|
||||
return super()._conv_forward(input, weight, bias, *args, **kwargs)
|
||||
|
||||
def forward_comfy_cast_weights(self, input):
|
||||
weight, bias = cast_bias_weight(self, input)
|
||||
return self._conv_forward(input, weight, bias)
|
||||
weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True)
|
||||
x = self._conv_forward(input, weight, bias)
|
||||
uncast_bias_weight(self, weight, bias, offload_stream)
|
||||
return x
|
||||
|
||||
def forward(self, *args, **kwargs):
|
||||
run_every_op()
|
||||
@ -260,8 +304,10 @@ class disable_weight_init:
|
||||
return None
|
||||
|
||||
def forward_comfy_cast_weights(self, input):
|
||||
weight, bias = cast_bias_weight(self, input)
|
||||
return torch.nn.functional.group_norm(input, self.num_groups, weight, bias, self.eps)
|
||||
weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True)
|
||||
x = torch.nn.functional.group_norm(input, self.num_groups, weight, bias, self.eps)
|
||||
uncast_bias_weight(self, weight, bias, offload_stream)
|
||||
return x
|
||||
|
||||
def forward(self, *args, **kwargs):
|
||||
run_every_op()
|
||||
@ -276,11 +322,14 @@ class disable_weight_init:
|
||||
|
||||
def forward_comfy_cast_weights(self, input):
|
||||
if self.weight is not None:
|
||||
weight, bias = cast_bias_weight(self, input)
|
||||
weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True)
|
||||
else:
|
||||
weight = None
|
||||
bias = None
|
||||
return torch.nn.functional.layer_norm(input, self.normalized_shape, weight, bias, self.eps)
|
||||
offload_stream = None
|
||||
x = torch.nn.functional.layer_norm(input, self.normalized_shape, weight, bias, self.eps)
|
||||
uncast_bias_weight(self, weight, bias, offload_stream)
|
||||
return x
|
||||
|
||||
def forward(self, *args, **kwargs):
|
||||
run_every_op()
|
||||
@ -296,11 +345,15 @@ class disable_weight_init:
|
||||
|
||||
def forward_comfy_cast_weights(self, input):
|
||||
if self.weight is not None:
|
||||
weight, bias = cast_bias_weight(self, input)
|
||||
weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True)
|
||||
else:
|
||||
weight = None
|
||||
return rmsnorm.rms_norm(input, weight, self.eps) # TODO: switch to commented out line when old torch is deprecated
|
||||
# return torch.nn.functional.rms_norm(input, self.normalized_shape, weight, self.eps)
|
||||
bias = None
|
||||
offload_stream = None
|
||||
x = rmsnorm.rms_norm(input, weight, self.eps) # TODO: switch to commented out line when old torch is deprecated
|
||||
# x = torch.nn.functional.rms_norm(input, self.normalized_shape, weight, self.eps)
|
||||
uncast_bias_weight(self, weight, bias, offload_stream)
|
||||
return x
|
||||
|
||||
def forward(self, *args, **kwargs):
|
||||
run_every_op()
|
||||
@ -319,10 +372,12 @@ class disable_weight_init:
|
||||
input, output_size, self.stride, self.padding, self.kernel_size,
|
||||
num_spatial_dims, self.dilation)
|
||||
|
||||
weight, bias = cast_bias_weight(self, input)
|
||||
return torch.nn.functional.conv_transpose2d(
|
||||
weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True)
|
||||
x = torch.nn.functional.conv_transpose2d(
|
||||
input, weight, bias, self.stride, self.padding,
|
||||
output_padding, self.groups, self.dilation)
|
||||
uncast_bias_weight(self, weight, bias, offload_stream)
|
||||
return x
|
||||
|
||||
def forward(self, *args, **kwargs):
|
||||
run_every_op()
|
||||
@ -341,10 +396,12 @@ class disable_weight_init:
|
||||
input, output_size, self.stride, self.padding, self.kernel_size,
|
||||
num_spatial_dims, self.dilation)
|
||||
|
||||
weight, bias = cast_bias_weight(self, input)
|
||||
return torch.nn.functional.conv_transpose1d(
|
||||
weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True)
|
||||
x = torch.nn.functional.conv_transpose1d(
|
||||
input, weight, bias, self.stride, self.padding,
|
||||
output_padding, self.groups, self.dilation)
|
||||
uncast_bias_weight(self, weight, bias, offload_stream)
|
||||
return x
|
||||
|
||||
def forward(self, *args, **kwargs):
|
||||
run_every_op()
|
||||
@ -362,8 +419,10 @@ class disable_weight_init:
|
||||
output_dtype = out_dtype
|
||||
if self.weight.dtype == torch.float16 or self.weight.dtype == torch.bfloat16:
|
||||
out_dtype = None
|
||||
weight, bias = cast_bias_weight(self, device=input.device, dtype=out_dtype)
|
||||
return torch.nn.functional.embedding(input, weight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse).to(dtype=output_dtype)
|
||||
weight, bias, offload_stream = cast_bias_weight(self, device=input.device, dtype=out_dtype, offloadable=True)
|
||||
x = torch.nn.functional.embedding(input, weight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse).to(dtype=output_dtype)
|
||||
uncast_bias_weight(self, weight, bias, offload_stream)
|
||||
return x
|
||||
|
||||
def forward(self, *args, **kwargs):
|
||||
run_every_op()
|
||||
@ -417,48 +476,33 @@ class manual_cast(disable_weight_init):
|
||||
|
||||
|
||||
def fp8_linear(self, input):
|
||||
"""
|
||||
Legacy FP8 linear function for backward compatibility.
|
||||
Uses QuantizedTensor subclass for dispatch.
|
||||
"""
|
||||
dtype = self.weight.dtype
|
||||
if dtype not in [torch.float8_e4m3fn]:
|
||||
return None
|
||||
|
||||
tensor_2d = False
|
||||
if len(input.shape) == 2:
|
||||
tensor_2d = True
|
||||
input = input.unsqueeze(1)
|
||||
|
||||
input_shape = input.shape
|
||||
input_dtype = input.dtype
|
||||
if len(input.shape) == 3:
|
||||
w, bias = cast_bias_weight(self, input, dtype=dtype, bias_dtype=input_dtype)
|
||||
w = w.t()
|
||||
|
||||
scale_weight = self.scale_weight
|
||||
scale_input = self.scale_input
|
||||
if scale_weight is None:
|
||||
scale_weight = torch.ones((), device=input.device, dtype=torch.float32)
|
||||
else:
|
||||
scale_weight = scale_weight.to(input.device)
|
||||
if input.ndim == 3 or input.ndim == 2:
|
||||
w, bias, offload_stream = cast_bias_weight(self, input, dtype=dtype, bias_dtype=input_dtype, offloadable=True)
|
||||
scale_weight = torch.ones((), device=input.device, dtype=torch.float32)
|
||||
|
||||
if scale_input is None:
|
||||
scale_input = torch.ones((), device=input.device, dtype=torch.float32)
|
||||
input = torch.clamp(input, min=-448, max=448, out=input)
|
||||
input = input.reshape(-1, input_shape[2]).to(dtype).contiguous()
|
||||
else:
|
||||
scale_input = scale_input.to(input.device)
|
||||
input = (input * (1.0 / scale_input).to(input_dtype)).reshape(-1, input_shape[2]).to(dtype).contiguous()
|
||||
scale_input = torch.ones((), device=input.device, dtype=torch.float32)
|
||||
input = torch.clamp(input, min=-448, max=448, out=input)
|
||||
layout_params_weight = {'scale': scale_input, 'orig_dtype': input_dtype}
|
||||
quantized_input = QuantizedTensor(input.to(dtype).contiguous(), "TensorCoreFP8Layout", layout_params_weight)
|
||||
|
||||
if bias is not None:
|
||||
o = torch._scaled_mm(input, w, out_dtype=input_dtype, bias=bias, scale_a=scale_input, scale_b=scale_weight)
|
||||
else:
|
||||
o = torch._scaled_mm(input, w, out_dtype=input_dtype, scale_a=scale_input, scale_b=scale_weight)
|
||||
# Wrap weight in QuantizedTensor - this enables unified dispatch
|
||||
# Call F.linear - __torch_dispatch__ routes to fp8_linear handler in quant_ops.py!
|
||||
layout_params_weight = {'scale': scale_weight, 'orig_dtype': input_dtype}
|
||||
quantized_weight = QuantizedTensor(w, "TensorCoreFP8Layout", layout_params_weight)
|
||||
o = torch.nn.functional.linear(quantized_input, quantized_weight, bias)
|
||||
|
||||
if isinstance(o, tuple):
|
||||
o = o[0]
|
||||
|
||||
if tensor_2d:
|
||||
return o.reshape(input_shape[0], -1)
|
||||
|
||||
return o.reshape((-1, input_shape[1], self.weight.shape[0]))
|
||||
uncast_bias_weight(self, w, bias, offload_stream)
|
||||
return o
|
||||
|
||||
return None
|
||||
|
||||
@ -471,7 +515,7 @@ class fp8_ops(manual_cast):
|
||||
return None
|
||||
|
||||
def forward_comfy_cast_weights(self, input):
|
||||
if not self.training:
|
||||
if len(self.weight_function) == 0 and len(self.bias_function) == 0:
|
||||
try:
|
||||
out = fp8_linear(self, input)
|
||||
if out is not None:
|
||||
@ -479,67 +523,18 @@ class fp8_ops(manual_cast):
|
||||
except Exception as e:
|
||||
logger.info("Exception during fp8 op: {}".format(e))
|
||||
|
||||
weight, bias = cast_bias_weight(self, input)
|
||||
return torch.nn.functional.linear(input, weight, bias)
|
||||
if input.dtype == torch.float32 and (self.weight.dtype == torch.float16 or self.weight.dtype == torch.bfloat16):
|
||||
input = input.to(self.weight.dtype)
|
||||
weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True)
|
||||
x = torch.nn.functional.linear(input, weight, bias)
|
||||
uncast_bias_weight(self, weight, bias, offload_stream)
|
||||
return x
|
||||
|
||||
|
||||
class scaled_fp8_op_base(manual_cast):
|
||||
pass
|
||||
|
||||
|
||||
def scaled_fp8_ops(fp8_matrix_mult=False, scale_input=False, override_dtype=None):
|
||||
logger.info("Using scaled fp8: fp8 matrix mult: {}, scale input: {}".format(fp8_matrix_mult, scale_input))
|
||||
|
||||
class scaled_fp8_op(scaled_fp8_op_base):
|
||||
class Linear(manual_cast.Linear):
|
||||
def __init__(self, *args, **kwargs):
|
||||
if override_dtype is not None:
|
||||
kwargs['dtype'] = override_dtype
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def reset_parameters(self):
|
||||
if not hasattr(self, 'scale_weight'):
|
||||
self.scale_weight = torch.nn.parameter.Parameter(data=torch.ones((), device=self.weight.device, dtype=torch.float32), requires_grad=False)
|
||||
|
||||
if not scale_input:
|
||||
self.scale_input = None
|
||||
|
||||
if not hasattr(self, 'scale_input'):
|
||||
self.scale_input = torch.nn.parameter.Parameter(data=torch.ones((), device=self.weight.device, dtype=torch.float32), requires_grad=False)
|
||||
return None
|
||||
|
||||
def forward_comfy_cast_weights(self, input):
|
||||
if fp8_matrix_mult:
|
||||
out = fp8_linear(self, input)
|
||||
if out is not None:
|
||||
return out
|
||||
|
||||
weight, bias = cast_bias_weight(self, input)
|
||||
|
||||
if weight.numel() < input.numel(): # TODO: optimize
|
||||
return torch.nn.functional.linear(input, weight * self.scale_weight.to(device=weight.device, dtype=weight.dtype), bias)
|
||||
else:
|
||||
return torch.nn.functional.linear(input * self.scale_weight.to(device=weight.device, dtype=weight.dtype), weight, bias)
|
||||
|
||||
def convert_weight(self, weight, inplace=False, **kwargs):
|
||||
if inplace:
|
||||
weight *= self.scale_weight.to(device=weight.device, dtype=weight.dtype)
|
||||
return weight
|
||||
else:
|
||||
return weight * self.scale_weight.to(device=weight.device, dtype=weight.dtype)
|
||||
|
||||
def set_weight(self, weight, inplace_update=False, seed=None, return_weight=False, **kwargs):
|
||||
weight = stochastic_rounding(weight / self.scale_weight.to(device=weight.device, dtype=weight.dtype), self.weight.dtype, seed=seed)
|
||||
if return_weight:
|
||||
return weight
|
||||
if inplace_update:
|
||||
self.weight.data.copy_(weight)
|
||||
else:
|
||||
self.weight = torch.nn.Parameter(weight, requires_grad=False)
|
||||
|
||||
return scaled_fp8_op
|
||||
|
||||
|
||||
CUBLAS_IS_AVAILABLE = False
|
||||
try:
|
||||
from cublas_ops import CublasLinear
|
||||
@ -565,14 +560,194 @@ else:
|
||||
|
||||
Operations = Type[Union[manual_cast, fp8_ops, disable_weight_init, skip_init, scaled_fp8_op_base]]
|
||||
|
||||
# ==============================================================================
|
||||
# Mixed Precision Operations
|
||||
# ==============================================================================
|
||||
from .quant_ops import QuantizedTensor, QUANT_ALGOS
|
||||
|
||||
def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_fp8=False, fp8_optimizations=False, scaled_fp8: Optional[torch.dtype] = None, inference_mode: Optional[bool] = None) -> Operations:
|
||||
|
||||
def mixed_precision_ops(quant_config=None, compute_dtype=torch.bfloat16, full_precision_mm=False):
|
||||
if quant_config is None:
|
||||
quant_config = {}
|
||||
|
||||
class MixedPrecisionOps(manual_cast):
|
||||
_quant_config = quant_config
|
||||
_compute_dtype = compute_dtype
|
||||
_full_precision_mm = full_precision_mm
|
||||
|
||||
class Linear(torch.nn.Module, CastWeightBiasOp):
|
||||
def __init__(
|
||||
self,
|
||||
in_features: int,
|
||||
out_features: int,
|
||||
bias: bool = True,
|
||||
device=None,
|
||||
dtype=None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
if dtype is None:
|
||||
dtype = MixedPrecisionOps._compute_dtype
|
||||
|
||||
self.factory_kwargs = {"device": device, "dtype": dtype}
|
||||
|
||||
self.in_features = in_features
|
||||
self.out_features = out_features
|
||||
self._has_bias = bias
|
||||
|
||||
self.tensor_class = None
|
||||
self._full_precision_mm = MixedPrecisionOps._full_precision_mm
|
||||
|
||||
def reset_parameters(self):
|
||||
return None
|
||||
|
||||
def _load_from_state_dict(self, state_dict, prefix, local_metadata,
|
||||
strict, missing_keys, unexpected_keys, error_msgs):
|
||||
|
||||
device = self.factory_kwargs["device"]
|
||||
layer_name = prefix.rstrip('.')
|
||||
weight_key = f"{prefix}weight"
|
||||
weight = state_dict.pop(weight_key, None)
|
||||
if weight is None:
|
||||
raise ValueError(f"Missing weight for layer {layer_name}")
|
||||
|
||||
manually_loaded_keys = [weight_key]
|
||||
|
||||
layer_conf = state_dict.pop(f"{prefix}comfy_quant", None)
|
||||
if layer_conf is not None:
|
||||
layer_conf = json.loads(layer_conf.numpy().tobytes())
|
||||
|
||||
if layer_conf is None:
|
||||
dtype = self.factory_kwargs["dtype"]
|
||||
self.weight = torch.nn.Parameter(weight.to(device=device, dtype=dtype), requires_grad=False)
|
||||
if dtype != MixedPrecisionOps._compute_dtype:
|
||||
self.comfy_cast_weights = True
|
||||
if self._has_bias:
|
||||
self.bias = torch.nn.Parameter(torch.empty(self.out_features, device=device, dtype=dtype))
|
||||
else:
|
||||
self.register_parameter("bias", None)
|
||||
else:
|
||||
self.quant_format = layer_conf.get("format", None)
|
||||
if not self._full_precision_mm:
|
||||
self._full_precision_mm = layer_conf.get("full_precision_matrix_mult", False)
|
||||
|
||||
if self.quant_format is None:
|
||||
raise ValueError(f"Unknown quantization format for layer {layer_name}")
|
||||
|
||||
qconfig = QUANT_ALGOS[self.quant_format]
|
||||
self.layout_type = qconfig["comfy_tensor_layout"]
|
||||
|
||||
weight_scale_key = f"{prefix}weight_scale"
|
||||
scale = state_dict.pop(weight_scale_key, None)
|
||||
if scale is not None:
|
||||
scale = scale.to(device)
|
||||
layout_params = {
|
||||
'scale': scale,
|
||||
'orig_dtype': MixedPrecisionOps._compute_dtype,
|
||||
'block_size': qconfig.get("group_size", None),
|
||||
}
|
||||
|
||||
if scale is not None:
|
||||
manually_loaded_keys.append(weight_scale_key)
|
||||
|
||||
self.weight = torch.nn.Parameter(
|
||||
QuantizedTensor(weight.to(device=device, dtype=qconfig.get("storage_t", None)), self.layout_type, layout_params),
|
||||
requires_grad=False
|
||||
)
|
||||
|
||||
if self._has_bias:
|
||||
self.bias = torch.nn.Parameter(torch.empty(self.out_features, device=device, dtype=MixedPrecisionOps._compute_dtype))
|
||||
else:
|
||||
self.register_parameter("bias", None)
|
||||
|
||||
for param_name in qconfig["parameters"]:
|
||||
param_key = f"{prefix}{param_name}"
|
||||
_v = state_dict.pop(param_key, None)
|
||||
if _v is None:
|
||||
continue
|
||||
self.register_parameter(param_name, torch.nn.Parameter(_v.to(device=device), requires_grad=False))
|
||||
manually_loaded_keys.append(param_key)
|
||||
|
||||
super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
|
||||
|
||||
for key in manually_loaded_keys:
|
||||
if key in missing_keys:
|
||||
missing_keys.remove(key)
|
||||
|
||||
def state_dict(self, *args, destination=None, prefix="", **kwargs):
|
||||
sd: dict = super().state_dict(*args, destination=destination, prefix=prefix, **kwargs)
|
||||
if isinstance(self.weight, QuantizedTensor):
|
||||
sd["{}weight_scale".format(prefix)] = self.weight._layout_params['scale'] # pylint: disable=unsupported-assignment-operation
|
||||
quant_conf = {"format": self.quant_format}
|
||||
if self._full_precision_mm:
|
||||
quant_conf["full_precision_matrix_mult"] = True
|
||||
sd["{}comfy_quant".format(prefix)] = torch.frombuffer(json.dumps(quant_conf).encode('utf-8'), dtype=torch.uint8) # pylint: disable=unsupported-assignment-operation
|
||||
return sd
|
||||
|
||||
def _forward(self, input, weight, bias):
|
||||
return torch.nn.functional.linear(input, weight, bias)
|
||||
|
||||
def forward_comfy_cast_weights(self, input):
|
||||
if input.dtype == torch.float32 and (self.weight.dtype == torch.float16 or self.weight.dtype == torch.bfloat16):
|
||||
input = input.to(self.weight.dtype)
|
||||
weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True)
|
||||
x = self._forward(input, weight, bias)
|
||||
uncast_bias_weight(self, weight, bias, offload_stream)
|
||||
return x
|
||||
|
||||
def forward(self, input, *args, **kwargs):
|
||||
run_every_op()
|
||||
|
||||
if self._full_precision_mm or self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
|
||||
return self.forward_comfy_cast_weights(input, *args, **kwargs)
|
||||
if (getattr(self, 'layout_type', None) is not None and
|
||||
not isinstance(input, QuantizedTensor)):
|
||||
input = QuantizedTensor.from_float(input, self.layout_type, scale=getattr(self, 'input_scale', None), dtype=self.weight.dtype)
|
||||
return self._forward(input, self.weight, self.bias)
|
||||
|
||||
def convert_weight(self, weight, inplace=False, **kwargs):
|
||||
if isinstance(weight, QuantizedTensor):
|
||||
return weight.dequantize()
|
||||
else:
|
||||
return weight
|
||||
|
||||
def set_weight(self, weight, inplace_update=False, seed=None, return_weight=False, **kwargs):
|
||||
if getattr(self, 'layout_type', None) is not None:
|
||||
weight = QuantizedTensor.from_float(weight, self.layout_type, scale="recalculate", dtype=self.weight.dtype, stochastic_rounding=seed, inplace_ops=True)
|
||||
else:
|
||||
weight = weight.to(self.weight.dtype)
|
||||
if return_weight:
|
||||
return weight
|
||||
|
||||
assert inplace_update is False # TODO: eventually remove the inplace_update stuff
|
||||
self.weight = torch.nn.Parameter(weight, requires_grad=False)
|
||||
|
||||
def _apply(self, fn, recurse=True): # This is to get torch.compile + moving weights to another device working
|
||||
if recurse:
|
||||
for module in self.children():
|
||||
module._apply(fn)
|
||||
|
||||
for key, param in self._parameters.items():
|
||||
if param is None:
|
||||
continue
|
||||
self.register_parameter(key, torch.nn.Parameter(fn(param), requires_grad=False))
|
||||
for key, buf in self._buffers.items():
|
||||
if buf is not None:
|
||||
self._buffers[key] = fn(buf)
|
||||
return self
|
||||
|
||||
return MixedPrecisionOps
|
||||
|
||||
|
||||
def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_fp8=False, fp8_optimizations=False, model_config=None, inference_mode: Optional[bool] = None):
|
||||
if inference_mode is None:
|
||||
# todo: check a context here, since this isn't being used by any callers yet
|
||||
inference_mode = current_execution_context().inference_mode
|
||||
fp8_compute = model_management.supports_fp8_compute(load_device)
|
||||
if scaled_fp8 is not None:
|
||||
return scaled_fp8_ops(fp8_matrix_mult=fp8_compute and fp8_optimizations, scale_input=fp8_optimizations, override_dtype=scaled_fp8)
|
||||
fp8_compute = model_management.supports_fp8_compute(load_device) # TODO: if we support more ops this needs to be more granular
|
||||
|
||||
if model_config and hasattr(model_config, 'quant_config') and model_config.quant_config:
|
||||
logger.info("Using mixed precision operations")
|
||||
return mixed_precision_ops(model_config.quant_config, compute_dtype, full_precision_mm=not fp8_compute)
|
||||
|
||||
if (
|
||||
fp8_compute and
|
||||
|
||||
581
comfy/quant_ops.py
Normal file
581
comfy/quant_ops.py
Normal file
@ -0,0 +1,581 @@
|
||||
import torch
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
from typing import Tuple, Dict
|
||||
from .float import stochastic_rounding as stochastic_rounding_fn
|
||||
|
||||
_LAYOUT_REGISTRY = {}
|
||||
_GENERIC_UTILS = {}
|
||||
|
||||
|
||||
def register_layout_op(torch_op, layout_type):
|
||||
"""
|
||||
Decorator to register a layout-specific operation handler.
|
||||
Args:
|
||||
torch_op: PyTorch operation (e.g., torch.ops.aten.linear.default)
|
||||
layout_type: Layout class (e.g., TensorCoreFP8Layout)
|
||||
Example:
|
||||
@register_layout_op(torch.ops.aten.linear.default, TensorCoreFP8Layout)
|
||||
def fp8_linear(func, args, kwargs):
|
||||
# FP8-specific linear implementation
|
||||
...
|
||||
"""
|
||||
def decorator(handler_func):
|
||||
if torch_op not in _LAYOUT_REGISTRY:
|
||||
_LAYOUT_REGISTRY[torch_op] = {}
|
||||
_LAYOUT_REGISTRY[torch_op][layout_type] = handler_func
|
||||
return handler_func
|
||||
return decorator
|
||||
|
||||
|
||||
def register_generic_util(torch_op):
|
||||
"""
|
||||
Decorator to register a generic utility that works for all layouts.
|
||||
Args:
|
||||
torch_op: PyTorch operation (e.g., torch.ops.aten.detach.default)
|
||||
|
||||
Example:
|
||||
@register_generic_util(torch.ops.aten.detach.default)
|
||||
def generic_detach(func, args, kwargs):
|
||||
# Works for any layout
|
||||
...
|
||||
"""
|
||||
def decorator(handler_func):
|
||||
_GENERIC_UTILS[torch_op] = handler_func
|
||||
return handler_func
|
||||
return decorator
|
||||
|
||||
|
||||
def _get_layout_from_args(args):
|
||||
for arg in args:
|
||||
if isinstance(arg, QuantizedTensor):
|
||||
return arg._layout_type
|
||||
elif isinstance(arg, (list, tuple)):
|
||||
for item in arg:
|
||||
if isinstance(item, QuantizedTensor):
|
||||
return item._layout_type
|
||||
return None
|
||||
|
||||
|
||||
def _move_layout_params_to_device(params, device):
|
||||
new_params = {}
|
||||
for k, v in params.items():
|
||||
if isinstance(v, torch.Tensor):
|
||||
new_params[k] = v.to(device=device)
|
||||
else:
|
||||
new_params[k] = v
|
||||
return new_params
|
||||
|
||||
|
||||
def _copy_layout_params(params):
|
||||
new_params = {}
|
||||
for k, v in params.items():
|
||||
if isinstance(v, torch.Tensor):
|
||||
new_params[k] = v.clone()
|
||||
else:
|
||||
new_params[k] = v
|
||||
return new_params
|
||||
|
||||
def _copy_layout_params_inplace(src, dst, non_blocking=False):
|
||||
for k, v in src.items():
|
||||
if isinstance(v, torch.Tensor):
|
||||
dst[k].copy_(v, non_blocking=non_blocking)
|
||||
else:
|
||||
dst[k] = v
|
||||
|
||||
class QuantizedLayout:
|
||||
"""
|
||||
Base class for quantization layouts.
|
||||
|
||||
A layout encapsulates the format-specific logic for quantization/dequantization
|
||||
and provides a uniform interface for extracting raw tensors needed for computation.
|
||||
|
||||
New quantization formats should subclass this and implement the required methods.
|
||||
"""
|
||||
@classmethod
|
||||
def quantize(cls, tensor, **kwargs) -> Tuple[torch.Tensor, Dict]:
|
||||
raise NotImplementedError(f"{cls.__name__} must implement quantize()")
|
||||
|
||||
@staticmethod
|
||||
def dequantize(qdata, **layout_params) -> torch.Tensor:
|
||||
raise NotImplementedError("TensorLayout must implement dequantize()")
|
||||
|
||||
@classmethod
|
||||
def get_plain_tensors(cls, qtensor) -> torch.Tensor:
|
||||
raise NotImplementedError(f"{cls.__name__} must implement get_plain_tensors()")
|
||||
|
||||
|
||||
class QuantizedTensor(torch.Tensor):
|
||||
"""
|
||||
Universal quantized tensor that works with any layout.
|
||||
|
||||
This tensor subclass uses a pluggable layout system to support multiple
|
||||
quantization formats (FP8, INT4, INT8, etc.) without code duplication.
|
||||
|
||||
The layout_type determines format-specific behavior, while common operations
|
||||
(detach, clone, to) are handled generically.
|
||||
|
||||
Attributes:
|
||||
_qdata: The quantized tensor data
|
||||
_layout_type: Layout class (e.g., TensorCoreFP8Layout)
|
||||
_layout_params: Dict with layout-specific params (scale, zero_point, etc.)
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def __new__(cls, qdata, layout_type, layout_params):
|
||||
"""
|
||||
Create a quantized tensor.
|
||||
|
||||
Args:
|
||||
qdata: The quantized data tensor
|
||||
layout_type: Layout class (subclass of QuantizedLayout)
|
||||
layout_params: Dict with layout-specific parameters
|
||||
"""
|
||||
return torch.Tensor._make_wrapper_subclass(cls, qdata.shape, device=qdata.device, dtype=qdata.dtype, requires_grad=False)
|
||||
|
||||
def __init__(self, qdata, layout_type, layout_params):
|
||||
self._qdata = qdata
|
||||
self._layout_type = layout_type
|
||||
self._layout_params = layout_params
|
||||
|
||||
def __repr__(self):
|
||||
layout_name = self._layout_type
|
||||
param_str = ", ".join(f"{k}={v}" for k, v in list(self._layout_params.items())[:2])
|
||||
return f"QuantizedTensor(shape={self.shape}, layout={layout_name}, {param_str})"
|
||||
|
||||
@property
|
||||
def layout_type(self):
|
||||
return self._layout_type
|
||||
|
||||
def __tensor_flatten__(self):
|
||||
"""
|
||||
Tensor flattening protocol for proper device movement.
|
||||
"""
|
||||
inner_tensors = ["_qdata"]
|
||||
ctx = {
|
||||
"layout_type": self._layout_type,
|
||||
}
|
||||
|
||||
tensor_params = {}
|
||||
non_tensor_params = {}
|
||||
for k, v in self._layout_params.items():
|
||||
if isinstance(v, torch.Tensor):
|
||||
tensor_params[k] = v
|
||||
else:
|
||||
non_tensor_params[k] = v
|
||||
|
||||
ctx["tensor_param_keys"] = list(tensor_params.keys())
|
||||
ctx["non_tensor_params"] = non_tensor_params
|
||||
|
||||
for k, v in tensor_params.items():
|
||||
attr_name = f"_layout_param_{k}"
|
||||
object.__setattr__(self, attr_name, v)
|
||||
inner_tensors.append(attr_name)
|
||||
|
||||
return inner_tensors, ctx
|
||||
|
||||
@staticmethod
|
||||
def __tensor_unflatten__(inner_tensors, ctx, outer_size, outer_stride):
|
||||
"""
|
||||
Tensor unflattening protocol for proper device movement.
|
||||
Reconstructs the QuantizedTensor after device movement.
|
||||
"""
|
||||
layout_type = ctx["layout_type"]
|
||||
layout_params = dict(ctx["non_tensor_params"])
|
||||
|
||||
for key in ctx["tensor_param_keys"]:
|
||||
attr_name = f"_layout_param_{key}"
|
||||
layout_params[key] = inner_tensors[attr_name]
|
||||
|
||||
return QuantizedTensor(inner_tensors["_qdata"], layout_type, layout_params)
|
||||
|
||||
@classmethod
|
||||
def from_float(cls, tensor, layout_type, **quantize_kwargs) -> 'QuantizedTensor':
|
||||
qdata, layout_params = LAYOUTS[layout_type].quantize(tensor, **quantize_kwargs)
|
||||
return cls(qdata, layout_type, layout_params)
|
||||
|
||||
def dequantize(self) -> torch.Tensor:
|
||||
return LAYOUTS[self._layout_type].dequantize(self._qdata, **self._layout_params)
|
||||
|
||||
@classmethod
|
||||
def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
|
||||
kwargs = kwargs or {}
|
||||
|
||||
# Step 1: Check generic utilities first (detach, clone, to, etc.)
|
||||
if func in _GENERIC_UTILS:
|
||||
return _GENERIC_UTILS[func](func, args, kwargs)
|
||||
|
||||
# Step 2: Check layout-specific handlers (linear, matmul, etc.)
|
||||
layout_type = _get_layout_from_args(args)
|
||||
if layout_type and func in _LAYOUT_REGISTRY:
|
||||
handler = _LAYOUT_REGISTRY[func].get(layout_type)
|
||||
if handler:
|
||||
return handler(func, args, kwargs)
|
||||
|
||||
# Step 3: Fallback to dequantization
|
||||
if isinstance(args[0] if args else None, QuantizedTensor):
|
||||
logger.info(f"QuantizedTensor: Unhandled operation {func}, falling back to dequantization. kwargs={kwargs}")
|
||||
return cls._dequant_and_fallback(func, args, kwargs)
|
||||
|
||||
@classmethod
|
||||
def _dequant_and_fallback(cls, func, args, kwargs):
|
||||
def dequant_arg(arg):
|
||||
if isinstance(arg, QuantizedTensor):
|
||||
return arg.dequantize()
|
||||
elif isinstance(arg, (list, tuple)):
|
||||
return type(arg)(dequant_arg(a) for a in arg)
|
||||
return arg
|
||||
|
||||
new_args = dequant_arg(args)
|
||||
new_kwargs = dequant_arg(kwargs)
|
||||
return func(*new_args, **new_kwargs)
|
||||
|
||||
def data_ptr(self):
|
||||
return self._qdata.data_ptr()
|
||||
|
||||
def is_pinned(self):
|
||||
return self._qdata.is_pinned()
|
||||
|
||||
def is_contiguous(self, *arg, **kwargs):
|
||||
return self._qdata.is_contiguous(*arg, **kwargs)
|
||||
|
||||
def storage(self):
|
||||
return self._qdata.storage()
|
||||
|
||||
# ==============================================================================
|
||||
# Generic Utilities (Layout-Agnostic Operations)
|
||||
# ==============================================================================
|
||||
|
||||
def _create_transformed_qtensor(qt, transform_fn):
|
||||
new_data = transform_fn(qt._qdata)
|
||||
new_params = _copy_layout_params(qt._layout_params)
|
||||
return QuantizedTensor(new_data, qt._layout_type, new_params)
|
||||
|
||||
|
||||
def _handle_device_transfer(qt, target_device, target_dtype=None, target_layout=None, op_name="to"):
|
||||
if target_layout is not None and target_layout != torch.strided:
|
||||
logger.warning(
|
||||
f"QuantizedTensor: layout change requested to {target_layout}, "
|
||||
f"but not supported. Ignoring layout."
|
||||
)
|
||||
|
||||
# Handle device transfer
|
||||
current_device = qt._qdata.device
|
||||
if target_device is not None:
|
||||
# Normalize device for comparison
|
||||
if isinstance(target_device, str):
|
||||
target_device = torch.device(target_device)
|
||||
if isinstance(current_device, str):
|
||||
current_device = torch.device(current_device)
|
||||
|
||||
if target_device != current_device:
|
||||
logger.debug(f"QuantizedTensor.{op_name}: Moving from {current_device} to {target_device}")
|
||||
new_q_data = qt._qdata.to(device=target_device)
|
||||
new_params = _move_layout_params_to_device(qt._layout_params, target_device)
|
||||
if target_dtype is not None:
|
||||
new_params["orig_dtype"] = target_dtype
|
||||
new_qt = QuantizedTensor(new_q_data, qt._layout_type, new_params)
|
||||
logger.debug(f"QuantizedTensor.{op_name}: Created new tensor on {target_device}")
|
||||
return new_qt
|
||||
|
||||
logger.debug(f"QuantizedTensor.{op_name}: No device change needed, returning original")
|
||||
return qt
|
||||
|
||||
|
||||
@register_generic_util(torch.ops.aten.detach.default)
|
||||
def generic_detach(func, args, kwargs):
|
||||
"""Detach operation - creates a detached copy of the quantized tensor."""
|
||||
qt = args[0]
|
||||
if isinstance(qt, QuantizedTensor):
|
||||
return _create_transformed_qtensor(qt, lambda x: x.detach())
|
||||
return func(*args, **kwargs)
|
||||
|
||||
|
||||
@register_generic_util(torch.ops.aten.clone.default)
|
||||
def generic_clone(func, args, kwargs):
|
||||
"""Clone operation - creates a deep copy of the quantized tensor."""
|
||||
qt = args[0]
|
||||
if isinstance(qt, QuantizedTensor):
|
||||
return _create_transformed_qtensor(qt, lambda x: x.clone())
|
||||
return func(*args, **kwargs)
|
||||
|
||||
|
||||
@register_generic_util(torch.ops.aten._to_copy.default)
|
||||
def generic_to_copy(func, args, kwargs):
|
||||
"""Device/dtype transfer operation - handles .to(device) calls."""
|
||||
qt = args[0]
|
||||
if isinstance(qt, QuantizedTensor):
|
||||
return _handle_device_transfer(
|
||||
qt,
|
||||
target_device=kwargs.get('device', None),
|
||||
target_dtype=kwargs.get('dtype', None),
|
||||
op_name="_to_copy"
|
||||
)
|
||||
return func(*args, **kwargs)
|
||||
|
||||
|
||||
@register_generic_util(torch.ops.aten.to.dtype_layout)
|
||||
def generic_to_dtype_layout(func, args, kwargs):
|
||||
"""Handle .to(device) calls using the dtype_layout variant."""
|
||||
qt = args[0]
|
||||
if isinstance(qt, QuantizedTensor):
|
||||
return _handle_device_transfer(
|
||||
qt,
|
||||
target_device=kwargs.get('device', None),
|
||||
target_dtype=kwargs.get('dtype', None),
|
||||
target_layout=kwargs.get('layout', None),
|
||||
op_name="to"
|
||||
)
|
||||
return func(*args, **kwargs)
|
||||
|
||||
|
||||
@register_generic_util(torch.ops.aten.copy_.default)
|
||||
def generic_copy_(func, args, kwargs):
|
||||
qt_dest = args[0]
|
||||
src = args[1]
|
||||
non_blocking = args[2] if len(args) > 2 else False
|
||||
if isinstance(qt_dest, QuantizedTensor):
|
||||
if isinstance(src, QuantizedTensor):
|
||||
# Copy from another quantized tensor
|
||||
qt_dest._qdata.copy_(src._qdata, non_blocking=non_blocking)
|
||||
qt_dest._layout_type = src._layout_type
|
||||
orig_dtype = qt_dest._layout_params["orig_dtype"]
|
||||
_copy_layout_params_inplace(src._layout_params, qt_dest._layout_params, non_blocking=non_blocking)
|
||||
qt_dest._layout_params["orig_dtype"] = orig_dtype
|
||||
else:
|
||||
# Copy from regular tensor - just copy raw data
|
||||
qt_dest._qdata.copy_(src)
|
||||
return qt_dest
|
||||
return func(*args, **kwargs)
|
||||
|
||||
|
||||
@register_generic_util(torch.ops.aten.to.dtype)
|
||||
def generic_to_dtype(func, args, kwargs):
|
||||
"""Handle .to(dtype) calls - dtype conversion only."""
|
||||
src = args[0]
|
||||
if isinstance(src, QuantizedTensor):
|
||||
# For dtype-only conversion, just change the orig_dtype, no real cast is needed
|
||||
target_dtype = args[1] if len(args) > 1 else kwargs.get('dtype')
|
||||
src._layout_params["orig_dtype"] = target_dtype
|
||||
return src
|
||||
return func(*args, **kwargs)
|
||||
|
||||
|
||||
@register_generic_util(torch.ops.aten._has_compatible_shallow_copy_type.default)
|
||||
def generic_has_compatible_shallow_copy_type(func, args, kwargs):
|
||||
return True
|
||||
|
||||
|
||||
@register_generic_util(torch.ops.aten.empty_like.default)
|
||||
def generic_empty_like(func, args, kwargs):
|
||||
"""Empty_like operation - creates an empty tensor with the same quantized structure."""
|
||||
qt = args[0]
|
||||
if isinstance(qt, QuantizedTensor):
|
||||
# Create empty tensor with same shape and dtype as the quantized data
|
||||
hp_dtype = kwargs.pop('dtype', qt._layout_params["orig_dtype"])
|
||||
new_qdata = torch.empty_like(qt._qdata, **kwargs)
|
||||
|
||||
# Handle device transfer for layout params
|
||||
target_device = kwargs.get('device', new_qdata.device)
|
||||
new_params = _move_layout_params_to_device(qt._layout_params, target_device)
|
||||
|
||||
# Update orig_dtype if dtype is specified
|
||||
new_params['orig_dtype'] = hp_dtype
|
||||
|
||||
return QuantizedTensor(new_qdata, qt._layout_type, new_params)
|
||||
return func(*args, **kwargs)
|
||||
|
||||
# ==============================================================================
|
||||
# FP8 Layout + Operation Handlers
|
||||
# ==============================================================================
|
||||
class TensorCoreFP8Layout(QuantizedLayout):
|
||||
"""
|
||||
Storage format:
|
||||
- qdata: FP8 tensor (torch.float8_e4m3fn or torch.float8_e5m2)
|
||||
- scale: Scalar tensor (float32) for dequantization
|
||||
- orig_dtype: Original dtype before quantization (for casting back)
|
||||
"""
|
||||
@classmethod
|
||||
def quantize(cls, tensor, scale=None, dtype=torch.float8_e4m3fn, stochastic_rounding=0, inplace_ops=False):
|
||||
orig_dtype = tensor.dtype
|
||||
|
||||
if isinstance(scale, str) and scale == "recalculate":
|
||||
scale = torch.amax(tensor.abs()).to(dtype=torch.float32) / torch.finfo(dtype).max
|
||||
if tensor.dtype not in [torch.float32, torch.bfloat16]: # Prevent scale from being too small
|
||||
tensor_info = torch.finfo(tensor.dtype)
|
||||
scale = (1.0 / torch.clamp((1.0 / scale), min=tensor_info.min, max=tensor_info.max))
|
||||
|
||||
if scale is not None:
|
||||
if not isinstance(scale, torch.Tensor):
|
||||
scale = torch.tensor(scale)
|
||||
scale = scale.to(device=tensor.device, dtype=torch.float32)
|
||||
|
||||
if inplace_ops:
|
||||
tensor *= (1.0 / scale).to(tensor.dtype)
|
||||
else:
|
||||
tensor = tensor * (1.0 / scale).to(tensor.dtype)
|
||||
else:
|
||||
scale = torch.ones((), device=tensor.device, dtype=torch.float32)
|
||||
|
||||
if stochastic_rounding > 0:
|
||||
tensor = stochastic_rounding_fn(tensor, dtype=dtype, seed=stochastic_rounding)
|
||||
else:
|
||||
lp_amax = torch.finfo(dtype).max
|
||||
torch.clamp(tensor, min=-lp_amax, max=lp_amax, out=tensor)
|
||||
tensor = tensor.to(dtype, memory_format=torch.contiguous_format)
|
||||
|
||||
layout_params = {
|
||||
'scale': scale,
|
||||
'orig_dtype': orig_dtype
|
||||
}
|
||||
return tensor, layout_params
|
||||
|
||||
@staticmethod
|
||||
def dequantize(qdata, scale, orig_dtype, **kwargs):
|
||||
plain_tensor = torch.ops.aten._to_copy.default(qdata, dtype=orig_dtype)
|
||||
plain_tensor.mul_(scale)
|
||||
return plain_tensor
|
||||
|
||||
@classmethod
|
||||
def get_plain_tensors(cls, qtensor):
|
||||
return qtensor._qdata, qtensor._layout_params['scale']
|
||||
|
||||
QUANT_ALGOS = {
|
||||
"float8_e4m3fn": {
|
||||
"storage_t": torch.float8_e4m3fn,
|
||||
"parameters": {"weight_scale", "input_scale"},
|
||||
"comfy_tensor_layout": "TensorCoreFP8Layout",
|
||||
},
|
||||
}
|
||||
|
||||
LAYOUTS = {
|
||||
"TensorCoreFP8Layout": TensorCoreFP8Layout,
|
||||
}
|
||||
|
||||
|
||||
@register_layout_op(torch.ops.aten.linear.default, "TensorCoreFP8Layout")
|
||||
def fp8_linear(func, args, kwargs):
|
||||
input_tensor = args[0]
|
||||
weight = args[1]
|
||||
bias = args[2] if len(args) > 2 else None
|
||||
|
||||
if isinstance(input_tensor, QuantizedTensor) and isinstance(weight, QuantizedTensor):
|
||||
plain_input, scale_a = TensorCoreFP8Layout.get_plain_tensors(input_tensor)
|
||||
plain_weight, scale_b = TensorCoreFP8Layout.get_plain_tensors(weight)
|
||||
|
||||
out_dtype = kwargs.get("out_dtype")
|
||||
if out_dtype is None:
|
||||
out_dtype = input_tensor._layout_params['orig_dtype']
|
||||
|
||||
weight_t = plain_weight.t()
|
||||
|
||||
tensor_2d = False
|
||||
if len(plain_input.shape) == 2:
|
||||
tensor_2d = True
|
||||
plain_input = plain_input.unsqueeze(1)
|
||||
|
||||
input_shape = plain_input.shape
|
||||
if len(input_shape) != 3:
|
||||
return None
|
||||
|
||||
try:
|
||||
output = torch._scaled_mm(
|
||||
plain_input.reshape(-1, input_shape[2]).contiguous(),
|
||||
weight_t,
|
||||
bias=bias,
|
||||
scale_a=scale_a,
|
||||
scale_b=scale_b,
|
||||
out_dtype=out_dtype,
|
||||
)
|
||||
|
||||
if isinstance(output, tuple): # TODO: remove when we drop support for torch 2.4
|
||||
output = output[0]
|
||||
|
||||
if not tensor_2d:
|
||||
output = output.reshape((-1, input_shape[1], weight.shape[0]))
|
||||
|
||||
if output.dtype in [torch.float8_e4m3fn, torch.float8_e5m2]:
|
||||
output_scale = scale_a * scale_b
|
||||
output_params = {
|
||||
'scale': output_scale,
|
||||
'orig_dtype': input_tensor._layout_params['orig_dtype']
|
||||
}
|
||||
return QuantizedTensor(output, "TensorCoreFP8Layout", output_params)
|
||||
else:
|
||||
return output
|
||||
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"FP8 _scaled_mm failed, falling back to dequantization: {e}")
|
||||
|
||||
# Case 2: DQ Fallback
|
||||
if isinstance(weight, QuantizedTensor):
|
||||
weight = weight.dequantize()
|
||||
if isinstance(input_tensor, QuantizedTensor):
|
||||
input_tensor = input_tensor.dequantize()
|
||||
|
||||
return torch.nn.functional.linear(input_tensor, weight, bias)
|
||||
|
||||
def fp8_mm_(input_tensor, weight, bias=None, out_dtype=None):
|
||||
if out_dtype is None:
|
||||
out_dtype = input_tensor._layout_params['orig_dtype']
|
||||
|
||||
plain_input, scale_a = TensorCoreFP8Layout.get_plain_tensors(input_tensor)
|
||||
plain_weight, scale_b = TensorCoreFP8Layout.get_plain_tensors(weight)
|
||||
|
||||
output = torch._scaled_mm(
|
||||
plain_input.contiguous(),
|
||||
plain_weight,
|
||||
bias=bias,
|
||||
scale_a=scale_a,
|
||||
scale_b=scale_b,
|
||||
out_dtype=out_dtype,
|
||||
)
|
||||
|
||||
if isinstance(output, tuple): # TODO: remove when we drop support for torch 2.4
|
||||
output = output[0]
|
||||
return output
|
||||
|
||||
@register_layout_op(torch.ops.aten.addmm.default, "TensorCoreFP8Layout")
|
||||
def fp8_addmm(func, args, kwargs):
|
||||
input_tensor = args[1]
|
||||
weight = args[2]
|
||||
bias = args[0]
|
||||
|
||||
if isinstance(input_tensor, QuantizedTensor) and isinstance(weight, QuantizedTensor):
|
||||
return fp8_mm_(input_tensor, weight, bias=bias, out_dtype=kwargs.get("out_dtype", None))
|
||||
|
||||
a = list(args)
|
||||
if isinstance(args[0], QuantizedTensor):
|
||||
a[0] = args[0].dequantize()
|
||||
if isinstance(args[1], QuantizedTensor):
|
||||
a[1] = args[1].dequantize()
|
||||
if isinstance(args[2], QuantizedTensor):
|
||||
a[2] = args[2].dequantize()
|
||||
|
||||
return func(*a, **kwargs)
|
||||
|
||||
@register_layout_op(torch.ops.aten.mm.default, "TensorCoreFP8Layout")
|
||||
def fp8_mm(func, args, kwargs):
|
||||
input_tensor = args[0]
|
||||
weight = args[1]
|
||||
|
||||
if isinstance(input_tensor, QuantizedTensor) and isinstance(weight, QuantizedTensor):
|
||||
return fp8_mm_(input_tensor, weight, bias=None, out_dtype=kwargs.get("out_dtype", None))
|
||||
|
||||
a = list(args)
|
||||
if isinstance(args[0], QuantizedTensor):
|
||||
a[0] = args[0].dequantize()
|
||||
if isinstance(args[1], QuantizedTensor):
|
||||
a[1] = args[1].dequantize()
|
||||
return func(*a, **kwargs)
|
||||
|
||||
@register_layout_op(torch.ops.aten.view.default, "TensorCoreFP8Layout")
|
||||
@register_layout_op(torch.ops.aten.t.default, "TensorCoreFP8Layout")
|
||||
def fp8_func(func, args, kwargs):
|
||||
input_tensor = args[0]
|
||||
if isinstance(input_tensor, QuantizedTensor):
|
||||
plain_input, scale_a = TensorCoreFP8Layout.get_plain_tensors(input_tensor)
|
||||
ar = list(args)
|
||||
ar[0] = plain_input
|
||||
return QuantizedTensor(func(*ar, **kwargs), "TensorCoreFP8Layout", input_tensor._layout_params)
|
||||
return func(*args, **kwargs)
|
||||
@ -5,6 +5,21 @@ from . import model_management
|
||||
from . import samplers
|
||||
from . import utils
|
||||
from .component_model.deprecation import _deprecate_method
|
||||
from .nested_tensor import NestedTensor
|
||||
|
||||
|
||||
def prepare_noise_inner(latent_image, generator, noise_inds=None):
|
||||
if noise_inds is None:
|
||||
return torch.randn(latent_image.size(), dtype=latent_image.dtype, layout=latent_image.layout, generator=generator, device="cpu")
|
||||
|
||||
unique_inds, inverse = np.unique(noise_inds, return_inverse=True)
|
||||
noises = []
|
||||
for i in range(unique_inds[-1] + 1):
|
||||
noise = torch.randn([1] + list(latent_image.size())[1:], dtype=latent_image.dtype, layout=latent_image.layout, generator=generator, device="cpu")
|
||||
if i in unique_inds:
|
||||
noises.append(noise)
|
||||
noises = [noises[i] for i in inverse]
|
||||
return torch.cat(noises, axis=0)
|
||||
|
||||
|
||||
def prepare_noise(latent_image, seed, noise_inds=None):
|
||||
@ -13,36 +28,41 @@ def prepare_noise(latent_image, seed, noise_inds=None):
|
||||
optional arg skip can be used to skip and discard x number of noise generations for a given seed
|
||||
"""
|
||||
generator = torch.manual_seed(seed)
|
||||
if noise_inds is None:
|
||||
return torch.randn(latent_image.size(), dtype=latent_image.dtype, layout=latent_image.layout, generator=generator, device="cpu")
|
||||
|
||||
unique_inds, inverse = np.unique(noise_inds, return_inverse=True)
|
||||
noises = []
|
||||
for i in range(unique_inds[-1]+1):
|
||||
noise = torch.randn([1] + list(latent_image.size())[1:], dtype=latent_image.dtype, layout=latent_image.layout, generator=generator, device="cpu")
|
||||
if i in unique_inds:
|
||||
noises.append(noise)
|
||||
noises = [noises[i] for i in inverse]
|
||||
noises = torch.cat(noises, axis=0)
|
||||
if latent_image.is_nested:
|
||||
tensors = latent_image.unbind()
|
||||
noises = []
|
||||
for t in tensors:
|
||||
noises.append(prepare_noise_inner(t, generator, noise_inds))
|
||||
noises = NestedTensor(noises)
|
||||
else:
|
||||
noises = prepare_noise_inner(latent_image, generator, noise_inds)
|
||||
|
||||
return noises
|
||||
|
||||
|
||||
def fix_empty_latent_channels(model, latent_image):
|
||||
latent_format = model.get_model_object("latent_format") #Resize the empty latent image so it has the right number of channels
|
||||
if latent_image.is_nested:
|
||||
return latent_image
|
||||
latent_format = model.get_model_object("latent_format") # Resize the empty latent image so it has the right number of channels
|
||||
if latent_format.latent_channels != latent_image.shape[1] and torch.count_nonzero(latent_image) == 0:
|
||||
latent_image = utils.repeat_to_batch_size(latent_image, latent_format.latent_channels, dim=1)
|
||||
if latent_format.latent_dimensions == 3 and latent_image.ndim == 4:
|
||||
latent_image = latent_image.unsqueeze(2)
|
||||
return latent_image
|
||||
|
||||
|
||||
@_deprecate_method(version="0.3.2", message="Warning: comfy.sample.prepare_sampling isn't used anymore and can be removed")
|
||||
def prepare_sampling(model, noise_shape, positive, negative, noise_mask):
|
||||
pass
|
||||
return model, positive, negative, noise_mask, []
|
||||
|
||||
|
||||
@_deprecate_method(version="0.3.2", message="Warning: comfy.sample.cleanup_additional_models isn't used anymore and can be removed")
|
||||
def cleanup_additional_models(models):
|
||||
pass
|
||||
|
||||
|
||||
def sample(model, noise, steps, cfg, sampler_name, scheduler, positive, negative, latent_image, denoise=1.0, disable_noise=False, start_step=None, last_step=None, force_full_denoise=False, noise_mask=None, sigmas=None, callback=None, disable_pbar=False, seed=None):
|
||||
sampler = samplers.KSampler(model, steps=steps, device=model.load_device, sampler=sampler_name, scheduler=scheduler, denoise=denoise, model_options=model.model_options)
|
||||
|
||||
@ -50,6 +70,7 @@ def sample(model, noise, steps, cfg, sampler_name, scheduler, positive, negative
|
||||
samples = samples.to(model_management.intermediate_device())
|
||||
return samples
|
||||
|
||||
|
||||
def sample_custom(model, noise, cfg, sampler, sigmas, positive, negative, latent_image, noise_mask=None, callback=None, disable_pbar=False, seed=None):
|
||||
samples = samplers.sample(model, noise, positive, negative, cfg, model.load_device, sampler, sigmas, model_options=model.model_options, latent_image=latent_image, denoise_mask=noise_mask, callback=callback, disable_pbar=disable_pbar, seed=seed)
|
||||
samples = samples.to(model_management.intermediate_device())
|
||||
|
||||
@ -14,6 +14,7 @@ from . import model_management
|
||||
from . import model_patcher
|
||||
from . import patcher_extension
|
||||
from . import sampler_helpers
|
||||
from .nested_tensor import NestedTensor
|
||||
from .component_model.deprecation import _deprecate_method
|
||||
from .controlnet import ControlBase
|
||||
from .extra_samplers import uni_pc
|
||||
@ -24,7 +25,7 @@ from .model_management_types import ModelOptions
|
||||
from .model_patcher import ModelPatcher
|
||||
from .sampler_names import SCHEDULER_NAMES, SAMPLER_NAMES, KSAMPLER_NAMES
|
||||
from .context_windows import ContextHandlerABC
|
||||
from .utils import common_upscale
|
||||
from .utils import common_upscale, pack_latents, unpack_latents
|
||||
from .patcher_extension import WrapperExecutor, get_all_wrappers, WrappersMP
|
||||
from .component_model import module_property
|
||||
|
||||
@ -755,7 +756,7 @@ def encode_model_conds(model_function, conds, noise, device, prompt_type, **kwar
|
||||
|
||||
|
||||
class Sampler:
|
||||
def sample(self):
|
||||
def sample(self, *args, **kwargs):
|
||||
pass
|
||||
|
||||
def max_denoise(self, model_wrap, sigmas):
|
||||
@ -827,7 +828,7 @@ def ksampler(sampler_name, extra_options={}, inpaint_options={}):
|
||||
return KSAMPLER(sampler_function, extra_options, inpaint_options)
|
||||
|
||||
|
||||
def process_conds(model, noise, conds, device, latent_image=None, denoise_mask=None, seed=None):
|
||||
def process_conds(model, noise, conds, device, latent_image=None, denoise_mask=None, seed=None, latent_shapes=None):
|
||||
for k in conds:
|
||||
conds[k] = conds[k][:]
|
||||
resolve_areas_and_cond_masks_multidim(conds[k], noise.shape[2:], device)
|
||||
@ -837,7 +838,7 @@ def process_conds(model, noise, conds, device, latent_image=None, denoise_mask=N
|
||||
|
||||
if hasattr(model, 'extra_conds'):
|
||||
for k in conds:
|
||||
conds[k] = encode_model_conds(model.extra_conds, conds[k], noise, device, k, latent_image=latent_image, denoise_mask=denoise_mask, seed=seed)
|
||||
conds[k] = encode_model_conds(model.extra_conds, conds[k], noise, device, k, latent_image=latent_image, denoise_mask=denoise_mask, seed=seed, latent_shapes=latent_shapes)
|
||||
|
||||
# make sure each cond area has an opposite one with the same area
|
||||
for k in conds:
|
||||
@ -1008,11 +1009,11 @@ class CFGGuider:
|
||||
def predict_noise(self, x, timestep, model_options={}, seed=None):
|
||||
return sampling_function(self.inner_model, x, timestep, self.conds.get("negative", None), self.conds.get("positive", None), self.cfg, model_options=model_options, seed=seed)
|
||||
|
||||
def inner_sample(self, noise, latent_image, device, sampler: KSAMPLER, sigmas, denoise_mask, callback, disable_pbar, seed):
|
||||
def inner_sample(self, noise, latent_image, device, sampler: KSAMPLER, sigmas, denoise_mask, callback, disable_pbar, seed, latent_shapes=None):
|
||||
if latent_image is not None and torch.count_nonzero(latent_image) > 0: # Don't shift the empty latent image.
|
||||
latent_image = self.inner_model.process_latent_in(latent_image)
|
||||
|
||||
self.conds = process_conds(self.inner_model, noise, self.conds, device, latent_image, denoise_mask, seed)
|
||||
self.conds = process_conds(self.inner_model, noise, self.conds, device, latent_image, denoise_mask, seed, latent_shapes=latent_shapes)
|
||||
|
||||
extra_model_options = model_patcher.create_model_options_clone(self.model_options)
|
||||
extra_model_options.setdefault("transformer_options", {})["sample_sigmas"] = sigmas
|
||||
@ -1026,7 +1027,7 @@ class CFGGuider:
|
||||
samples = executor.execute(self, sigmas, extra_args, callback, noise, latent_image, denoise_mask, disable_pbar)
|
||||
return self.inner_model.process_latent_out(samples.to(torch.float32))
|
||||
|
||||
def outer_sample(self, noise, latent_image, sampler: KSAMPLER, sigmas, denoise_mask=None, callback=None, disable_pbar=False, seed=None):
|
||||
def outer_sample(self, noise, latent_image, sampler: KSAMPLER, sigmas, denoise_mask=None, callback=None, disable_pbar=False, seed=None, latent_shapes=None):
|
||||
self.inner_model, self.conds, self.loaded_models = sampler_helpers.prepare_sampling(self.model_patcher, noise.shape, self.conds, self.model_options)
|
||||
device = self.model_patcher.load_device
|
||||
|
||||
@ -1040,7 +1041,7 @@ class CFGGuider:
|
||||
|
||||
try:
|
||||
self.model_patcher.pre_run()
|
||||
output = self.inner_sample(noise, latent_image, device, sampler, sigmas, denoise_mask, callback, disable_pbar, seed)
|
||||
output = self.inner_sample(noise, latent_image, device, sampler, sigmas, denoise_mask, callback, disable_pbar, seed, latent_shapes=latent_shapes)
|
||||
finally:
|
||||
self.model_patcher.cleanup()
|
||||
|
||||
@ -1053,6 +1054,12 @@ class CFGGuider:
|
||||
if sigmas.shape[-1] == 0:
|
||||
return latent_image
|
||||
|
||||
if latent_image.is_nested:
|
||||
latent_image, latent_shapes = pack_latents(latent_image.unbind())
|
||||
noise, _ = pack_latents(noise.unbind())
|
||||
else:
|
||||
latent_shapes = [latent_image.shape]
|
||||
|
||||
self.conds = {}
|
||||
for k in self.original_conds:
|
||||
self.conds[k] = list(map(lambda a: a.copy(), self.original_conds[k]))
|
||||
@ -1072,7 +1079,7 @@ class CFGGuider:
|
||||
self,
|
||||
patcher_extension.get_all_wrappers(patcher_extension.WrappersMP.OUTER_SAMPLE, self.model_options, is_model_options=True)
|
||||
)
|
||||
output = executor.execute(noise, latent_image, sampler, sigmas, denoise_mask, callback, disable_pbar, seed)
|
||||
output = executor.execute(noise, latent_image, sampler, sigmas, denoise_mask, callback, disable_pbar, seed, latent_shapes=latent_shapes)
|
||||
except ValueError as exc_info:
|
||||
if "fp8e4nv" in str(exc_info):
|
||||
logger.error(f"Load the weights for model {self.model_patcher} as fp8_e5m2 to use floating point 8-bit inference with torch.compile and triton on Ampere architecture")
|
||||
@ -1084,6 +1091,9 @@ class CFGGuider:
|
||||
self.model_patcher.restore_hook_patches()
|
||||
|
||||
del self.conds
|
||||
|
||||
if len(latent_shapes) > 1:
|
||||
output = NestedTensor(unpack_latents(output, latent_shapes))
|
||||
return output
|
||||
|
||||
|
||||
|
||||
267
comfy/sd.py
267
comfy/sd.py
@ -6,12 +6,11 @@ import logging
|
||||
import math
|
||||
import os
|
||||
import os.path
|
||||
import torch
|
||||
import yaml
|
||||
from enum import Enum
|
||||
from typing import Any, Optional
|
||||
|
||||
from humanize import naturalsize
|
||||
import torch
|
||||
import yaml
|
||||
|
||||
from . import clip_vision
|
||||
from . import diffusers_convert
|
||||
@ -34,14 +33,15 @@ from .ldm.flux.redux import ReduxImageEncoder
|
||||
from .ldm.genmo.vae import model as genmo_model
|
||||
from .ldm.hunyuan3d.vae import ShapeVAE
|
||||
from .ldm.lightricks.vae import causal_video_autoencoder as lightricks
|
||||
from .ldm.models.autoencoder import AutoencoderKL, AutoencodingEngine
|
||||
from .ldm.mmaudio.vae.autoencoder import AudioAutoencoder
|
||||
from .ldm.models.autoencoder import AutoencoderKL, AutoencodingEngine
|
||||
from .ldm.wan import vae as wan_vae
|
||||
from .ldm.wan import vae2_2 as wan_vae2_2
|
||||
from .lora import load_lora, model_lora_keys_unet, model_lora_keys_clip
|
||||
from .lora_convert import convert_lora
|
||||
from .model_management import load_models_gpu
|
||||
from .model_management import load_models_gpu, module_size
|
||||
from .model_patcher import ModelPatcher
|
||||
from .pixel_space_convert import PixelspaceConversionVAE
|
||||
from .t2i_adapter import adapter
|
||||
from .taesd import taesd
|
||||
from .text_encoders import ace
|
||||
@ -50,21 +50,25 @@ from .text_encoders import cosmos
|
||||
from .text_encoders import flux
|
||||
from .text_encoders import genmo
|
||||
from .text_encoders import hidream
|
||||
from .text_encoders import hunyuan_video
|
||||
from .text_encoders import hunyuan_image
|
||||
from .text_encoders import hunyuan_video
|
||||
from .text_encoders import hydit
|
||||
from .text_encoders import kandinsky5
|
||||
from .text_encoders import long_clipl
|
||||
from .text_encoders import lt
|
||||
from .text_encoders import lumina2
|
||||
from .text_encoders import omnigen2
|
||||
from .text_encoders import ovis
|
||||
from .text_encoders import pixart_t5
|
||||
from .text_encoders import qwen_image
|
||||
from .text_encoders import sa_t5
|
||||
from .text_encoders import sd2_clip
|
||||
from .text_encoders import sd3_clip
|
||||
from .text_encoders import wan
|
||||
from .utils import ProgressBar, FileMetadata
|
||||
from .pixel_space_convert import PixelspaceConversionVAE
|
||||
from .text_encoders import z_image
|
||||
from .utils import ProgressBar, FileMetadata, state_dict_prefix_replace
|
||||
from .taesd.taehv import TAEHV
|
||||
from .latent_formats import HunyuanVideo15, HunyuanVideo
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -101,7 +105,7 @@ def load_lora_for_models(model, clip, lora, strength_model, strength_clip, lora_
|
||||
|
||||
|
||||
class CLIP:
|
||||
def __init__(self, target: CLIPTarget = None, embedding_directory=None, no_init=False, textmodel_json_config=None, tokenizer_data: dict | None = None, parameters=0, model_options={}):
|
||||
def __init__(self, target: CLIPTarget = None, embedding_directory=None, no_init=False, textmodel_json_config=None, tokenizer_data: dict | None = None, parameters=0, state_dict=[], model_options={}):
|
||||
if tokenizer_data is None:
|
||||
tokenizer_data = dict()
|
||||
if no_init:
|
||||
@ -134,9 +138,32 @@ class CLIP:
|
||||
|
||||
self.tokenizer: "sd1_clip.SD1Tokenizer" = tokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
|
||||
self.patcher = model_patcher.ModelPatcher(self.cond_stage_model, load_device=load_device, offload_device=offload_device)
|
||||
# Match torch.float32 hardcode upcast in TE implemention
|
||||
self.patcher.set_model_compute_dtype(torch.float32)
|
||||
self.patcher.hook_mode = EnumHookMode.MinVram
|
||||
self.patcher.is_clip = True
|
||||
self.apply_hooks_to_conds = None
|
||||
if len(state_dict) > 0:
|
||||
if isinstance(state_dict, list):
|
||||
for c in state_dict:
|
||||
m, u = self.load_sd(c)
|
||||
if len(m) > 0:
|
||||
logger.warning("clip missing: {}".format(m))
|
||||
|
||||
if len(u) > 0:
|
||||
logger.debug("clip unexpected: {}".format(u))
|
||||
else:
|
||||
m, u = self.load_sd(state_dict, full_model=True)
|
||||
if len(m) > 0:
|
||||
m_filter = list(filter(lambda a: ".logit_scale" not in a and ".transformer.text_projection.weight" not in a, m))
|
||||
if len(m_filter) > 0:
|
||||
logger.warning("clip missing: {}".format(m))
|
||||
else:
|
||||
logger.debug("clip missing: {}".format(m))
|
||||
|
||||
if len(u) > 0:
|
||||
logger.debug("clip unexpected {}:".format(u))
|
||||
|
||||
if params['device'] == load_device:
|
||||
model_management.load_models_gpu([self.patcher], force_full_load=True)
|
||||
self.layer_idx = None
|
||||
@ -156,6 +183,9 @@ class CLIP:
|
||||
n.apply_hooks_to_conds = self.apply_hooks_to_conds
|
||||
return n
|
||||
|
||||
def get_ram_usage(self):
|
||||
return self.patcher.get_ram_usage()
|
||||
|
||||
def add_patches(self, patches, strength_patch=1.0, strength_model=1.0):
|
||||
return self.patcher.add_patches(patches, strength_patch, strength_model)
|
||||
|
||||
@ -199,6 +229,7 @@ class CLIP:
|
||||
self.cond_stage_model.set_clip_options({"projected_pooled": False})
|
||||
|
||||
self.load_model()
|
||||
self.cond_stage_model.set_clip_options({"execution_device": self.patcher.load_device})
|
||||
all_hooks.reset()
|
||||
self.patcher.patch_hooks(None)
|
||||
if show_pbar:
|
||||
@ -246,6 +277,7 @@ class CLIP:
|
||||
self.cond_stage_model.set_clip_options({"projected_pooled": False})
|
||||
|
||||
self.load_model()
|
||||
self.cond_stage_model.set_clip_options({"execution_device": self.patcher.load_device})
|
||||
o = self.cond_stage_model.encode_token_weights(tokens)
|
||||
cond, pooled = o[:2]
|
||||
if return_dict:
|
||||
@ -310,6 +342,7 @@ class VAE:
|
||||
self.working_dtypes = [torch.bfloat16, torch.float32]
|
||||
self.disable_offload = False
|
||||
self.not_video = False
|
||||
self.size = None
|
||||
|
||||
self.downscale_index_formula = None
|
||||
self.upscale_index_formula = None
|
||||
@ -369,7 +402,7 @@ class VAE:
|
||||
|
||||
self.memory_used_encode = lambda shape, dtype: (700 * shape[2] * shape[3]) * model_management.dtype_size(dtype)
|
||||
self.memory_used_decode = lambda shape, dtype: (700 * shape[2] * shape[3] * 32 * 32) * model_management.dtype_size(dtype)
|
||||
elif sd['decoder.conv_in.weight'].shape[1] == 32:
|
||||
elif sd['decoder.conv_in.weight'].shape[1] == 32 and sd['decoder.conv_in.weight'].ndim == 5:
|
||||
ddconfig = {"block_out_channels": [128, 256, 512, 1024, 1024], "in_channels": 3, "out_channels": 3, "num_res_blocks": 2, "ffactor_spatial": 16, "ffactor_temporal": 4, "downsample_match_channel": True, "upsample_match_channel": True, "refiner_vae": False}
|
||||
self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.weight"].shape[1]
|
||||
self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32]
|
||||
@ -395,6 +428,17 @@ class VAE:
|
||||
self.upscale_ratio = 4
|
||||
|
||||
self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.weight"].shape[1]
|
||||
if 'decoder.post_quant_conv.weight' in sd:
|
||||
sd = state_dict_prefix_replace(sd, {"decoder.post_quant_conv.": "post_quant_conv.", "encoder.quant_conv.": "quant_conv."})
|
||||
|
||||
if 'bn.running_mean' in sd:
|
||||
ddconfig["batch_norm_latent"] = True
|
||||
self.downscale_ratio *= 2
|
||||
self.upscale_ratio *= 2
|
||||
self.latent_channels *= 4
|
||||
old_memory_used_decode = self.memory_used_decode
|
||||
self.memory_used_decode = lambda shape, dtype: old_memory_used_decode(shape, dtype) * 4.0
|
||||
|
||||
if 'post_quant_conv.weight' in sd:
|
||||
self.first_stage_model = AutoencoderKL(ddconfig=ddconfig, embed_dim=sd['post_quant_conv.weight'].shape[1])
|
||||
else:
|
||||
@ -454,20 +498,20 @@ class VAE:
|
||||
elif "decoder.conv_in.conv.weight" in sd and sd['decoder.conv_in.conv.weight'].shape[1] == 32:
|
||||
ddconfig = {"block_out_channels": [128, 256, 512, 1024, 1024], "in_channels": 3, "out_channels": 3, "num_res_blocks": 2, "ffactor_spatial": 16, "ffactor_temporal": 4, "downsample_match_channel": True, "upsample_match_channel": True}
|
||||
ddconfig['z_channels'] = sd["decoder.conv_in.conv.weight"].shape[1]
|
||||
self.latent_channels = 64
|
||||
self.latent_channels = 32
|
||||
self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 16, 16)
|
||||
self.upscale_index_formula = (4, 16, 16)
|
||||
self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 16, 16)
|
||||
self.downscale_index_formula = (4, 16, 16)
|
||||
self.latent_dim = 3
|
||||
self.not_video = True
|
||||
self.not_video = False
|
||||
self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32]
|
||||
self.first_stage_model = AutoencodingEngine(regularizer_config={'target': "comfy.ldm.models.autoencoder.EmptyRegularizer"},
|
||||
encoder_config={'target': "comfy.ldm.hunyuan_video.vae_refiner.Encoder", 'params': ddconfig},
|
||||
decoder_config={'target': "comfy.ldm.hunyuan_video.vae_refiner.Decoder", 'params': ddconfig})
|
||||
|
||||
self.memory_used_encode = lambda shape, dtype: (1400 * shape[-2] * shape[-1]) * model_management.dtype_size(dtype)
|
||||
self.memory_used_decode = lambda shape, dtype: (1400 * shape[-3] * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype)
|
||||
self.memory_used_encode = lambda shape, dtype: (1400 * 9 * shape[-2] * shape[-1]) * model_management.dtype_size(dtype)
|
||||
self.memory_used_decode = lambda shape, dtype: (3600 * 4 * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype)
|
||||
elif "decoder.conv_in.conv.weight" in sd:
|
||||
ddconfig = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}
|
||||
ddconfig["conv3d"] = True
|
||||
@ -479,8 +523,10 @@ class VAE:
|
||||
self.latent_dim = 3
|
||||
self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.conv.weight"].shape[1]
|
||||
self.first_stage_model = AutoencoderKL(ddconfig=ddconfig, embed_dim=sd['post_quant_conv.weight'].shape[1])
|
||||
self.memory_used_decode = lambda shape, dtype: (1500 * shape[2] * shape[3] * shape[4] * (4 * 8 * 8)) * model_management.dtype_size(dtype)
|
||||
self.memory_used_encode = lambda shape, dtype: (900 * max(shape[2], 2) * shape[3] * shape[4]) * model_management.dtype_size(dtype)
|
||||
# This is likely to significantly over-estimate with single image or low frame counts as the
|
||||
# implementation is able to completely skip caching. Rework if used as an image only VAE
|
||||
self.memory_used_decode = lambda shape, dtype: (2800 * min(8, ((shape[2] - 1) * 4) + 1) * shape[3] * shape[4] * (8 * 8)) * model_management.dtype_size(dtype)
|
||||
self.memory_used_encode = lambda shape, dtype: (1400 * min(9, shape[2]) * shape[3] * shape[4]) * model_management.dtype_size(dtype)
|
||||
self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32]
|
||||
elif "decoder.unpatcher3d.wavelets" in sd:
|
||||
self.upscale_ratio = (lambda a: max(0, a * 8 - 7), 8, 8)
|
||||
@ -509,25 +555,20 @@ class VAE:
|
||||
self.memory_used_encode = lambda shape, dtype: 3300 * shape[3] * shape[4] * model_management.dtype_size(dtype)
|
||||
self.memory_used_decode = lambda shape, dtype: 8000 * shape[3] * shape[4] * (16 * 16) * model_management.dtype_size(dtype)
|
||||
else: # Wan 2.1 VAE
|
||||
dim = sd["decoder.head.0.gamma"].shape[0]
|
||||
self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 8, 8)
|
||||
self.upscale_index_formula = (4, 8, 8)
|
||||
self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 8, 8)
|
||||
self.downscale_index_formula = (4, 8, 8)
|
||||
self.latent_dim = 3
|
||||
self.latent_channels = 16
|
||||
ddconfig = {"dim": 96, "z_dim": self.latent_channels, "dim_mult": [1, 2, 4, 4], "num_res_blocks": 2, "attn_scales": [], "temperal_downsample": [False, True, True], "dropout": 0.0}
|
||||
ddconfig = {"dim": dim, "z_dim": self.latent_channels, "dim_mult": [1, 2, 4, 4], "num_res_blocks": 2, "attn_scales": [], "temperal_downsample": [False, True, True], "dropout": 0.0}
|
||||
self.first_stage_model = wan_vae.WanVAE(**ddconfig)
|
||||
self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32]
|
||||
self.memory_used_encode = lambda shape, dtype: (1500 if shape[2] <= 4 else 6000) * shape[3] * shape[4] * model_management.dtype_size(dtype)
|
||||
self.memory_used_decode = lambda shape, dtype: (2200 if shape[2] <= 4 else 7000) * shape[3] * shape[4] * (8 * 8) * model_management.dtype_size(dtype)
|
||||
|
||||
|
||||
# todo: not sure how to detect qwen here
|
||||
wan_21_decode = 7000
|
||||
wan_21_encode = wan_21_decode - 1000
|
||||
qwen_vae_decode = int(wan_21_decode / 3)
|
||||
qwen_vae_encode = int(wan_21_encode / 3)
|
||||
encode_const = qwen_vae_encode if "qwen" in self.ckpt_name.lower() else wan_21_encode
|
||||
decode_const = qwen_vae_decode if "qwen" in self.ckpt_name.lower() else wan_21_decode
|
||||
self.memory_used_encode = lambda shape, dtype: encode_const * shape[3] * shape[4] * model_management.dtype_size(dtype)
|
||||
self.memory_used_decode = lambda shape, dtype: decode_const * shape[3] * shape[4] * (8 * 8) * model_management.dtype_size(dtype)
|
||||
# Hunyuan 3d v2 2.0 & 2.1
|
||||
elif "geo_decoder.cross_attn_decoder.ln_1.bias" in sd:
|
||||
|
||||
@ -593,6 +634,35 @@ class VAE:
|
||||
self.process_input = lambda audio: audio
|
||||
self.working_dtypes = [torch.float32]
|
||||
self.crop_input = False
|
||||
elif "decoder.22.bias" in sd: # taehv, taew and lighttae
|
||||
self.latent_channels = sd["decoder.1.weight"].shape[1]
|
||||
self.latent_dim = 3
|
||||
self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 16, 16)
|
||||
self.upscale_index_formula = (4, 16, 16)
|
||||
self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 16, 16)
|
||||
self.downscale_index_formula = (4, 16, 16)
|
||||
if self.latent_channels == 48: # Wan 2.2
|
||||
self.first_stage_model = TAEHV(latent_channels=self.latent_channels, latent_format=None) # taehv doesn't need scaling
|
||||
self.process_input = lambda image: (_ for _ in ()).throw(NotImplementedError("This light tae doesn't support encoding currently"))
|
||||
self.process_output = lambda image: image
|
||||
self.memory_used_decode = lambda shape, dtype: (1800 * (max(1, (shape[-3] ** 0.7 * 0.1)) * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype))
|
||||
elif self.latent_channels == 32 and sd["decoder.22.bias"].shape[0] == 12: # lighttae_hv15
|
||||
self.first_stage_model = TAEHV(latent_channels=self.latent_channels, latent_format=HunyuanVideo15)
|
||||
self.process_input = lambda image: (_ for _ in ()).throw(NotImplementedError("This light tae doesn't support encoding currently"))
|
||||
self.memory_used_decode = lambda shape, dtype: (1200 * (max(1, (shape[-3] ** 0.7 * 0.05)) * shape[-2] * shape[-1] * 32 * 32) * model_management.dtype_size(dtype))
|
||||
else:
|
||||
if sd["decoder.1.weight"].dtype == torch.float16: # taehv currently only available in float16, so assume it's not lighttaew2_1 as otherwise state dicts are identical
|
||||
latent_format = HunyuanVideo
|
||||
else:
|
||||
latent_format = None # lighttaew2_1 doesn't need scaling
|
||||
self.first_stage_model = TAEHV(latent_channels=self.latent_channels, latent_format=latent_format)
|
||||
self.process_input = self.process_output = lambda image: image
|
||||
self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 8, 8)
|
||||
self.upscale_index_formula = (4, 8, 8)
|
||||
self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 8, 8)
|
||||
self.downscale_index_formula = (4, 8, 8)
|
||||
self.memory_used_encode = lambda shape, dtype: (700 * (max(1, (shape[-3] ** 0.66 * 0.11)) * shape[-2] * shape[-1]) * model_management.dtype_size(dtype))
|
||||
self.memory_used_decode = lambda shape, dtype: (50 * (max(1, (shape[-3] ** 0.65 * 0.26)) * shape[-2] * shape[-1] * 32 * 32) * model_management.dtype_size(dtype))
|
||||
else:
|
||||
logger.warning("WARNING: No VAE weights detected, VAE not initalized.")
|
||||
self.first_stage_model = None
|
||||
@ -620,6 +690,8 @@ class VAE:
|
||||
|
||||
self.patcher = model_patcher.ModelPatcher(self.first_stage_model, load_device=self.device, offload_device=offload_device)
|
||||
logger.debug("VAE load device: {}, offload device: {}, dtype: {}".format(self.device, offload_device, self.vae_dtype))
|
||||
# todo: why is this being called here? for what side effects exactly?
|
||||
self.model_size()
|
||||
|
||||
def clone(self):
|
||||
n = VAE(no_init=True)
|
||||
@ -644,6 +716,15 @@ class VAE:
|
||||
n.patcher = self.patcher.clone()
|
||||
return n
|
||||
|
||||
def model_size(self):
|
||||
if self.size is not None:
|
||||
return self.size
|
||||
self.size = module_size(self.first_stage_model)
|
||||
return self.size
|
||||
|
||||
def get_ram_usage(self):
|
||||
return self.model_size()
|
||||
|
||||
def throw_exception_if_invalid(self):
|
||||
if self.first_stage_model is None:
|
||||
raise RuntimeError("ERROR: VAE is invalid: None\n\nIf the VAE is from a checkpoint loader node your checkpoint does not contain a valid VAE.")
|
||||
@ -704,6 +785,7 @@ class VAE:
|
||||
return samples
|
||||
|
||||
def encode_tiled_1d(self, samples, tile_x=256 * 2048, overlap=64 * 2048):
|
||||
extra_channel_size = 0
|
||||
if self.latent_dim == 1:
|
||||
encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).float()
|
||||
out_channels = self.latent_channels
|
||||
@ -730,6 +812,8 @@ class VAE:
|
||||
self.throw_exception_if_invalid()
|
||||
pixel_samples = None
|
||||
do_tile = False
|
||||
if self.latent_dim == 2 and samples_in.ndim == 5:
|
||||
samples_in = samples_in[:, :, 0]
|
||||
try:
|
||||
memory_used = self.memory_used_decode(samples_in.shape, self.vae_dtype)
|
||||
model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload)
|
||||
@ -960,6 +1044,10 @@ class CLIPType(Enum):
|
||||
OMNIGEN2 = 17
|
||||
QWEN_IMAGE = 18
|
||||
HUNYUAN_IMAGE = 19
|
||||
HUNYUAN_VIDEO_15 = 20
|
||||
OVIS = 21
|
||||
KANDINSKY5 = 22
|
||||
KANDINSKY5_IMAGE = 23
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
@ -975,8 +1063,11 @@ def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DI
|
||||
model_options = dict()
|
||||
clip_data = []
|
||||
for p in ckpt_paths:
|
||||
clip_data.append(utils.load_torch_file(p, safe_load=True))
|
||||
return load_text_encoder_state_dicts(clip_data, embedding_directory=embedding_directory, clip_type=clip_type, model_options=model_options, textmodel_json_config=textmodel_json_config)
|
||||
sd, metadata = utils.load_torch_file(p, safe_load=True, return_metadata=True)
|
||||
if model_options.get("custom_operations", None) is None:
|
||||
sd, metadata = utils.convert_old_quants(sd, model_prefix="", metadata=metadata)
|
||||
clip_data.append(sd)
|
||||
return load_text_encoder_state_dicts(clip_data, embedding_directory=embedding_directory, clip_type=clip_type, model_options=model_options)
|
||||
|
||||
|
||||
class TEModel(Enum):
|
||||
@ -993,6 +1084,10 @@ class TEModel(Enum):
|
||||
QWEN25_7B = 11
|
||||
BYT5_SMALL_GLYPH = 12
|
||||
GEMMA_3_4B = 13
|
||||
MISTRAL3_24B = 14
|
||||
MISTRAL3_24B_PRUNED_FLUX2 = 15
|
||||
QWEN3_4B = 16
|
||||
QWEN3_2B = 17
|
||||
|
||||
|
||||
def detect_te_model(sd):
|
||||
@ -1026,6 +1121,18 @@ def detect_te_model(sd):
|
||||
if weight.shape[0] == 512:
|
||||
return TEModel.QWEN25_7B
|
||||
if "model.layers.0.post_attention_layernorm.weight" in sd:
|
||||
weight = sd['model.layers.0.post_attention_layernorm.weight']
|
||||
if 'model.layers.0.self_attn.q_norm.weight' in sd:
|
||||
if weight.shape[0] == 2560:
|
||||
return TEModel.QWEN3_4B
|
||||
elif weight.shape[0] == 2048:
|
||||
return TEModel.QWEN3_2B
|
||||
if weight.shape[0] == 5120:
|
||||
if "model.layers.39.post_attention_layernorm.weight" in sd:
|
||||
return TEModel.MISTRAL3_24B
|
||||
else:
|
||||
return TEModel.MISTRAL3_24B_PRUNED_FLUX2
|
||||
|
||||
return TEModel.LLAMA3_8
|
||||
return None
|
||||
|
||||
@ -1077,7 +1184,7 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
|
||||
clip_target.clip = sd3_clip.sd3_clip(clip_l=False, clip_g=True, t5=False)
|
||||
clip_target.tokenizer = sd3_clip.SD3Tokenizer
|
||||
elif clip_type == CLIPType.HIDREAM:
|
||||
clip_target.clip = hidream.hidream_clip(clip_l=False, clip_g=True, t5=False, llama=False, dtype_t5=None, dtype_llama=None, t5xxl_scaled_fp8=None, llama_scaled_fp8=None)
|
||||
clip_target.clip = hidream.hidream_clip(clip_l=False, clip_g=True, t5=False, llama=False, dtype_t5=None, dtype_llama=None)
|
||||
clip_target.tokenizer = hidream.HiDreamTokenizer
|
||||
else:
|
||||
clip_target.clip = sdxl_clip.SDXLRefinerClipModel
|
||||
@ -1101,7 +1208,7 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
|
||||
tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None)
|
||||
elif clip_type == CLIPType.HIDREAM:
|
||||
clip_target.clip = hidream.hidream_clip(**t5xxl_detect(clip_data),
|
||||
clip_l=False, clip_g=False, t5=True, llama=False, dtype_llama=None, llama_scaled_fp8=None)
|
||||
clip_l=False, clip_g=False, t5=True, llama=False, dtype_llama=None)
|
||||
clip_target.tokenizer = hidream.HiDreamTokenizer
|
||||
else: # CLIPType.MOCHI
|
||||
clip_target.clip = genmo.mochi_te(**t5xxl_detect(clip_data))
|
||||
@ -1130,7 +1237,7 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
|
||||
tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None)
|
||||
elif te_model == TEModel.LLAMA3_8:
|
||||
clip_target.clip = hidream.hidream_clip(**llama_detect(clip_data),
|
||||
clip_l=False, clip_g=False, t5=False, llama=True, dtype_t5=None, t5xxl_scaled_fp8=None)
|
||||
clip_l=False, clip_g=False, t5=False, llama=True, dtype_t5=None)
|
||||
clip_target.tokenizer = hidream.HiDreamTokenizer
|
||||
elif te_model == TEModel.QWEN25_3B:
|
||||
clip_target.clip = omnigen2.te(**llama_detect(clip_data))
|
||||
@ -1142,13 +1249,23 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
|
||||
else:
|
||||
clip_target.clip = qwen_image.te(**llama_detect(clip_data))
|
||||
clip_target.tokenizer = qwen_image.QwenImageTokenizer
|
||||
elif te_model == TEModel.MISTRAL3_24B or te_model == TEModel.MISTRAL3_24B_PRUNED_FLUX2:
|
||||
clip_target.clip = flux.flux2_te(**llama_detect(clip_data), pruned=te_model == TEModel.MISTRAL3_24B_PRUNED_FLUX2)
|
||||
clip_target.tokenizer = flux.Flux2Tokenizer
|
||||
tokenizer_data["tekken_model"] = clip_data[0].get("tekken_model", None)
|
||||
elif te_model == TEModel.QWEN3_4B:
|
||||
clip_target.clip = z_image.te(**llama_detect(clip_data))
|
||||
clip_target.tokenizer = z_image.ZImageTokenizer
|
||||
elif te_model == TEModel.QWEN3_2B:
|
||||
clip_target.clip = ovis.te(**llama_detect(clip_data))
|
||||
clip_target.tokenizer = ovis.OvisTokenizer
|
||||
else:
|
||||
# clip_l
|
||||
if clip_type == CLIPType.SD3:
|
||||
clip_target.clip = sd3_clip.sd3_clip(clip_l=True, clip_g=False, t5=False)
|
||||
clip_target.tokenizer = sd3_clip.SD3Tokenizer
|
||||
elif clip_type == CLIPType.HIDREAM:
|
||||
clip_target.clip = hidream.hidream_clip(clip_l=True, clip_g=False, t5=False, llama=False, dtype_t5=None, dtype_llama=None, t5xxl_scaled_fp8=None, llama_scaled_fp8=None)
|
||||
clip_target.clip = hidream.hidream_clip(clip_l=True, clip_g=False, t5=False, llama=False, dtype_t5=None, dtype_llama=None)
|
||||
clip_target.tokenizer = hidream.HiDreamTokenizer
|
||||
else:
|
||||
clip_target.clip = sd1_clip.SD1ClipModel
|
||||
@ -1188,6 +1305,15 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
|
||||
elif clip_type == CLIPType.HUNYUAN_IMAGE:
|
||||
clip_target.clip = hunyuan_image.te(**llama_detect(clip_data))
|
||||
clip_target.tokenizer = hunyuan_image.HunyuanImageTokenizer
|
||||
elif clip_type == CLIPType.HUNYUAN_VIDEO_15:
|
||||
clip_target.clip = hunyuan_image.te(**llama_detect(clip_data))
|
||||
clip_target.tokenizer = hunyuan_video.HunyuanVideo15Tokenizer
|
||||
elif clip_type == CLIPType.KANDINSKY5:
|
||||
clip_target.clip = kandinsky5.te(**llama_detect(clip_data))
|
||||
clip_target.tokenizer = kandinsky5.Kandinsky5Tokenizer
|
||||
elif clip_type == CLIPType.KANDINSKY5_IMAGE:
|
||||
clip_target.clip = kandinsky5.te(**llama_detect(clip_data))
|
||||
clip_target.tokenizer = kandinsky5.Kandinsky5TokenizerImage
|
||||
else:
|
||||
clip_target.clip = sdxl_clip.SDXLClipModel
|
||||
clip_target.tokenizer = sdxl_clip.SDXLTokenizer
|
||||
@ -1203,14 +1329,7 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
|
||||
parameters += utils.calculate_parameters(c)
|
||||
tokenizer_data, model_options = long_clipl.model_options_long_clip(c, tokenizer_data, model_options)
|
||||
|
||||
clip = CLIP(clip_target, embedding_directory=embedding_directory, textmodel_json_config=textmodel_json_config, parameters=parameters, tokenizer_data=tokenizer_data, model_options=model_options)
|
||||
for c in clip_data:
|
||||
m, u = clip.load_sd(c)
|
||||
if len(m) > 0:
|
||||
logger.warning("clip missing: {}".format(m))
|
||||
|
||||
if len(u) > 0:
|
||||
logger.debug("clip unexpected: {}".format(u))
|
||||
clip = CLIP(clip_target, textmodel_json_config=textmodel_json_config, embedding_directory=embedding_directory, parameters=parameters, tokenizer_data=tokenizer_data, state_dict=clip_data, model_options=model_options)
|
||||
return clip
|
||||
|
||||
|
||||
@ -1285,6 +1404,10 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c
|
||||
weight_dtype = utils.weight_dtype(sd, diffusion_model_prefix)
|
||||
load_device = model_management.get_torch_device()
|
||||
|
||||
custom_operations = model_options.get("custom_operations", None)
|
||||
if custom_operations is None:
|
||||
sd, metadata = utils.convert_old_quants(sd, diffusion_model_prefix, metadata=metadata)
|
||||
|
||||
model_config = model_detection.model_config_from_unet(sd, diffusion_model_prefix, metadata=metadata)
|
||||
if model_config is None:
|
||||
logger.warning("Warning, This is not a checkpoint file, trying to load it as a diffusion model only.")
|
||||
@ -1294,16 +1417,21 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c
|
||||
return (diffusion_model, None, VAE(sd={}), None) # The VAE object is there to throw an exception if it's actually used'
|
||||
|
||||
unet_weight_dtype = list(model_config.supported_inference_dtypes)
|
||||
if model_config.scaled_fp8 is not None:
|
||||
if model_config.quant_config is not None:
|
||||
weight_dtype = None
|
||||
|
||||
model_config.custom_operations = model_options.get("custom_operations", None)
|
||||
if custom_operations is not None:
|
||||
model_config.custom_operations = custom_operations
|
||||
|
||||
unet_dtype = model_options.get("dtype", model_options.get("weight_dtype", None))
|
||||
|
||||
if unet_dtype is None:
|
||||
unet_dtype = model_management.unet_dtype(model_params=parameters, supported_dtypes=unet_weight_dtype, weight_dtype=weight_dtype)
|
||||
|
||||
manual_cast_dtype = model_management.unet_manual_cast(unet_dtype, load_device, model_config.supported_inference_dtypes)
|
||||
if model_config.quant_config is not None:
|
||||
manual_cast_dtype = model_management.unet_manual_cast(None, load_device, model_config.supported_inference_dtypes)
|
||||
else:
|
||||
manual_cast_dtype = model_management.unet_manual_cast(unet_dtype, load_device, model_config.supported_inference_dtypes)
|
||||
model_config.set_inference_dtype(unet_dtype, manual_cast_dtype)
|
||||
|
||||
if model_config.clip_vision_prefix is not None:
|
||||
@ -1321,22 +1449,33 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c
|
||||
vae = VAE(sd=vae_sd, metadata=metadata)
|
||||
|
||||
if output_clip:
|
||||
if te_model_options.get("custom_operations", None) is None:
|
||||
scaled_fp8_list = []
|
||||
for k in list(sd.keys()): # Convert scaled fp8 to mixed ops
|
||||
if k.endswith(".scaled_fp8"):
|
||||
scaled_fp8_list.append(k[:-len("scaled_fp8")])
|
||||
|
||||
if len(scaled_fp8_list) > 0:
|
||||
out_sd = {}
|
||||
for k in sd:
|
||||
skip = False
|
||||
for pref in scaled_fp8_list:
|
||||
skip = skip or k.startswith(pref)
|
||||
if not skip:
|
||||
out_sd[k] = sd[k]
|
||||
|
||||
for pref in scaled_fp8_list:
|
||||
quant_sd, qmetadata = utils.convert_old_quants(sd, pref, metadata={})
|
||||
for k in quant_sd:
|
||||
out_sd[k] = quant_sd[k]
|
||||
sd = out_sd
|
||||
|
||||
clip_target = model_config.clip_target(state_dict=sd)
|
||||
if clip_target is not None:
|
||||
clip_sd = model_config.process_clip_state_dict(sd)
|
||||
if len(clip_sd) > 0:
|
||||
parameters = utils.calculate_parameters(clip_sd)
|
||||
clip = CLIP(clip_target, embedding_directory=embedding_directory, tokenizer_data=clip_sd, parameters=parameters, model_options=te_model_options)
|
||||
m, u = clip.load_sd(clip_sd, full_model=True)
|
||||
if len(m) > 0:
|
||||
m_filter = list(filter(lambda a: ".logit_scale" not in a and ".transformer.text_projection.weight" not in a, m))
|
||||
if len(m_filter) > 0:
|
||||
logger.warning("clip missing: {}".format(m))
|
||||
else:
|
||||
logger.debug("clip missing: {}".format(m))
|
||||
|
||||
if len(u) > 0:
|
||||
logger.debug("clip unexpected {}:".format(u))
|
||||
clip = CLIP(clip_target, embedding_directory=embedding_directory, tokenizer_data=clip_sd, parameters=parameters, state_dict=clip_sd, model_options=te_model_options)
|
||||
else:
|
||||
logger.warning(f"no CLIP/text encoder weights in checkpoint {ckpt_path}, the text encoder model will not be loaded.")
|
||||
|
||||
@ -1385,6 +1524,9 @@ def load_diffusion_model_state_dict(sd, model_options: dict = None, ckpt_path: O
|
||||
if len(temp_sd) > 0:
|
||||
sd = temp_sd
|
||||
|
||||
custom_operations = model_options.get("custom_operations", None)
|
||||
if custom_operations is None:
|
||||
sd, metadata = utils.convert_old_quants(sd, "", metadata=metadata)
|
||||
parameters = utils.calculate_parameters(sd)
|
||||
weight_dtype = utils.weight_dtype(sd)
|
||||
load_device = model_management.get_torch_device()
|
||||
@ -1414,7 +1556,7 @@ def load_diffusion_model_state_dict(sd, model_options: dict = None, ckpt_path: O
|
||||
|
||||
offload_device = model_management.unet_offload_device()
|
||||
unet_weight_dtype = list(model_config.supported_inference_dtypes)
|
||||
if model_config.scaled_fp8 is not None:
|
||||
if model_config.quant_config is not None:
|
||||
weight_dtype = None
|
||||
|
||||
if dtype is None:
|
||||
@ -1422,9 +1564,15 @@ def load_diffusion_model_state_dict(sd, model_options: dict = None, ckpt_path: O
|
||||
else:
|
||||
unet_dtype = dtype
|
||||
|
||||
manual_cast_dtype = model_management.unet_manual_cast(unet_dtype, load_device, model_config.supported_inference_dtypes)
|
||||
if model_config.quant_config is not None:
|
||||
manual_cast_dtype = model_management.unet_manual_cast(None, load_device, model_config.supported_inference_dtypes)
|
||||
else:
|
||||
manual_cast_dtype = model_management.unet_manual_cast(unet_dtype, load_device, model_config.supported_inference_dtypes)
|
||||
model_config.set_inference_dtype(unet_dtype, manual_cast_dtype)
|
||||
model_config.custom_operations = model_options.get("custom_operations", model_config.custom_operations)
|
||||
|
||||
if custom_operations is not None:
|
||||
model_config.custom_operations = custom_operations
|
||||
|
||||
if model_options.get("fp8_optimizations", False):
|
||||
model_config.optimizations["fp8"] = True
|
||||
|
||||
@ -1437,7 +1585,7 @@ def load_diffusion_model_state_dict(sd, model_options: dict = None, ckpt_path: O
|
||||
return model_patcher.ModelPatcher(model, load_device=load_device, offload_device=offload_device, ckpt_name=os.path.basename(ckpt_path))
|
||||
|
||||
|
||||
def load_diffusion_model(unet_path, model_options: dict = None):
|
||||
def load_diffusion_model(unet_path, model_options=None):
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
sd, metadata = utils.load_torch_file(unet_path, return_metadata=True)
|
||||
@ -1468,6 +1616,9 @@ def save_checkpoint(output_path, model, clip=None, vae=None, clip_vision=None, m
|
||||
if vae is not None:
|
||||
vae_sd = vae.get_sd()
|
||||
|
||||
if metadata is None:
|
||||
metadata = {}
|
||||
|
||||
model_management.load_models_gpu(load_models, force_patch_weights=True)
|
||||
clip_vision_sd = clip_vision.get_sd() if clip_vision is not None else None
|
||||
sd = model.model.state_dict_for_saving(clip_sd, vae_sd, clip_vision_sd)
|
||||
|
||||
@ -12,7 +12,10 @@ from pathlib import Path
|
||||
from typing import Tuple, Sequence, TypeVar, Callable, Optional, Union
|
||||
|
||||
import torch
|
||||
from transformers import CLIPTokenizer, PreTrainedTokenizerBase
|
||||
try:
|
||||
from transformers import CLIPTokenizer, PreTrainedTokenizerBase
|
||||
except ImportError:
|
||||
from .transformers_compat import CLIPTokenizerFast as CLIPTokenizer, PreTrainedTokenizerBase
|
||||
|
||||
from . import clip_model
|
||||
from . import model_management
|
||||
@ -28,6 +31,7 @@ except ImportError:
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def gen_empty_tokens(special_tokens, length):
|
||||
start_token = special_tokens.get("start", None)
|
||||
end_token = special_tokens.get("end", None)
|
||||
@ -115,7 +119,6 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
|
||||
model_options = {}
|
||||
if special_tokens is None:
|
||||
special_tokens = {"start": 49406, "end": 49407, "pad": 49407}
|
||||
assert layer in self.LAYERS
|
||||
|
||||
if textmodel_json_config is None and "model_name" not in model_options:
|
||||
model_options = {**model_options, "model_name": "clip_l"}
|
||||
@ -132,19 +135,17 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
|
||||
config[k] = v
|
||||
|
||||
operations = model_options.get("custom_operations", None)
|
||||
scaled_fp8 = None
|
||||
quant_config = model_options.get("quantization_metadata", None)
|
||||
|
||||
if operations is None:
|
||||
scaled_fp8 = model_options.get("scaled_fp8", None)
|
||||
if scaled_fp8 is not None:
|
||||
operations = ops.scaled_fp8_ops(fp8_matrix_mult=False, override_dtype=scaled_fp8)
|
||||
if quant_config is not None:
|
||||
operations = ops.mixed_precision_ops(quant_config, dtype, full_precision_mm=True)
|
||||
logger.debug("Using MixedPrecisionOps for text encoder")
|
||||
else:
|
||||
operations = ops.manual_cast
|
||||
|
||||
self.operations = operations
|
||||
self.transformer = model_class(config, dtype, device, self.operations)
|
||||
if scaled_fp8 is not None:
|
||||
self.transformer.scaled_fp8 = torch.nn.Parameter(torch.tensor([], dtype=scaled_fp8))
|
||||
|
||||
self.num_layers = self.transformer.num_layers
|
||||
|
||||
@ -162,6 +163,7 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
|
||||
self.layer_norm_hidden_state = layer_norm_hidden_state
|
||||
self.return_projected_pooled = return_projected_pooled
|
||||
self.return_attention_masks = return_attention_masks
|
||||
self.execution_device = None
|
||||
|
||||
if layer == "hidden":
|
||||
assert layer_idx is not None
|
||||
@ -178,7 +180,8 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
|
||||
def set_clip_options(self, options):
|
||||
layer_idx = options.get("layer", self.layer_idx)
|
||||
self.return_projected_pooled = options.get("projected_pooled", self.return_projected_pooled)
|
||||
if self.layer == "all":
|
||||
self.execution_device = options.get("execution_device", self.execution_device)
|
||||
if isinstance(self.layer, list) or self.layer == "all":
|
||||
pass
|
||||
elif layer_idx is None or abs(layer_idx) > self.num_layers:
|
||||
self.layer = "last"
|
||||
@ -190,6 +193,7 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
|
||||
self.layer = self.options_default[0]
|
||||
self.layer_idx = self.options_default[1]
|
||||
self.return_projected_pooled = self.options_default[2]
|
||||
self.execution_device = None
|
||||
|
||||
def process_tokens(self, tokens, device):
|
||||
end_token = self.special_tokens.get("end", None)
|
||||
@ -273,14 +277,20 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
|
||||
return torch.cat(embeds_out), torch.tensor(attention_masks, device=device, dtype=torch.long), num_tokens, embeds_info
|
||||
|
||||
def forward(self, tokens):
|
||||
device = self.transformer.get_input_embeddings().weight.device
|
||||
if self.execution_device is None:
|
||||
device = self.transformer.get_input_embeddings().weight.device
|
||||
else:
|
||||
device = self.execution_device
|
||||
|
||||
embeds, attention_mask, num_tokens, embeds_info = self.process_tokens(tokens, device)
|
||||
|
||||
attention_mask_model = None
|
||||
if self.enable_attention_masks:
|
||||
attention_mask_model = attention_mask
|
||||
|
||||
if self.layer == "all":
|
||||
if isinstance(self.layer, list):
|
||||
intermediate_output = self.layer
|
||||
elif self.layer == "all":
|
||||
intermediate_output = "all"
|
||||
else:
|
||||
intermediate_output = self.layer_idx
|
||||
@ -478,6 +488,7 @@ def load_embed(embedding_name, embedding_directory, embedding_size, embed_key=No
|
||||
|
||||
valid_file = None
|
||||
for embed_dir in embedding_directory:
|
||||
# todo: improve this, so that it is more compatible between linux and windows
|
||||
embed_path = os.path.abspath(os.path.join(embed_dir, embedding_name))
|
||||
embed_dir = os.path.abspath(embed_dir)
|
||||
try:
|
||||
@ -546,7 +557,7 @@ SDTokenizerT = TypeVar('SDTokenizerT', bound='SDTokenizer')
|
||||
|
||||
|
||||
class SDTokenizer:
|
||||
def __init__(self, tokenizer_path: Optional[Union[torch.Tensor, bytes, bytearray, memoryview, str, Path, Traversable]] = None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=CLIPTokenizer, has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, tokenizer_data=None, tokenizer_args=None):
|
||||
def __init__(self, tokenizer_path: Optional[Union[torch.Tensor, bytes, bytearray, memoryview, str, Path, Traversable]] = None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=CLIPTokenizer, has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, pad_left=False, tokenizer_data=None, tokenizer_args=None):
|
||||
if tokenizer_data is None:
|
||||
tokenizer_data = dict()
|
||||
if tokenizer_args is None:
|
||||
@ -568,6 +579,7 @@ class SDTokenizer:
|
||||
self.min_length = tokenizer_data.get("{}_min_length".format(embedding_key), min_length)
|
||||
self.end_token = None
|
||||
self.min_padding = min_padding
|
||||
self.pad_left = pad_left
|
||||
|
||||
empty = self.tokenizer('')["input_ids"]
|
||||
self.tokenizer_adds_end_token = has_end_token
|
||||
@ -635,6 +647,13 @@ class SDTokenizer:
|
||||
return (embed, "{} {}".format(embedding_name[len(stripped):], leftover))
|
||||
return (embed, leftover)
|
||||
|
||||
def pad_tokens(self, tokens, amount):
|
||||
if self.pad_left:
|
||||
for i in range(amount):
|
||||
tokens.insert(0, (self.pad_token, 1.0, 0))
|
||||
else:
|
||||
tokens.extend([(self.pad_token, 1.0, 0)] * amount)
|
||||
|
||||
def tokenize_with_weights(self, text: str, return_word_ids=False, tokenizer_options={}, **kwargs):
|
||||
'''
|
||||
Takes a prompt and converts it to a list of (token, weight, word id) elements.
|
||||
@ -720,7 +739,7 @@ class SDTokenizer:
|
||||
if self.end_token is not None:
|
||||
batch.append((self.end_token, 1.0, 0))
|
||||
if self.pad_to_max_length:
|
||||
batch.extend([(self.pad_token, 1.0, 0)] * (remaining_length))
|
||||
self.pad_tokens(batch, remaining_length)
|
||||
# start new batch
|
||||
batch = []
|
||||
if self.start_token is not None:
|
||||
@ -734,11 +753,11 @@ class SDTokenizer:
|
||||
if self.end_token is not None:
|
||||
batch.append((self.end_token, 1.0, 0))
|
||||
if min_padding is not None:
|
||||
batch.extend([(self.pad_token, 1.0, 0)] * min_padding)
|
||||
self.pad_tokens(batch, min_padding)
|
||||
if self.pad_to_max_length and len(batch) < self.max_length:
|
||||
batch.extend([(self.pad_token, 1.0, 0)] * (self.max_length - len(batch)))
|
||||
self.pad_tokens(batch, self.max_length - len(batch))
|
||||
if min_length is not None and len(batch) < min_length:
|
||||
batch.extend([(self.pad_token, 1.0, 0)] * (min_length - len(batch)))
|
||||
self.pad_tokens(batch, min_length - len(batch))
|
||||
|
||||
if not return_word_ids:
|
||||
batched_tokens = [[(t, w) for t, w, _ in x] for x in batched_tokens]
|
||||
@ -756,7 +775,7 @@ SD1TokenizerT = TypeVar("SD1TokenizerT", bound="SD1Tokenizer")
|
||||
|
||||
|
||||
class SD1Tokenizer:
|
||||
def __init__(self, embedding_directory=None, tokenizer_data: dict=None, clip_name="l", tokenizer=SDTokenizer, name=None):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data: dict = None, clip_name="l", tokenizer=SDTokenizer, name=None):
|
||||
if tokenizer_data is None:
|
||||
tokenizer_data = {}
|
||||
if name is not None:
|
||||
@ -792,11 +811,12 @@ class SD1Tokenizer:
|
||||
def state_dict(self):
|
||||
return getattr(self, self.clip).state_dict()
|
||||
|
||||
|
||||
class SD1CheckpointClipModel(SDClipModel):
|
||||
def __init__(self, device="cpu", dtype=None, model_options=None, textmodel_json_config=None):
|
||||
super().__init__(device=device, return_projected_pooled=False, dtype=dtype, model_options=model_options, textmodel_json_config=textmodel_json_config)
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
super().__init__(device=device, return_projected_pooled=False, dtype=dtype, model_options=model_options, textmodel_json_config=textmodel_json_config)
|
||||
|
||||
|
||||
class SD1ClipModel(torch.nn.Module):
|
||||
|
||||
@ -27,6 +27,8 @@ from .text_encoders import sd3_clip
|
||||
from .text_encoders import wan
|
||||
from .text_encoders import qwen_image
|
||||
from .text_encoders import hunyuan_image
|
||||
from .text_encoders import kandinsky5
|
||||
from .text_encoders import z_image
|
||||
|
||||
|
||||
class SD15(supported_models_base.BASE):
|
||||
@ -573,7 +575,7 @@ class SD3(supported_models_base.BASE):
|
||||
unet_extra_config = {}
|
||||
latent_format = latent_formats.SD3
|
||||
|
||||
memory_usage_factor = 1.2
|
||||
memory_usage_factor = 1.6
|
||||
|
||||
text_encoder_key_prefix = ["text_encoders."]
|
||||
|
||||
@ -798,6 +800,40 @@ class FluxSchnell(Flux):
|
||||
return out
|
||||
|
||||
|
||||
class Flux2(Flux):
|
||||
unet_config = {
|
||||
"image_model": "flux2",
|
||||
}
|
||||
|
||||
sampling_settings = {
|
||||
"shift": 2.02,
|
||||
}
|
||||
|
||||
unet_extra_config = {}
|
||||
latent_format = latent_formats.Flux2
|
||||
|
||||
supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
|
||||
|
||||
vae_key_prefix = ["vae."]
|
||||
text_encoder_key_prefix = ["text_encoders."]
|
||||
|
||||
def __init__(self, unet_config):
|
||||
super().__init__(unet_config)
|
||||
self.memory_usage_factor = self.memory_usage_factor * (2.0 * 2.0) * 2.36
|
||||
|
||||
def get_model(self, state_dict, prefix="", device=None):
|
||||
out = model_base.Flux2(self, device=device)
|
||||
return out
|
||||
|
||||
def clip_target(self, state_dict=None):
|
||||
if state_dict is None:
|
||||
state_dict = {}
|
||||
return None # TODO
|
||||
# pref = self.text_encoder_key_prefix[0]
|
||||
# t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref))
|
||||
# return supported_models_base.ClipTarget(comfy.text_encoders.flux.FluxTokenizer, comfy.text_encoders.flux.flux_clip(**t5_detect))
|
||||
|
||||
|
||||
class GenmoMochi(supported_models_base.BASE):
|
||||
unet_config = {
|
||||
"image_model": "mochi_preview",
|
||||
@ -1004,7 +1040,7 @@ class CosmosT2IPredict2(supported_models_base.BASE):
|
||||
|
||||
def __init__(self, unet_config):
|
||||
super().__init__(unet_config)
|
||||
self.memory_usage_factor = (unet_config.get("model_channels", 2048) / 2048) * 0.9
|
||||
self.memory_usage_factor = (unet_config.get("model_channels", 2048) / 2048) * 0.95
|
||||
|
||||
def get_model(self, state_dict, prefix="", device=None):
|
||||
out = model_base.CosmosPredict2(self, device=device)
|
||||
@ -1039,7 +1075,7 @@ class Lumina2(supported_models_base.BASE):
|
||||
"shift": 6.0,
|
||||
}
|
||||
|
||||
memory_usage_factor = 1.2
|
||||
memory_usage_factor = 1.4
|
||||
|
||||
unet_extra_config = {}
|
||||
latent_format = latent_formats.Flux
|
||||
@ -1061,6 +1097,27 @@ class Lumina2(supported_models_base.BASE):
|
||||
return supported_models_base.ClipTarget(lumina2.LuminaTokenizer, lumina2.te(**hunyuan_detect))
|
||||
|
||||
|
||||
class ZImage(Lumina2):
|
||||
unet_config = {
|
||||
"image_model": "lumina2",
|
||||
"dim": 3840,
|
||||
}
|
||||
|
||||
sampling_settings = {
|
||||
"multiplier": 1.0,
|
||||
"shift": 3.0,
|
||||
}
|
||||
|
||||
memory_usage_factor = 2.0
|
||||
|
||||
supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
|
||||
|
||||
def clip_target(self, state_dict={}):
|
||||
pref = self.text_encoder_key_prefix[0]
|
||||
hunyuan_detect = hunyuan_video.llama_detect(state_dict, "{}qwen3_4b.transformer.".format(pref))
|
||||
return supported_models_base.ClipTarget(z_image.ZImageTokenizer, z_image.te(**hunyuan_detect))
|
||||
|
||||
|
||||
class WAN21_T2V(supported_models_base.BASE):
|
||||
unet_config = {
|
||||
"image_model": "wan2.1",
|
||||
@ -1337,7 +1394,7 @@ class ChromaRadiance(Chroma):
|
||||
latent_format = latent_formats.ChromaRadiance
|
||||
|
||||
# Pixel-space model, no spatial compression for model input.
|
||||
memory_usage_factor = 0.038
|
||||
memory_usage_factor = 0.044
|
||||
|
||||
def get_model(self, state_dict, prefix="", device=None):
|
||||
return model_base.ChromaRadiance(self, device=device)
|
||||
@ -1384,7 +1441,7 @@ class Omnigen2(supported_models_base.BASE):
|
||||
"shift": 2.6,
|
||||
}
|
||||
|
||||
memory_usage_factor = 1.65 # TODO
|
||||
memory_usage_factor = 1.95 # TODO
|
||||
|
||||
unet_extra_config = {}
|
||||
latent_format = latent_formats.Flux
|
||||
@ -1451,7 +1508,7 @@ class HunyuanImage21(HunyuanVideo):
|
||||
|
||||
latent_format = latent_formats.HunyuanImage21
|
||||
|
||||
memory_usage_factor = 7.7
|
||||
memory_usage_factor = 8.7
|
||||
|
||||
supported_inference_dtypes = [torch.bfloat16, torch.float32]
|
||||
|
||||
@ -1483,6 +1540,108 @@ class HunyuanImage21Refiner(HunyuanVideo):
|
||||
return out
|
||||
|
||||
|
||||
models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage]
|
||||
class HunyuanVideo15(HunyuanVideo):
|
||||
unet_config = {
|
||||
"image_model": "hunyuan_video",
|
||||
"vision_in_dim": 1152,
|
||||
}
|
||||
|
||||
sampling_settings = {
|
||||
"shift": 7.0,
|
||||
}
|
||||
memory_usage_factor = 4.0 # TODO
|
||||
supported_inference_dtypes = [torch.float16, torch.bfloat16, torch.float32]
|
||||
|
||||
latent_format = latent_formats.HunyuanVideo15
|
||||
|
||||
def get_model(self, state_dict, prefix="", device=None):
|
||||
out = model_base.HunyuanVideo15(self, device=device)
|
||||
return out
|
||||
|
||||
def clip_target(self, state_dict={}):
|
||||
pref = self.text_encoder_key_prefix[0]
|
||||
hunyuan_detect = hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
|
||||
return supported_models_base.ClipTarget(hunyuan_video.HunyuanVideo15Tokenizer, hunyuan_image.te(**hunyuan_detect))
|
||||
|
||||
|
||||
class HunyuanVideo15_SR_Distilled(HunyuanVideo):
|
||||
unet_config = {
|
||||
"image_model": "hunyuan_video",
|
||||
"vision_in_dim": 1152,
|
||||
"in_channels": 98,
|
||||
}
|
||||
|
||||
sampling_settings = {
|
||||
"shift": 2.0,
|
||||
}
|
||||
memory_usage_factor = 4.0 # TODO
|
||||
supported_inference_dtypes = [torch.float16, torch.bfloat16, torch.float32]
|
||||
|
||||
latent_format = latent_formats.HunyuanVideo15
|
||||
|
||||
def get_model(self, state_dict, prefix="", device=None):
|
||||
out = model_base.HunyuanVideo15_SR_Distilled(self, device=device)
|
||||
return out
|
||||
|
||||
def clip_target(self, state_dict={}):
|
||||
pref = self.text_encoder_key_prefix[0]
|
||||
hunyuan_detect = hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
|
||||
return supported_models_base.ClipTarget(hunyuan_video.HunyuanVideo15Tokenizer, hunyuan_image.te(**hunyuan_detect))
|
||||
|
||||
|
||||
class Kandinsky5(supported_models_base.BASE):
|
||||
unet_config = {
|
||||
"image_model": "kandinsky5",
|
||||
}
|
||||
|
||||
sampling_settings = {
|
||||
"shift": 10.0,
|
||||
}
|
||||
|
||||
unet_extra_config = {}
|
||||
latent_format = latent_formats.HunyuanVideo
|
||||
|
||||
memory_usage_factor = 1.25 # TODO
|
||||
|
||||
supported_inference_dtypes = [torch.bfloat16, torch.float32]
|
||||
|
||||
vae_key_prefix = ["vae."]
|
||||
text_encoder_key_prefix = ["text_encoders."]
|
||||
|
||||
def get_model(self, state_dict, prefix="", device=None):
|
||||
out = model_base.Kandinsky5(self, device=device)
|
||||
return out
|
||||
|
||||
def clip_target(self, state_dict={}):
|
||||
pref = self.text_encoder_key_prefix[0]
|
||||
hunyuan_detect = hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
|
||||
return supported_models_base.ClipTarget(kandinsky5.Kandinsky5Tokenizer, kandinsky5.te(**hunyuan_detect))
|
||||
|
||||
|
||||
class Kandinsky5Image(Kandinsky5):
|
||||
unet_config = {
|
||||
"image_model": "kandinsky5",
|
||||
"model_dim": 2560,
|
||||
"visual_embed_dim": 64,
|
||||
}
|
||||
|
||||
sampling_settings = {
|
||||
"shift": 3.0,
|
||||
}
|
||||
|
||||
latent_format = latent_formats.Flux
|
||||
memory_usage_factor = 1.25 # TODO
|
||||
|
||||
def get_model(self, state_dict, prefix="", device=None):
|
||||
out = model_base.Kandinsky5Image(self, device=device)
|
||||
return out
|
||||
|
||||
def clip_target(self, state_dict={}):
|
||||
pref = self.text_encoder_key_prefix[0]
|
||||
hunyuan_detect = hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
|
||||
return supported_models_base.ClipTarget(kandinsky5.Kandinsky5TokenizerImage, kandinsky5.te(**hunyuan_detect))
|
||||
|
||||
|
||||
models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5]
|
||||
|
||||
models += [SVD_img2vid]
|
||||
|
||||
@ -18,10 +18,11 @@
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
from . import model_base
|
||||
from . import utils
|
||||
from . import latent_formats
|
||||
from .ops import Operations
|
||||
|
||||
|
||||
class ClipTarget:
|
||||
@ -30,6 +31,7 @@ class ClipTarget:
|
||||
self.tokenizer = tokenizer
|
||||
self.params = {}
|
||||
|
||||
|
||||
class BASE:
|
||||
unet_config = {}
|
||||
unet_extra_config = {
|
||||
@ -51,8 +53,8 @@ class BASE:
|
||||
memory_usage_factor = 2.0
|
||||
|
||||
manual_cast_dtype: Optional[torch.dtype] = None
|
||||
custom_operations: Optional[Operations] = None
|
||||
scaled_fp8: Optional[torch.dtype] = None
|
||||
custom_operations: Optional[torch.dtype] = None
|
||||
quant_config = None # quantization configuration for mixed precision
|
||||
optimizations = {"fp8": False}
|
||||
|
||||
@classmethod
|
||||
@ -120,3 +122,7 @@ class BASE:
|
||||
def set_inference_dtype(self, dtype, manual_cast_dtype):
|
||||
self.unet_config['dtype'] = dtype
|
||||
self.manual_cast_dtype = manual_cast_dtype
|
||||
|
||||
def __getattr__(self, name):
|
||||
logger.warning("\nWARNING, you accessed {} from the model config object which doesn't exist. Please fix your code.\n".format(name))
|
||||
return None
|
||||
|
||||
171
comfy/taesd/taehv.py
Normal file
171
comfy/taesd/taehv.py
Normal file
@ -0,0 +1,171 @@
|
||||
# Tiny AutoEncoder for HunyuanVideo and WanVideo https://github.com/madebyollin/taehv
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from tqdm.auto import tqdm
|
||||
from collections import namedtuple, deque
|
||||
|
||||
import comfy.ops
|
||||
operations=comfy.ops.disable_weight_init
|
||||
|
||||
DecoderResult = namedtuple("DecoderResult", ("frame", "memory"))
|
||||
TWorkItem = namedtuple("TWorkItem", ("input_tensor", "block_index"))
|
||||
|
||||
def conv(n_in, n_out, **kwargs):
|
||||
return operations.Conv2d(n_in, n_out, 3, padding=1, **kwargs)
|
||||
|
||||
class Clamp(nn.Module):
|
||||
def forward(self, x):
|
||||
return torch.tanh(x / 3) * 3
|
||||
|
||||
class MemBlock(nn.Module):
|
||||
def __init__(self, n_in, n_out, act_func):
|
||||
super().__init__()
|
||||
self.conv = nn.Sequential(conv(n_in * 2, n_out), act_func, conv(n_out, n_out), act_func, conv(n_out, n_out))
|
||||
self.skip = operations.Conv2d(n_in, n_out, 1, bias=False) if n_in != n_out else nn.Identity()
|
||||
self.act = act_func
|
||||
def forward(self, x, past):
|
||||
return self.act(self.conv(torch.cat([x, past], 1)) + self.skip(x))
|
||||
|
||||
class TPool(nn.Module):
|
||||
def __init__(self, n_f, stride):
|
||||
super().__init__()
|
||||
self.stride = stride
|
||||
self.conv = operations.Conv2d(n_f*stride,n_f, 1, bias=False)
|
||||
def forward(self, x):
|
||||
_NT, C, H, W = x.shape
|
||||
return self.conv(x.reshape(-1, self.stride * C, H, W))
|
||||
|
||||
class TGrow(nn.Module):
|
||||
def __init__(self, n_f, stride):
|
||||
super().__init__()
|
||||
self.stride = stride
|
||||
self.conv = operations.Conv2d(n_f, n_f*stride, 1, bias=False)
|
||||
def forward(self, x):
|
||||
_NT, C, H, W = x.shape
|
||||
x = self.conv(x)
|
||||
return x.reshape(-1, C, H, W)
|
||||
|
||||
def apply_model_with_memblocks(model, x, parallel, show_progress_bar):
|
||||
|
||||
B, T, C, H, W = x.shape
|
||||
if parallel:
|
||||
x = x.reshape(B*T, C, H, W)
|
||||
# parallel over input timesteps, iterate over blocks
|
||||
for b in tqdm(model, disable=not show_progress_bar):
|
||||
if isinstance(b, MemBlock):
|
||||
BT, C, H, W = x.shape
|
||||
T = BT // B
|
||||
_x = x.reshape(B, T, C, H, W)
|
||||
mem = F.pad(_x, (0,0,0,0,0,0,1,0), value=0)[:,:T].reshape(x.shape)
|
||||
x = b(x, mem)
|
||||
else:
|
||||
x = b(x)
|
||||
BT, C, H, W = x.shape
|
||||
T = BT // B
|
||||
x = x.view(B, T, C, H, W)
|
||||
else:
|
||||
out = []
|
||||
work_queue = deque([TWorkItem(xt, 0) for t, xt in enumerate(x.reshape(B, T * C, H, W).chunk(T, dim=1))])
|
||||
progress_bar = tqdm(range(T), disable=not show_progress_bar)
|
||||
mem = [None] * len(model)
|
||||
while work_queue:
|
||||
xt, i = work_queue.popleft()
|
||||
if i == 0:
|
||||
progress_bar.update(1)
|
||||
if i == len(model):
|
||||
out.append(xt)
|
||||
del xt
|
||||
else:
|
||||
b = model[i]
|
||||
if isinstance(b, MemBlock):
|
||||
if mem[i] is None:
|
||||
xt_new = b(xt, xt * 0)
|
||||
mem[i] = xt.detach().clone()
|
||||
else:
|
||||
xt_new = b(xt, mem[i])
|
||||
mem[i] = xt.detach().clone()
|
||||
del xt
|
||||
work_queue.appendleft(TWorkItem(xt_new, i+1))
|
||||
elif isinstance(b, TPool):
|
||||
if mem[i] is None:
|
||||
mem[i] = []
|
||||
mem[i].append(xt.detach().clone())
|
||||
if len(mem[i]) == b.stride:
|
||||
B, C, H, W = xt.shape
|
||||
xt = b(torch.cat(mem[i], 1).view(B*b.stride, C, H, W))
|
||||
mem[i] = []
|
||||
work_queue.appendleft(TWorkItem(xt, i+1))
|
||||
elif isinstance(b, TGrow):
|
||||
xt = b(xt)
|
||||
NT, C, H, W = xt.shape
|
||||
for xt_next in reversed(xt.view(B, b.stride*C, H, W).chunk(b.stride, 1)):
|
||||
work_queue.appendleft(TWorkItem(xt_next, i+1))
|
||||
del xt
|
||||
else:
|
||||
xt = b(xt)
|
||||
work_queue.appendleft(TWorkItem(xt, i+1))
|
||||
progress_bar.close()
|
||||
x = torch.stack(out, 1)
|
||||
return x
|
||||
|
||||
|
||||
class TAEHV(nn.Module):
|
||||
def __init__(self, latent_channels, parallel=False, decoder_time_upscale=(True, True), decoder_space_upscale=(True, True, True), latent_format=None, show_progress_bar=True):
|
||||
super().__init__()
|
||||
self.image_channels = 3
|
||||
self.patch_size = 1
|
||||
self.latent_channels = latent_channels
|
||||
self.parallel = parallel
|
||||
self.latent_format = latent_format
|
||||
self.show_progress_bar = show_progress_bar
|
||||
self.process_in = latent_format().process_in if latent_format is not None else (lambda x: x)
|
||||
self.process_out = latent_format().process_out if latent_format is not None else (lambda x: x)
|
||||
if self.latent_channels in [48, 32]: # Wan 2.2 and HunyuanVideo1.5
|
||||
self.patch_size = 2
|
||||
if self.latent_channels == 32: # HunyuanVideo1.5
|
||||
act_func = nn.LeakyReLU(0.2, inplace=True)
|
||||
else: # HunyuanVideo, Wan 2.1
|
||||
act_func = nn.ReLU(inplace=True)
|
||||
|
||||
self.encoder = nn.Sequential(
|
||||
conv(self.image_channels*self.patch_size**2, 64), act_func,
|
||||
TPool(64, 2), conv(64, 64, stride=2, bias=False), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func),
|
||||
TPool(64, 2), conv(64, 64, stride=2, bias=False), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func),
|
||||
TPool(64, 1), conv(64, 64, stride=2, bias=False), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func),
|
||||
conv(64, self.latent_channels),
|
||||
)
|
||||
n_f = [256, 128, 64, 64]
|
||||
self.frames_to_trim = 2**sum(decoder_time_upscale) - 1
|
||||
self.decoder = nn.Sequential(
|
||||
Clamp(), conv(self.latent_channels, n_f[0]), act_func,
|
||||
MemBlock(n_f[0], n_f[0], act_func), MemBlock(n_f[0], n_f[0], act_func), MemBlock(n_f[0], n_f[0], act_func), nn.Upsample(scale_factor=2 if decoder_space_upscale[0] else 1), TGrow(n_f[0], 1), conv(n_f[0], n_f[1], bias=False),
|
||||
MemBlock(n_f[1], n_f[1], act_func), MemBlock(n_f[1], n_f[1], act_func), MemBlock(n_f[1], n_f[1], act_func), nn.Upsample(scale_factor=2 if decoder_space_upscale[1] else 1), TGrow(n_f[1], 2 if decoder_time_upscale[0] else 1), conv(n_f[1], n_f[2], bias=False),
|
||||
MemBlock(n_f[2], n_f[2], act_func), MemBlock(n_f[2], n_f[2], act_func), MemBlock(n_f[2], n_f[2], act_func), nn.Upsample(scale_factor=2 if decoder_space_upscale[2] else 1), TGrow(n_f[2], 2 if decoder_time_upscale[1] else 1), conv(n_f[2], n_f[3], bias=False),
|
||||
act_func, conv(n_f[3], self.image_channels*self.patch_size**2),
|
||||
)
|
||||
@property
|
||||
def show_progress_bar(self):
|
||||
return self._show_progress_bar
|
||||
|
||||
@show_progress_bar.setter
|
||||
def show_progress_bar(self, value):
|
||||
self._show_progress_bar = value
|
||||
|
||||
def encode(self, x, **kwargs):
|
||||
if self.patch_size > 1: x = F.pixel_unshuffle(x, self.patch_size)
|
||||
x = x.movedim(2, 1) # [B, C, T, H, W] -> [B, T, C, H, W]
|
||||
if x.shape[1] % 4 != 0:
|
||||
# pad at end to multiple of 4
|
||||
n_pad = 4 - x.shape[1] % 4
|
||||
padding = x[:, -1:].repeat_interleave(n_pad, dim=1)
|
||||
x = torch.cat([x, padding], 1)
|
||||
x = apply_model_with_memblocks(self.encoder, x, self.parallel, self.show_progress_bar).movedim(2, 1)
|
||||
return self.process_out(x)
|
||||
|
||||
def decode(self, x, **kwargs):
|
||||
x = self.process_in(x).movedim(2, 1) # [B, C, T, H, W] -> [B, T, C, H, W]
|
||||
x = apply_model_with_memblocks(self.decoder, x, self.parallel, self.show_progress_bar)
|
||||
if self.patch_size > 1: x = F.pixel_shuffle(x, self.patch_size)
|
||||
return x[:, self.frames_to_trim:].movedim(2, 1)
|
||||
@ -1,4 +1,4 @@
|
||||
from transformers import T5TokenizerFast
|
||||
from ..transformers_compat import T5TokenizerFast
|
||||
|
||||
from .t5 import T5
|
||||
from .. import sd1_clip
|
||||
@ -11,10 +11,10 @@ class T5XXLModel(sd1_clip.SDClipModel):
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
textmodel_json_config = get_path_as_dict(textmodel_json_config, "t5_old_config_xxl.json", package=__package__)
|
||||
t5xxl_scaled_fp8 = model_options.get("t5xxl_scaled_fp8", None)
|
||||
if t5xxl_scaled_fp8 is not None:
|
||||
t5xxl_quantization_metadata = model_options.get("t5xxl_quantization_metadata", None)
|
||||
if t5xxl_quantization_metadata is not None:
|
||||
model_options = model_options.copy()
|
||||
model_options["scaled_fp8"] = t5xxl_scaled_fp8
|
||||
model_options["quantization_metadata"] = t5xxl_quantization_metadata
|
||||
|
||||
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=T5, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, zero_out_masked=attention_mask, model_options=model_options)
|
||||
|
||||
@ -43,14 +43,14 @@ class CosmosT5Tokenizer(sd1_clip.SD1Tokenizer):
|
||||
|
||||
|
||||
|
||||
def te(dtype_t5=None, t5xxl_scaled_fp8=None):
|
||||
def te(dtype_t5=None, t5_quantization_metadata=None):
|
||||
class CosmosTEModel_(CosmosT5XXL):
|
||||
def __init__(self, device="cpu", dtype=None, model_options=None):
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options:
|
||||
if t5_quantization_metadata is not None:
|
||||
model_options = model_options.copy()
|
||||
model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8
|
||||
model_options["t5xxl_quantization_metadata"] = t5_quantization_metadata
|
||||
if dtype is None:
|
||||
dtype = dtype_t5
|
||||
super().__init__(device=device, dtype=dtype, model_options=model_options)
|
||||
|
||||
@ -1,12 +1,15 @@
|
||||
import copy
|
||||
|
||||
import torch
|
||||
from transformers import T5TokenizerFast
|
||||
from ..transformers_compat import T5TokenizerFast, LlamaTokenizerFast
|
||||
|
||||
from .sd3_clip import T5XXLModel
|
||||
from .. import sd1_clip, model_management
|
||||
from ..component_model import files
|
||||
|
||||
import json
|
||||
import base64
|
||||
|
||||
|
||||
class T5XXLTokenizer(sd1_clip.SDTokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data=None):
|
||||
@ -73,14 +76,135 @@ class FluxClipModel(torch.nn.Module):
|
||||
return self.t5xxl.load_sd(sd)
|
||||
|
||||
|
||||
def flux_clip(dtype_t5=None, t5xxl_scaled_fp8=None):
|
||||
def flux_clip(dtype_t5=None, t5_quantization_metadata=None):
|
||||
class FluxClipModel_(FluxClipModel):
|
||||
def __init__(self, device="cpu", dtype=None, model_options=None):
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options:
|
||||
if t5_quantization_metadata is not None:
|
||||
model_options = model_options.copy()
|
||||
model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8
|
||||
model_options["t5xxl_quantization_metadata"] = t5_quantization_metadata
|
||||
super().__init__(dtype_t5=dtype_t5, device=device, dtype=dtype, model_options=model_options)
|
||||
|
||||
return FluxClipModel_
|
||||
|
||||
|
||||
def load_mistral_tokenizer(data):
|
||||
if torch.is_tensor(data):
|
||||
data = data.numpy().tobytes()
|
||||
|
||||
# we just have to use the latest transformers
|
||||
from transformers.integrations.mistral import MistralConverter
|
||||
|
||||
mistral_vocab = json.loads(data)
|
||||
|
||||
special_tokens = {}
|
||||
vocab = {}
|
||||
|
||||
max_vocab = mistral_vocab["config"]["default_vocab_size"]
|
||||
max_vocab -= len(mistral_vocab["special_tokens"])
|
||||
|
||||
for w in mistral_vocab["vocab"]:
|
||||
r = w["rank"]
|
||||
if r >= max_vocab:
|
||||
continue
|
||||
|
||||
vocab[base64.b64decode(w["token_bytes"])] = r
|
||||
|
||||
for w in mistral_vocab["special_tokens"]:
|
||||
if "token_bytes" in w:
|
||||
special_tokens[base64.b64decode(w["token_bytes"])] = w["rank"]
|
||||
else:
|
||||
special_tokens[w["token_str"]] = w["rank"]
|
||||
|
||||
all_special = []
|
||||
for v in special_tokens:
|
||||
all_special.append(v)
|
||||
|
||||
special_tokens.update(vocab)
|
||||
vocab = special_tokens
|
||||
return {"tokenizer_object": MistralConverter(vocab=vocab, additional_special_tokens=all_special).converted(), "legacy": False}
|
||||
|
||||
|
||||
class MistralTokenizerClass:
|
||||
@staticmethod
|
||||
def from_pretrained(path, **kwargs):
|
||||
return LlamaTokenizerFast(**kwargs)
|
||||
|
||||
|
||||
class Mistral3Tokenizer(sd1_clip.SDTokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data=None):
|
||||
if tokenizer_data is None:
|
||||
tokenizer_data = {}
|
||||
self.tekken_data = tokenizer_data.get("tekken_model", None)
|
||||
super().__init__("", pad_with_end=False, embedding_size=5120, embedding_key='mistral3_24b', tokenizer_class=MistralTokenizerClass, has_end_token=False, pad_to_max_length=False, pad_token=11, max_length=99999999, min_length=1, pad_left=True, tokenizer_args=load_mistral_tokenizer(self.tekken_data), tokenizer_data=tokenizer_data)
|
||||
|
||||
def state_dict(self):
|
||||
return {"tekken_model": self.tekken_data}
|
||||
|
||||
|
||||
class Flux2Tokenizer(sd1_clip.SD1Tokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data=None):
|
||||
if tokenizer_data is None:
|
||||
tokenizer_data = {}
|
||||
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="mistral3_24b", tokenizer=Mistral3Tokenizer)
|
||||
self.llama_template = '[SYSTEM_PROMPT]You are an AI that reasons about image descriptions. You give structured responses focusing on object relationships, object\nattribution and actions without speculation.[/SYSTEM_PROMPT][INST]{}[/INST]'
|
||||
|
||||
def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, **kwargs):
|
||||
if llama_template is None:
|
||||
llama_text = self.llama_template.format(text)
|
||||
else:
|
||||
llama_text = llama_template.format(text)
|
||||
|
||||
tokens = super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, disable_weights=True, **kwargs)
|
||||
return tokens
|
||||
|
||||
|
||||
class Mistral3_24BModel(sd1_clip.SDClipModel):
|
||||
def __init__(self, device="cpu", layer=None, layer_idx=None, dtype=None, attention_mask=True, model_options=None, textmodel_json_config=None):
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
if layer is None:
|
||||
layer = [10, 20, 30]
|
||||
# textmodel_json_config is IGNORED
|
||||
textmodel_json_config = {}
|
||||
num_layers = model_options.get("num_layers", None)
|
||||
if num_layers is not None:
|
||||
textmodel_json_config["num_hidden_layers"] = num_layers
|
||||
if num_layers < 40:
|
||||
textmodel_json_config["final_norm"] = False
|
||||
from . import llama
|
||||
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"start": 1, "pad": 0}, layer_norm_hidden_state=False, model_class=llama.Mistral3Small24B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
|
||||
|
||||
|
||||
class Flux2TEModel(sd1_clip.SD1ClipModel):
|
||||
def __init__(self, device="cpu", dtype=None, model_options=None, name="mistral3_24b", clip_model=Mistral3_24BModel):
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
super().__init__(device=device, dtype=dtype, name=name, clip_model=clip_model, model_options=model_options)
|
||||
|
||||
def encode_token_weights(self, token_weight_pairs):
|
||||
out, pooled, extra = super().encode_token_weights(token_weight_pairs)
|
||||
|
||||
out = torch.stack((out[:, 0], out[:, 1], out[:, 2]), dim=1)
|
||||
out = out.movedim(1, 2)
|
||||
out = out.reshape(out.shape[0], out.shape[1], -1)
|
||||
return out, pooled, extra
|
||||
|
||||
|
||||
def flux2_te(dtype_llama=None, llama_quantization_metadata=None, pruned=False):
|
||||
class Flux2TEModel_(Flux2TEModel):
|
||||
def __init__(self, device="cpu", dtype=None, model_options=None):
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
if dtype_llama is not None:
|
||||
dtype = dtype_llama
|
||||
if llama_quantization_metadata is not None:
|
||||
model_options = model_options.copy()
|
||||
model_options["quantization_metadata"] = llama_quantization_metadata
|
||||
if pruned:
|
||||
model_options = model_options.copy()
|
||||
model_options["num_layers"] = 30
|
||||
super().__init__(device=device, dtype=dtype, model_options=model_options)
|
||||
|
||||
return Flux2TEModel_
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
from transformers import T5TokenizerFast
|
||||
from ..transformers_compat import T5TokenizerFast
|
||||
|
||||
from .. import sd1_clip
|
||||
from . import sd3_clip
|
||||
@ -33,14 +33,14 @@ class MochiT5Tokenizer(sd1_clip.SD1Tokenizer):
|
||||
tokenizer_data = {}
|
||||
|
||||
|
||||
def mochi_te(dtype_t5=None, t5xxl_scaled_fp8=None):
|
||||
def mochi_te(dtype_t5=None, t5_quantization_metadata=None):
|
||||
class MochiTEModel_(MochiT5XXL):
|
||||
def __init__(self, device="cpu", dtype=None, model_options=None):
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options:
|
||||
if t5_quantization_metadata is not None:
|
||||
model_options = model_options.copy()
|
||||
model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8
|
||||
model_options["t5xxl_quantization_metadata"] = t5_quantization_metadata
|
||||
if dtype is None:
|
||||
dtype = dtype_t5
|
||||
super().__init__(device=device, dtype=dtype, model_options=model_options)
|
||||
|
||||
@ -9,6 +9,7 @@ from ..model_management import intermediate_device, pick_weight_dtype
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HiDreamTokenizer:
|
||||
def __init__(self, embedding_directory=None, tokenizer_data=None):
|
||||
if tokenizer_data is None:
|
||||
@ -148,17 +149,17 @@ class HiDreamTEModel(torch.nn.Module):
|
||||
return self.llama.load_sd(sd)
|
||||
|
||||
|
||||
def hidream_clip(clip_l=True, clip_g=True, t5=True, llama=True, dtype_t5=None, dtype_llama=None, t5xxl_scaled_fp8=None, llama_scaled_fp8=None):
|
||||
def hidream_clip(clip_l=True, clip_g=True, t5=True, llama=True, dtype_t5=None, dtype_llama=None, t5_quantization_metadata=None, llama_quantization_metadata=None):
|
||||
class HiDreamTEModel_(HiDreamTEModel):
|
||||
def __init__(self, device="cpu", dtype=None, model_options=None):
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options:
|
||||
if t5_quantization_metadata is not None:
|
||||
model_options = model_options.copy()
|
||||
model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8
|
||||
if llama_scaled_fp8 is not None and "llama_scaled_fp8" not in model_options:
|
||||
model_options["t5xxl_quantization_metadata"] = t5_quantization_metadata
|
||||
if llama_quantization_metadata is not None:
|
||||
model_options = model_options.copy()
|
||||
model_options["llama_scaled_fp8"] = llama_scaled_fp8
|
||||
model_options["llama_quantization_metadata"] = llama_quantization_metadata
|
||||
super().__init__(clip_l=clip_l, clip_g=clip_g, t5=t5, llama=llama, dtype_t5=dtype_t5, dtype_llama=dtype_llama, device=device, dtype=dtype, model_options=model_options)
|
||||
|
||||
return HiDreamTEModel_
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
import re
|
||||
|
||||
from transformers import ByT5Tokenizer
|
||||
from ..transformers_compat import ByT5Tokenizer
|
||||
|
||||
from .llama import Qwen25_7BVLI
|
||||
from .qwen_image import QwenImageTokenizer, QwenImageTEModel
|
||||
@ -14,7 +14,7 @@ class ByT5SmallTokenizer(sd1_clip.SDTokenizer):
|
||||
if tokenizer_data is None:
|
||||
tokenizer_data = {}
|
||||
tokenizer_path = files.get_package_as_path("byt5_tokenizer")
|
||||
super().__init__(tokenizer_path, pad_with_end=False, embedding_size=1472, embedding_key='byt5_small', tokenizer_class=ByT5Tokenizer, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_data=tokenizer_data)
|
||||
super().__init__(tokenizer_path, embedding_directory=None, pad_with_end=False, embedding_size=1472, embedding_key='byt5_small', tokenizer_class=ByT5Tokenizer, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_data=tokenizer_data)
|
||||
|
||||
|
||||
class HunyuanImageTokenizer(QwenImageTokenizer):
|
||||
@ -49,14 +49,16 @@ class HunyuanImageTokenizer(QwenImageTokenizer):
|
||||
|
||||
|
||||
class Qwen25_7BVLIModel(sd1_clip.SDClipModel):
|
||||
def __init__(self, device="cpu", layer="hidden", layer_idx=-3, dtype=None, attention_mask=True, model_options=None):
|
||||
def __init__(self, device="cpu", layer="hidden", layer_idx=-3, dtype=None, attention_mask=True, model_options=None, textmodel_json_config=None):
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
llama_scaled_fp8 = model_options.get("qwen_scaled_fp8", None)
|
||||
if llama_scaled_fp8 is not None:
|
||||
llama_quantization_metadata = model_options.get("llama_quantization_metadata", None)
|
||||
if llama_quantization_metadata is not None:
|
||||
model_options = model_options.copy()
|
||||
model_options["scaled_fp8"] = llama_scaled_fp8
|
||||
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False, model_class=Qwen25_7BVLI, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
|
||||
model_options["quantization_metadata"] = llama_quantization_metadata
|
||||
if textmodel_json_config is None:
|
||||
textmodel_json_config = {}
|
||||
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False, model_class=Qwen25_7BVLI, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
|
||||
|
||||
|
||||
class ByT5SmallModel(sd1_clip.SDClipModel):
|
||||
@ -108,14 +110,14 @@ class HunyuanImageTEModel(QwenImageTEModel):
|
||||
return super().load_sd(sd)
|
||||
|
||||
|
||||
def te(byt5=True, dtype_llama=None, llama_scaled_fp8=None):
|
||||
def te(byt5=True, dtype_llama=None, llama_quantization_metadata=None):
|
||||
class QwenImageTEModel_(HunyuanImageTEModel):
|
||||
def __init__(self, device="cpu", dtype=None, model_options=None):
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options:
|
||||
if llama_quantization_metadata is not None:
|
||||
model_options = model_options.copy()
|
||||
model_options["qwen_scaled_fp8"] = llama_scaled_fp8
|
||||
model_options["llama_quantization_metadata"] = llama_quantization_metadata
|
||||
if dtype_llama is not None:
|
||||
dtype = dtype_llama
|
||||
super().__init__(byt5=byt5, device=device, dtype=dtype, model_options=model_options)
|
||||
|
||||
@ -1,11 +1,13 @@
|
||||
import torch
|
||||
import numbers
|
||||
from transformers import LlamaTokenizerFast
|
||||
from ..transformers_compat import LlamaTokenizerFast
|
||||
|
||||
from .hunyuan_image import HunyuanImageTokenizer
|
||||
from .llama import Llama2
|
||||
from .. import sd1_clip
|
||||
from ..component_model import files
|
||||
from ..model_management import pick_weight_dtype
|
||||
from ..utils import detect_layer_quantization
|
||||
|
||||
|
||||
def llama_detect(state_dict, prefix=""):
|
||||
@ -14,9 +16,9 @@ def llama_detect(state_dict, prefix=""):
|
||||
if t5_key in state_dict:
|
||||
out["dtype_llama"] = state_dict[t5_key].dtype
|
||||
|
||||
scaled_fp8_key = "{}scaled_fp8".format(prefix)
|
||||
if scaled_fp8_key in state_dict:
|
||||
out["llama_scaled_fp8"] = state_dict[scaled_fp8_key].dtype
|
||||
quant = detect_layer_quantization(state_dict, prefix)
|
||||
if quant is not None:
|
||||
out["llama_quantization_metadata"] = quant
|
||||
|
||||
return out
|
||||
|
||||
@ -35,10 +37,10 @@ class LLAMAModel(sd1_clip.SDClipModel):
|
||||
special_tokens = {"start": 128000, "pad": 128258}
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
llama_scaled_fp8 = model_options.get("llama_scaled_fp8", None)
|
||||
if llama_scaled_fp8 is not None:
|
||||
llama_quantization_metadata = model_options.get("llama_quantization_metadata", None)
|
||||
if llama_quantization_metadata is not None:
|
||||
model_options = model_options.copy()
|
||||
model_options["scaled_fp8"] = llama_scaled_fp8
|
||||
model_options["quantization_metadata"] = llama_quantization_metadata
|
||||
|
||||
textmodel_json_config = textmodel_json_config or {}
|
||||
vocab_size = model_options.get("vocab_size", None)
|
||||
@ -83,6 +85,15 @@ class HunyuanVideoTokenizer:
|
||||
return {}
|
||||
|
||||
|
||||
class HunyuanVideo15Tokenizer(HunyuanImageTokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data={}):
|
||||
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
|
||||
self.llama_template = "<|im_start|>system\nYou are a helpful assistant. Describe the video by detailing the following aspects:\n1. The main content and theme of the video.\n2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects.\n3. Actions, events, behaviors temporal relationships, physical movement changes of the objects.\n4. background environment, light, style and atmosphere.\n5. camera angles, movements, and transitions used in the video.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
|
||||
|
||||
def tokenize_with_weights(self, text: str, return_word_ids=False, **kwargs):
|
||||
return super().tokenize_with_weights(text, return_word_ids, prevent_empty_text=True, **kwargs)
|
||||
|
||||
|
||||
class HunyuanVideoClipModel(torch.nn.Module):
|
||||
def __init__(self, dtype_llama=None, device="cpu", dtype=None, model_options=None):
|
||||
super().__init__()
|
||||
@ -161,14 +172,14 @@ class HunyuanVideoClipModel(torch.nn.Module):
|
||||
return self.llama.load_sd(sd)
|
||||
|
||||
|
||||
def hunyuan_video_clip(dtype_llama=None, llama_scaled_fp8=None):
|
||||
def hunyuan_video_clip(dtype_llama=None, llama_quantization_metadata=None):
|
||||
class HunyuanVideoClipModel_(HunyuanVideoClipModel):
|
||||
def __init__(self, device="cpu", dtype=None, model_options=None):
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
if llama_scaled_fp8 is not None and "llama_scaled_fp8" not in model_options:
|
||||
if llama_quantization_metadata is not None:
|
||||
model_options = model_options.copy()
|
||||
model_options["llama_scaled_fp8"] = llama_scaled_fp8
|
||||
model_options["llama_quantization_metadata"] = llama_quantization_metadata
|
||||
super().__init__(dtype_llama=dtype_llama, device=device, dtype=dtype, model_options=model_options)
|
||||
|
||||
return HunyuanVideoClipModel_
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
import copy
|
||||
|
||||
import torch
|
||||
from transformers import BertTokenizer
|
||||
from ..transformers_compat import BertTokenizerFast as BertTokenizer
|
||||
|
||||
from .bert import BertModel
|
||||
from .spiece_tokenizer import SPieceTokenizer
|
||||
|
||||
70
comfy/text_encoders/kandinsky5.py
Normal file
70
comfy/text_encoders/kandinsky5.py
Normal file
@ -0,0 +1,70 @@
|
||||
from .. import sd1_clip
|
||||
from .qwen_image import QwenImageTokenizer, QwenImageTEModel
|
||||
from .llama import Qwen25_7BVLI
|
||||
|
||||
|
||||
class Kandinsky5Tokenizer(QwenImageTokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data={}):
|
||||
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
|
||||
self.llama_template = "<|im_start|>system\nYou are a prompt engineer. Describe the video in detail.\nDescribe how the camera moves or shakes, describe the zoom and view angle, whether it follows the objects.\nDescribe the location of the video, main characters or objects and their action.\nDescribe the dynamism of the video and presented actions.\nName the visual style of the video: whether it is a professional footage, user generated content, some kind of animation, video game or screen content.\nDescribe the visual effects, postprocessing and transitions if they are presented in the video.\nPay attention to the order of key actions shown in the scene.<|im_end|>\n<|im_start|>user\n{}<|im_end|>"
|
||||
self.clip_l = sd1_clip.SDTokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
|
||||
|
||||
def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs):
|
||||
out = super().tokenize_with_weights(text, return_word_ids, **kwargs)
|
||||
out["l"] = self.clip_l.tokenize_with_weights(text, return_word_ids, **kwargs)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
class Kandinsky5TokenizerImage(Kandinsky5Tokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data={}):
|
||||
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
|
||||
self.llama_template = "<|im_start|>system\nYou are a promt engineer. Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>"
|
||||
|
||||
|
||||
class Qwen25_7BVLIModel(sd1_clip.SDClipModel):
|
||||
def __init__(self, device="cpu", layer="hidden", layer_idx=-1, dtype=None, attention_mask=True, model_options={}, textmodel_json_config=None):
|
||||
llama_quantization_metadata = model_options.get("llama_quantization_metadata", None)
|
||||
if llama_quantization_metadata is not None:
|
||||
model_options = model_options.copy()
|
||||
model_options["quantization_metadata"] = llama_quantization_metadata
|
||||
if textmodel_json_config is None:
|
||||
textmodel_json_config = {}
|
||||
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False, model_class=Qwen25_7BVLI, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
|
||||
|
||||
|
||||
class Kandinsky5TEModel(QwenImageTEModel):
|
||||
def __init__(self, device="cpu", dtype=None, model_options={}):
|
||||
super(QwenImageTEModel, self).__init__(device=device, dtype=dtype, name="qwen25_7b", clip_model=Qwen25_7BVLIModel, model_options=model_options)
|
||||
self.clip_l = sd1_clip.SDClipModel(device=device, dtype=dtype, return_projected_pooled=False, model_options=model_options)
|
||||
|
||||
def encode_token_weights(self, token_weight_pairs):
|
||||
cond, p, extra = super().encode_token_weights(token_weight_pairs, template_end=-1)
|
||||
l_out, l_pooled = self.clip_l.encode_token_weights(token_weight_pairs["l"])
|
||||
|
||||
return cond, l_pooled, extra
|
||||
|
||||
def set_clip_options(self, options):
|
||||
super().set_clip_options(options)
|
||||
self.clip_l.set_clip_options(options)
|
||||
|
||||
def reset_clip_options(self):
|
||||
super().reset_clip_options()
|
||||
self.clip_l.reset_clip_options()
|
||||
|
||||
def load_sd(self, sd):
|
||||
if "text_model.encoder.layers.1.mlp.fc1.weight" in sd:
|
||||
return self.clip_l.load_sd(sd)
|
||||
else:
|
||||
return super().load_sd(sd)
|
||||
|
||||
def te(dtype_llama=None, llama_quantization_metadata=None):
|
||||
class Kandinsky5TEModel_(Kandinsky5TEModel):
|
||||
def __init__(self, device="cpu", dtype=None, model_options={}):
|
||||
if llama_quantization_metadata is not None:
|
||||
model_options = model_options.copy()
|
||||
model_options["llama_quantization_metadata"] = llama_quantization_metadata
|
||||
if dtype_llama is not None:
|
||||
dtype = dtype_llama
|
||||
super().__init__(device=device, dtype=dtype, model_options=model_options)
|
||||
return Kandinsky5TEModel_
|
||||
@ -33,6 +33,30 @@ class Llama2Config:
|
||||
q_norm = None
|
||||
k_norm = None
|
||||
rope_scale = None
|
||||
final_norm: bool = True
|
||||
|
||||
|
||||
@dataclass
|
||||
class Mistral3Small24BConfig:
|
||||
vocab_size: int = 131072
|
||||
hidden_size: int = 5120
|
||||
intermediate_size: int = 32768
|
||||
num_hidden_layers: int = 40
|
||||
num_attention_heads: int = 32
|
||||
num_key_value_heads: int = 8
|
||||
max_position_embeddings: int = 8192
|
||||
rms_norm_eps: float = 1e-5
|
||||
rope_theta: float = 1000000000.0
|
||||
transformer_type: str = "llama"
|
||||
head_dim = 128
|
||||
rms_norm_add = False
|
||||
mlp_activation = "silu"
|
||||
qkv_bias = False
|
||||
rope_dims = None
|
||||
q_norm = None
|
||||
k_norm = None
|
||||
rope_scale = None
|
||||
final_norm: bool = True
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -55,6 +79,53 @@ class Qwen25_3BConfig:
|
||||
q_norm = None
|
||||
k_norm = None
|
||||
rope_scale = None
|
||||
final_norm: bool = True
|
||||
|
||||
|
||||
@dataclass
|
||||
class Qwen3_4BConfig:
|
||||
vocab_size: int = 151936
|
||||
hidden_size: int = 2560
|
||||
intermediate_size: int = 9728
|
||||
num_hidden_layers: int = 36
|
||||
num_attention_heads: int = 32
|
||||
num_key_value_heads: int = 8
|
||||
max_position_embeddings: int = 40960
|
||||
rms_norm_eps: float = 1e-6
|
||||
rope_theta: float = 1000000.0
|
||||
transformer_type: str = "llama"
|
||||
head_dim = 128
|
||||
rms_norm_add = False
|
||||
mlp_activation = "silu"
|
||||
qkv_bias = False
|
||||
rope_dims = None
|
||||
q_norm = "gemma3"
|
||||
k_norm = "gemma3"
|
||||
rope_scale = None
|
||||
final_norm: bool = True
|
||||
|
||||
|
||||
@dataclass
|
||||
class Ovis25_2BConfig:
|
||||
vocab_size: int = 151936
|
||||
hidden_size: int = 2048
|
||||
intermediate_size: int = 6144
|
||||
num_hidden_layers: int = 28
|
||||
num_attention_heads: int = 16
|
||||
num_key_value_heads: int = 8
|
||||
max_position_embeddings: int = 40960
|
||||
rms_norm_eps: float = 1e-6
|
||||
rope_theta: float = 1000000.0
|
||||
transformer_type: str = "llama"
|
||||
head_dim = 128
|
||||
rms_norm_add = False
|
||||
mlp_activation = "silu"
|
||||
qkv_bias = False
|
||||
rope_dims = None
|
||||
q_norm = "gemma3"
|
||||
k_norm = "gemma3"
|
||||
rope_scale = None
|
||||
final_norm: bool = True
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -77,6 +148,7 @@ class Qwen25_7BVLI_Config:
|
||||
q_norm = None
|
||||
k_norm = None
|
||||
rope_scale = None
|
||||
final_norm: bool = True
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -100,6 +172,7 @@ class Gemma2_2B_Config:
|
||||
k_norm = None
|
||||
sliding_attention = None
|
||||
rope_scale = None
|
||||
final_norm: bool = True
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -123,6 +196,7 @@ class Gemma3_4B_Config:
|
||||
k_norm = "gemma3"
|
||||
sliding_attention = [False, False, False, False, False, 1024]
|
||||
rope_scale = [1.0, 8.0]
|
||||
final_norm: bool = True
|
||||
|
||||
|
||||
class RMSNorm(nn.Module):
|
||||
@ -375,7 +449,12 @@ class Llama2_(nn.Module):
|
||||
transformer(config, index=i, device=device, dtype=dtype, ops=ops)
|
||||
for i in range(config.num_hidden_layers)
|
||||
])
|
||||
self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype)
|
||||
|
||||
if config.final_norm:
|
||||
self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype)
|
||||
else:
|
||||
self.norm = None
|
||||
|
||||
# self.lm_head = ops.Linear(config.hidden_size, config.vocab_size, bias=False, device=device, dtype=dtype)
|
||||
|
||||
def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, position_ids=None, embeds_info=[]):
|
||||
@ -411,8 +490,12 @@ class Llama2_(nn.Module):
|
||||
|
||||
intermediate = None
|
||||
all_intermediate = None
|
||||
only_layers = None
|
||||
if intermediate_output is not None:
|
||||
if intermediate_output == "all":
|
||||
if isinstance(intermediate_output, list):
|
||||
all_intermediate = []
|
||||
only_layers = set(intermediate_output)
|
||||
elif intermediate_output == "all":
|
||||
all_intermediate = []
|
||||
intermediate_output = None
|
||||
elif intermediate_output < 0:
|
||||
@ -420,7 +503,8 @@ class Llama2_(nn.Module):
|
||||
|
||||
for i, layer in enumerate(self.layers):
|
||||
if all_intermediate is not None:
|
||||
all_intermediate.append(x.unsqueeze(1).clone())
|
||||
if only_layers is None or (i in only_layers):
|
||||
all_intermediate.append(x.unsqueeze(1).clone())
|
||||
x = layer(
|
||||
x=x,
|
||||
attention_mask=mask,
|
||||
@ -430,14 +514,17 @@ class Llama2_(nn.Module):
|
||||
if i == intermediate_output:
|
||||
intermediate = x.clone()
|
||||
|
||||
x = self.norm(x)
|
||||
if self.norm is not None:
|
||||
x = self.norm(x)
|
||||
|
||||
if all_intermediate is not None:
|
||||
all_intermediate.append(x.unsqueeze(1).clone())
|
||||
if only_layers is None or ((i + 1) in only_layers):
|
||||
all_intermediate.append(x.unsqueeze(1).clone())
|
||||
|
||||
if all_intermediate is not None:
|
||||
intermediate = torch.cat(all_intermediate, dim=1)
|
||||
|
||||
if intermediate is not None and final_layer_norm_intermediate:
|
||||
if intermediate is not None and final_layer_norm_intermediate and self.norm is not None:
|
||||
intermediate = self.norm(intermediate)
|
||||
|
||||
return x, intermediate
|
||||
@ -466,6 +553,16 @@ class Llama2(BaseLlama, torch.nn.Module):
|
||||
self.dtype = dtype
|
||||
|
||||
|
||||
class Mistral3Small24B(BaseLlama, torch.nn.Module):
|
||||
def __init__(self, config_dict, dtype, device, operations):
|
||||
super().__init__()
|
||||
config = Mistral3Small24BConfig(**config_dict)
|
||||
self.num_layers = config.num_hidden_layers
|
||||
|
||||
self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
|
||||
self.dtype = dtype
|
||||
|
||||
|
||||
class Qwen25_3B(BaseLlama, torch.nn.Module):
|
||||
def __init__(self, config_dict, dtype, device, operations):
|
||||
super().__init__()
|
||||
@ -476,6 +573,26 @@ class Qwen25_3B(BaseLlama, torch.nn.Module):
|
||||
self.dtype = dtype
|
||||
|
||||
|
||||
class Qwen3_4B(BaseLlama, torch.nn.Module):
|
||||
def __init__(self, config_dict, dtype, device, operations):
|
||||
super().__init__()
|
||||
config = Qwen3_4BConfig(**config_dict)
|
||||
self.num_layers = config.num_hidden_layers
|
||||
|
||||
self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
|
||||
self.dtype = dtype
|
||||
|
||||
|
||||
class Ovis25_2B(BaseLlama, torch.nn.Module):
|
||||
def __init__(self, config_dict, dtype, device, operations):
|
||||
super().__init__()
|
||||
config = Ovis25_2BConfig(**config_dict)
|
||||
self.num_layers = config.num_hidden_layers
|
||||
|
||||
self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
|
||||
self.dtype = dtype
|
||||
|
||||
|
||||
class Qwen25_7BVLI(BaseLlama, torch.nn.Module):
|
||||
def __init__(self, config_dict, dtype, device, operations):
|
||||
super().__init__()
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
from transformers import T5TokenizerFast
|
||||
from ..transformers_compat import T5TokenizerFast
|
||||
|
||||
from .genmo import mochi_te
|
||||
from .. import sd1_clip
|
||||
|
||||
@ -8,29 +8,35 @@ class Gemma2BTokenizer(sd1_clip.SDTokenizer):
|
||||
if tokenizer_data is None:
|
||||
tokenizer_data = {}
|
||||
tokenizer = tokenizer_data.get("spiece_model", None)
|
||||
super().__init__(tokenizer, pad_with_end=False, embedding_size=2304, embedding_key='gemma2_2b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_args={"add_bos": True, "add_eos": False}, tokenizer_data=tokenizer_data)
|
||||
super().__init__(tokenizer, pad_with_end=False, embedding_size=2304, embedding_directory=None, embedding_key='gemma2_2b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_args={"add_bos": True, "add_eos": False}, tokenizer_data=tokenizer_data)
|
||||
|
||||
def state_dict(self):
|
||||
return {"spiece_model": self.tokenizer.serialize_model()}
|
||||
|
||||
|
||||
class Gemma3_4BTokenizer(sd1_clip.SDTokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data={}):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data=None):
|
||||
if tokenizer_data is None:
|
||||
tokenizer_data = {}
|
||||
tokenizer = tokenizer_data.get("spiece_model", None)
|
||||
super().__init__(tokenizer, pad_with_end=False, embedding_size=2560, embedding_key='gemma3_4b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_args={"add_bos": True, "add_eos": False}, tokenizer_data=tokenizer_data)
|
||||
super().__init__(tokenizer, pad_with_end=False, embedding_size=2560, embedding_directory=None, embedding_key='gemma3_4b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_args={"add_bos": True, "add_eos": False}, tokenizer_data=tokenizer_data)
|
||||
|
||||
def state_dict(self):
|
||||
return {"spiece_model": self.tokenizer.serialize_model()}
|
||||
|
||||
|
||||
class LuminaTokenizer(sd1_clip.SD1Tokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data=None):
|
||||
if tokenizer_data is None:
|
||||
tokenizer_data = {}
|
||||
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="gemma2_2b", tokenizer=Gemma2BTokenizer)
|
||||
|
||||
|
||||
class NTokenizer(sd1_clip.SD1Tokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data={}):
|
||||
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="gemma3_4b", tokenizer=Gemma3_4BTokenizer)
|
||||
|
||||
|
||||
class Gemma2_2BModel(sd1_clip.SDClipModel):
|
||||
def __init__(self, device="cpu", layer="hidden", layer_idx=-2, dtype=None, attention_mask=True, model_options=None, textmodel_json_config=None):
|
||||
if model_options is None:
|
||||
@ -38,9 +44,13 @@ class Gemma2_2BModel(sd1_clip.SDClipModel):
|
||||
textmodel_json_config = textmodel_json_config or {}
|
||||
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"start": 2, "pad": 0}, layer_norm_hidden_state=False, model_class=Gemma2_2B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
|
||||
|
||||
|
||||
class Gemma3_4BModel(sd1_clip.SDClipModel):
|
||||
def __init__(self, device="cpu", layer="hidden", layer_idx=-2, dtype=None, attention_mask=True, model_options={}):
|
||||
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"start": 2, "pad": 0}, layer_norm_hidden_state=False, model_class=Gemma3_4B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
|
||||
def __init__(self, device="cpu", layer="hidden", layer_idx=-2, dtype=None, attention_mask=True, model_options={}, textmodel_json_config=None):
|
||||
if textmodel_json_config is None:
|
||||
textmodel_json_config = {}
|
||||
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"start": 2, "pad": 0}, layer_norm_hidden_state=False, model_class=Gemma3_4B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
|
||||
|
||||
|
||||
class LuminaModel(sd1_clip.SD1ClipModel):
|
||||
def __init__(self, device="cpu", dtype=None, model_options=None, name="gemma2_2b", clip_model=Gemma2_2BModel):
|
||||
@ -49,21 +59,22 @@ class LuminaModel(sd1_clip.SD1ClipModel):
|
||||
super().__init__(device=device, dtype=dtype, name=name, clip_model=clip_model, model_options=model_options)
|
||||
|
||||
|
||||
|
||||
def te(dtype_llama=None, llama_scaled_fp8=None, model_type="gemma2_2b"):
|
||||
def te(dtype_llama=None, llama_quantization_metadata=None, model_type="gemma2_2b"):
|
||||
model = None
|
||||
if model_type == "gemma2_2b":
|
||||
model = Gemma2_2BModel
|
||||
elif model_type == "gemma3_4b":
|
||||
model = Gemma3_4BModel
|
||||
|
||||
class LuminaTEModel_(LuminaModel):
|
||||
def __init__(self, device="cpu", dtype=None, model_options=None):
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options:
|
||||
if llama_quantization_metadata is not None:
|
||||
model_options = model_options.copy()
|
||||
model_options["scaled_fp8"] = llama_scaled_fp8
|
||||
model_options["quantization_metadata"] = llama_quantization_metadata
|
||||
if dtype_llama is not None:
|
||||
dtype = dtype_llama
|
||||
super().__init__(device=device, dtype=dtype, name=model_type, model_options=model_options, clip_model=model)
|
||||
|
||||
return LuminaTEModel_
|
||||
|
||||
@ -1,8 +1,7 @@
|
||||
from transformers import Qwen2Tokenizer
|
||||
from .. import sd1_clip
|
||||
from .llama import Qwen25_3B
|
||||
import os
|
||||
from ..transformers_compat import Qwen2Tokenizer
|
||||
|
||||
from .llama import Qwen25_3B
|
||||
from .. import sd1_clip
|
||||
from ..component_model import files
|
||||
|
||||
|
||||
@ -10,8 +9,8 @@ class Qwen25_3BTokenizer(sd1_clip.SDTokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data=None):
|
||||
if tokenizer_data is None:
|
||||
tokenizer_data = {}
|
||||
tokenizer_path = files.get_package_as_path("comfy.text_encoders.qwen25_tokenizer")
|
||||
super().__init__(tokenizer_path, pad_with_end=False, embedding_size=2048, embedding_key='qwen25_3b', tokenizer_class=Qwen2Tokenizer, has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, pad_token=151643, tokenizer_data=tokenizer_data)
|
||||
tokenizer_path = files.get_package_as_path("comfy.text_encoders.qwen25_tokenizer")
|
||||
super().__init__(tokenizer_path, pad_with_end=False, embedding_size=2048, embedding_directory=embedding_directory, embedding_key='qwen25_3b', tokenizer_class=Qwen2Tokenizer, has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, pad_token=151643, tokenizer_data=tokenizer_data)
|
||||
|
||||
|
||||
class Omnigen2Tokenizer(sd1_clip.SD1Tokenizer):
|
||||
@ -21,20 +20,20 @@ class Omnigen2Tokenizer(sd1_clip.SD1Tokenizer):
|
||||
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="qwen25_3b", tokenizer=Qwen25_3BTokenizer)
|
||||
self.llama_template = '<|im_start|>system\nYou are a helpful assistant that generates high-quality images based on user instructions.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n'
|
||||
|
||||
def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None,**kwargs):
|
||||
def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, **kwargs):
|
||||
if llama_template is None:
|
||||
llama_text = self.llama_template.format(text)
|
||||
else:
|
||||
llama_text = llama_template.format(text)
|
||||
return super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, **kwargs)
|
||||
|
||||
|
||||
class Qwen25_3BModel(sd1_clip.SDClipModel):
|
||||
def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=True, model_options=None, textmodel_json_config=None):
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
textmodel_json_config = textmodel_json_config or {}
|
||||
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False, model_class=Qwen25_3B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
|
||||
|
||||
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False, model_class=Qwen25_3B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
|
||||
|
||||
|
||||
class Omnigen2Model(sd1_clip.SD1ClipModel):
|
||||
@ -44,15 +43,16 @@ class Omnigen2Model(sd1_clip.SD1ClipModel):
|
||||
super().__init__(device=device, dtype=dtype, name="qwen25_3b", clip_model=Qwen25_3BModel, model_options=model_options)
|
||||
|
||||
|
||||
def te(dtype_llama=None, llama_scaled_fp8=None):
|
||||
def te(dtype_llama=None, llama_quantization_metadata=None):
|
||||
class Omnigen2TEModel_(Omnigen2Model):
|
||||
def __init__(self, device="cpu", dtype=None, model_options=None):
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options:
|
||||
if llama_quantization_metadata is not None:
|
||||
model_options = model_options.copy()
|
||||
model_options["scaled_fp8"] = llama_scaled_fp8
|
||||
model_options["quantization_metadata"] = llama_quantization_metadata
|
||||
if dtype_llama is not None:
|
||||
dtype = dtype_llama
|
||||
super().__init__(device=device, dtype=dtype, model_options=model_options)
|
||||
|
||||
return Omnigen2TEModel_
|
||||
|
||||
74
comfy/text_encoders/ovis.py
Normal file
74
comfy/text_encoders/ovis.py
Normal file
@ -0,0 +1,74 @@
|
||||
import numbers
|
||||
|
||||
import torch
|
||||
from ..transformers_compat import Qwen2Tokenizer
|
||||
from . import llama
|
||||
from .. import sd1_clip
|
||||
from ..component_model import files
|
||||
|
||||
|
||||
class Qwen3Tokenizer(sd1_clip.SDTokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data={}):
|
||||
tokenizer_path = files.get_package_as_path("comfy.text_encoders.qwen25_tokenizer")
|
||||
super().__init__(tokenizer_path, pad_with_end=False, embedding_size=2048, embedding_key='qwen3_2b', tokenizer_class=Qwen2Tokenizer, has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=284, pad_token=151643, tokenizer_data=tokenizer_data)
|
||||
|
||||
|
||||
class OvisTokenizer(sd1_clip.SD1Tokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data={}):
|
||||
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="qwen3_2b", tokenizer=Qwen3Tokenizer)
|
||||
self.llama_template = "<|im_start|>user\nDescribe the image by detailing the color, quantity, text, shape, size, texture, spatial relationships of the objects and background: {}<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
|
||||
|
||||
def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, **kwargs):
|
||||
if llama_template is None:
|
||||
llama_text = self.llama_template.format(text)
|
||||
else:
|
||||
llama_text = llama_template.format(text)
|
||||
|
||||
tokens = super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, disable_weights=True, **kwargs)
|
||||
return tokens
|
||||
|
||||
|
||||
class Ovis25_2BModel(sd1_clip.SDClipModel):
|
||||
def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=True, model_options=None, textmodel_json_config=None):
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
# textmodel_json_config is IGNORED
|
||||
textmodel_json_config = {}
|
||||
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False, model_class=llama.Ovis25_2B, enable_attention_masks=attention_mask, return_attention_masks=False, zero_out_masked=True, model_options=model_options)
|
||||
|
||||
|
||||
class OvisTEModel(sd1_clip.SD1ClipModel):
|
||||
def __init__(self, device="cpu", dtype=None, model_options={}):
|
||||
super().__init__(device=device, dtype=dtype, name="qwen3_2b", clip_model=Ovis25_2BModel, model_options=model_options)
|
||||
|
||||
def encode_token_weights(self, token_weight_pairs, template_end=-1):
|
||||
out, pooled = super().encode_token_weights(token_weight_pairs)
|
||||
tok_pairs = token_weight_pairs["qwen3_2b"][0]
|
||||
count_im_start = 0
|
||||
if template_end == -1:
|
||||
for i, v in enumerate(tok_pairs):
|
||||
elem = v[0]
|
||||
if not torch.is_tensor(elem):
|
||||
if isinstance(elem, numbers.Integral):
|
||||
if elem == 4004 and count_im_start < 1:
|
||||
template_end = i
|
||||
count_im_start += 1
|
||||
|
||||
if out.shape[1] > (template_end + 1):
|
||||
if tok_pairs[template_end + 1][0] == 25:
|
||||
template_end += 1
|
||||
|
||||
out = out[:, template_end:]
|
||||
return out, pooled, {}
|
||||
|
||||
|
||||
def te(dtype_llama=None, llama_quantization_metadata=None):
|
||||
class OvisTEModel_(OvisTEModel):
|
||||
def __init__(self, device="cpu", dtype=None, model_options={}):
|
||||
if dtype_llama is not None:
|
||||
dtype = dtype_llama
|
||||
if llama_quantization_metadata is not None:
|
||||
model_options["quantization_metadata"] = llama_quantization_metadata
|
||||
super().__init__(device=device, dtype=dtype, model_options=model_options)
|
||||
|
||||
return OvisTEModel_
|
||||
@ -1,4 +1,4 @@
|
||||
from transformers import T5TokenizerFast
|
||||
from ..transformers_compat import T5TokenizerFast
|
||||
|
||||
from . import sd3_clip
|
||||
from .. import sd1_clip
|
||||
@ -35,19 +35,19 @@ class T5XXLTokenizer(sd1_clip.SDTokenizer):
|
||||
|
||||
class PixArtTokenizer(sd1_clip.SD1Tokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data=None):
|
||||
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="t5xxl", tokenizer=T5XXLTokenizer)
|
||||
if tokenizer_data is None:
|
||||
tokenizer_data = {}
|
||||
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="t5xxl", tokenizer=T5XXLTokenizer)
|
||||
|
||||
|
||||
def pixart_te(dtype_t5=None, t5xxl_scaled_fp8=None):
|
||||
def pixart_te(dtype_t5=None, t5_quantization_metadata=None):
|
||||
class PixArtTEModel_(PixArtT5XXL):
|
||||
def __init__(self, device="cpu", dtype=None, model_options=None):
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options:
|
||||
if t5_quantization_metadata is not None:
|
||||
model_options = model_options.copy()
|
||||
model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8
|
||||
model_options["t5xxl_quantization_metadata"] = t5_quantization_metadata
|
||||
if dtype is None:
|
||||
dtype = dtype_t5
|
||||
super().__init__(device=device, dtype=dtype, model_options=model_options)
|
||||
|
||||
@ -179,36 +179,36 @@
|
||||
"special": false
|
||||
},
|
||||
"151665": {
|
||||
"content": "<|img|>",
|
||||
"content": "<tool_response>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
"special": false
|
||||
},
|
||||
"151666": {
|
||||
"content": "<|endofimg|>",
|
||||
"content": "</tool_response>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
"special": false
|
||||
},
|
||||
"151667": {
|
||||
"content": "<|meta|>",
|
||||
"content": "<think>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
"special": false
|
||||
},
|
||||
"151668": {
|
||||
"content": "<|endofmeta|>",
|
||||
"content": "</think>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
"special": false
|
||||
}
|
||||
},
|
||||
"additional_special_tokens": [
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
import numbers
|
||||
import torch
|
||||
from transformers import Qwen2Tokenizer
|
||||
from ..transformers_compat import Qwen2Tokenizer
|
||||
|
||||
from .llama import Qwen25_7BVLI
|
||||
from .. import sd1_clip
|
||||
@ -12,7 +12,7 @@ class Qwen25_7BVLITokenizer(sd1_clip.SDTokenizer):
|
||||
if tokenizer_data is None:
|
||||
tokenizer_data = {}
|
||||
tokenizer_path = files.get_package_as_path("comfy.text_encoders.qwen25_tokenizer")
|
||||
super().__init__(tokenizer_path, pad_with_end=False, embedding_size=3584, embedding_key='qwen25_7b', tokenizer_class=Qwen2Tokenizer, has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, pad_token=151643, tokenizer_data=tokenizer_data)
|
||||
super().__init__(tokenizer_path, pad_with_end=False, embedding_directory=embedding_directory, embedding_size=3584, embedding_key='qwen25_7b', tokenizer_class=Qwen2Tokenizer, has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, pad_token=151643, tokenizer_data=tokenizer_data)
|
||||
|
||||
|
||||
class QwenImageTokenizer(sd1_clip.SD1Tokenizer):
|
||||
@ -23,12 +23,14 @@ class QwenImageTokenizer(sd1_clip.SD1Tokenizer):
|
||||
self.llama_template = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
|
||||
self.llama_template_images = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n"
|
||||
|
||||
def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], **kwargs):
|
||||
def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], prevent_empty_text=False, **kwargs):
|
||||
skip_template = False
|
||||
if text.startswith('<|im_start|>'):
|
||||
skip_template = True
|
||||
if text.startswith('<|start_header_id|>'):
|
||||
skip_template = True
|
||||
if prevent_empty_text and text == '':
|
||||
text = ' '
|
||||
|
||||
if skip_template:
|
||||
llama_text = text
|
||||
@ -94,14 +96,14 @@ class QwenImageTEModel(sd1_clip.SD1ClipModel):
|
||||
return out, pooled, extra
|
||||
|
||||
|
||||
def te(dtype_llama=None, llama_scaled_fp8=None):
|
||||
def te(dtype_llama=None, llama_quantization_metadata=None):
|
||||
class QwenImageTEModel_(QwenImageTEModel):
|
||||
def __init__(self, device="cpu", dtype=None, model_options=None):
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options:
|
||||
if llama_quantization_metadata is not None:
|
||||
model_options = model_options.copy()
|
||||
model_options["scaled_fp8"] = llama_scaled_fp8
|
||||
model_options["quantization_metadata"] = llama_quantization_metadata
|
||||
if dtype_llama is not None:
|
||||
dtype = dtype_llama
|
||||
super().__init__(device=device, dtype=dtype, model_options=model_options)
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
from transformers import T5TokenizerFast
|
||||
from ..transformers_compat import T5TokenizerFast
|
||||
|
||||
from .t5 import T5
|
||||
from .. import sd1_clip
|
||||
|
||||
@ -1,8 +1,9 @@
|
||||
import copy
|
||||
import logging
|
||||
import comfy.utils
|
||||
|
||||
import torch
|
||||
from transformers import T5TokenizerFast
|
||||
from ..transformers_compat import T5TokenizerFast
|
||||
|
||||
from .t5 import T5
|
||||
from .. import sd1_clip, model_management
|
||||
@ -17,10 +18,10 @@ class T5XXLModel(sd1_clip.SDClipModel):
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
textmodel_json_config = files.get_path_as_dict(textmodel_json_config, "t5_config_xxl.json", package=__package__)
|
||||
t5xxl_scaled_fp8 = model_options.get("t5xxl_scaled_fp8", None)
|
||||
if t5xxl_scaled_fp8 is not None:
|
||||
t5xxl_quantization_metadata = model_options.get("t5xxl_quantization_metadata", None)
|
||||
if t5xxl_quantization_metadata is not None:
|
||||
model_options = model_options.copy()
|
||||
model_options["scaled_fp8"] = t5xxl_scaled_fp8
|
||||
model_options["quantization_metadata"] = t5xxl_quantization_metadata
|
||||
|
||||
model_options = {**model_options, "model_name": "t5xxl"}
|
||||
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=T5, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
|
||||
@ -32,9 +33,9 @@ def t5_xxl_detect(state_dict, prefix=""):
|
||||
if t5_key in state_dict:
|
||||
out["dtype_t5"] = state_dict[t5_key].dtype
|
||||
|
||||
scaled_fp8_key = "{}scaled_fp8".format(prefix)
|
||||
if scaled_fp8_key in state_dict:
|
||||
out["t5xxl_scaled_fp8"] = state_dict[scaled_fp8_key].dtype
|
||||
quant = comfy.utils.detect_layer_quantization(state_dict, prefix)
|
||||
if quant is not None:
|
||||
out["t5_quantization_metadata"] = quant
|
||||
|
||||
return out
|
||||
|
||||
@ -175,14 +176,14 @@ class SD3ClipModel(torch.nn.Module):
|
||||
return self.t5xxl.load_sd(sd)
|
||||
|
||||
|
||||
def sd3_clip(clip_l=True, clip_g=True, t5=True, dtype_t5=None, t5xxl_scaled_fp8=None, t5_attention_mask=False):
|
||||
def sd3_clip(clip_l=True, clip_g=True, t5=True, dtype_t5=None, t5_quantization_metadata=None, t5_attention_mask=False):
|
||||
class SD3ClipModel_(SD3ClipModel):
|
||||
def __init__(self, device="cpu", dtype=None, model_options=None):
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options:
|
||||
if t5_quantization_metadata is not None:
|
||||
model_options = model_options.copy()
|
||||
model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8
|
||||
model_options["t5xxl_quantization_metadata"] = t5_quantization_metadata
|
||||
super().__init__(clip_l=clip_l, clip_g=clip_g, t5=t5, dtype_t5=dtype_t5, t5_attention_mask=t5_attention_mask, device=device, dtype=dtype, model_options=model_options)
|
||||
|
||||
return SD3ClipModel_
|
||||
|
||||
@ -37,14 +37,14 @@ class WanT5Model(sd1_clip.SD1ClipModel):
|
||||
super().__init__(device=device, dtype=dtype, model_options=model_options, name="umt5xxl", clip_model=UMT5XXlModel, **kwargs)
|
||||
|
||||
|
||||
def te(dtype_t5=None, t5xxl_scaled_fp8=None):
|
||||
def te(dtype_t5=None, t5_quantization_metadata=None):
|
||||
class WanTEModel(WanT5Model):
|
||||
def __init__(self, device="cpu", dtype=None, model_options=None):
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
if t5xxl_scaled_fp8 is not None and "scaled_fp8" not in model_options:
|
||||
if t5_quantization_metadata is not None:
|
||||
model_options = model_options.copy()
|
||||
model_options["scaled_fp8"] = t5xxl_scaled_fp8
|
||||
model_options["quantization_metadata"] = t5_quantization_metadata
|
||||
if dtype_t5 is not None:
|
||||
dtype = dtype_t5
|
||||
super().__init__(device=device, dtype=dtype, model_options=model_options)
|
||||
|
||||
53
comfy/text_encoders/z_image.py
Normal file
53
comfy/text_encoders/z_image.py
Normal file
@ -0,0 +1,53 @@
|
||||
from ..transformers_compat import Qwen2Tokenizer
|
||||
|
||||
from . import llama
|
||||
from .. import sd1_clip
|
||||
from ..component_model import files
|
||||
|
||||
|
||||
class Qwen3Tokenizer(sd1_clip.SDTokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data=None):
|
||||
if tokenizer_data is None:
|
||||
tokenizer_data = {}
|
||||
tokenizer_path = files.get_package_as_path("comfy.text_encoders.qwen25_tokenizer")
|
||||
super().__init__(tokenizer_path, pad_with_end=False, embedding_size=2560, embedding_key='qwen3_4b', tokenizer_class=Qwen2Tokenizer, has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, pad_token=151643, tokenizer_data=tokenizer_data)
|
||||
|
||||
|
||||
class ZImageTokenizer(sd1_clip.SD1Tokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data={}):
|
||||
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="qwen3_4b", tokenizer=Qwen3Tokenizer)
|
||||
self.llama_template = "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
|
||||
|
||||
def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, **kwargs):
|
||||
if llama_template is None:
|
||||
llama_text = self.llama_template.format(text)
|
||||
else:
|
||||
llama_text = llama_template.format(text)
|
||||
|
||||
tokens = super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, disable_weights=True, **kwargs)
|
||||
return tokens
|
||||
|
||||
|
||||
class Qwen3_4BModel(sd1_clip.SDClipModel):
|
||||
def __init__(self, device="cpu", layer="hidden", layer_idx=-2, dtype=None, attention_mask=True, model_options=None, textmodel_json_config=None):
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
# textmodel_json_config is IGNORED
|
||||
textmodel_json_config = {}
|
||||
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False, model_class=llama.Qwen3_4B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
|
||||
|
||||
|
||||
class ZImageTEModel(sd1_clip.SD1ClipModel):
|
||||
def __init__(self, device="cpu", dtype=None, model_options={}):
|
||||
super().__init__(device=device, dtype=dtype, name="qwen3_4b", clip_model=Qwen3_4BModel, model_options=model_options)
|
||||
|
||||
|
||||
def te(dtype_llama=None, llama_quantization_metadata=None):
|
||||
class ZImageTEModel_(ZImageTEModel):
|
||||
def __init__(self, device="cpu", dtype=None, model_options={}):
|
||||
if dtype_llama is not None:
|
||||
dtype = dtype_llama
|
||||
if llama_quantization_metadata is not None:
|
||||
model_options["quantization_metadata"] = llama_quantization_metadata
|
||||
super().__init__(device=device, dtype=dtype, model_options=model_options)
|
||||
return ZImageTEModel_
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user