Mirror of https://github.com/comfyanonymous/ComfyUI.git (synced 2025-12-16 17:42:58 +08:00)

Compare commits: 130 commits
Commits in this comparison (abbreviated SHA1s):

9304e47351, bc606d7d64, 645ee1881e, 3d082c3206, 683569de55, ea2c117bc3, fc4af86068, 41bcf0619d,
d02d0e5744, 70541d4e77, 77b2f7c228, 43e0d4e3cc, dbd330454a, 33c7f1179d, af91eb6c99, 5cb1e0c9a0,
51347f9fb8, a5e85017d8, 5ac3b26a7d, 6592bffc60, 971cefe7d4, da2bfb5b0a, c5a47a1692, 908fd7d749,
5495589db3, 982876d59a, 338d9ae3bb, eeb020b9b7, ae65433a60, fdebe18296, f8321eb57b, 93948e3fc5,
e711aaf1a7, 57ddb7fd13, 17c92a9f28, 36357bbcc3, f668c2e3c9, fc657f471a, 791e30ff50, e2a800e7ef,
9d252f3b70, b9fb542703, cabc4d351f, e136b6dbb0, d50f342c90, 3b0368aa34, 935493f6c1, 60ee574748,
8e889c535d, fd271dedfd, c3c6313fc7, 85c4b4ae26, 058f084371, ec7f65187d, 56fa7dbe38, 329480da5a,
4086acf3c2, 50ca97e776, 7ac7d69d94, 76f18e955d, d7a0aef650, 913f86b727, 117bf3f2bd, ae676ed105,
fd109325db, bed12674a1, 092ee8a500, 79d17ba233, 6fd463aec9, 43071e3de3, 0ec05b1481, 35fa091340,
3c8456223c, 9bc893c5bb, f4bdf5f830, 6be85c7920, ea17add3c6, ecdc8697d5, dce518c2b4, 440268d394,
87c104bfc1, 19f2192d69, 519c941165, 861817d22d, c120eee5ba, 73f5649196, 3f512f5659, b94d394a64,
277237ccc1, daaceac769, 33d6aec3b7, 44baa0b7f3, a17cf1c387, b4a20acc54, c55dc857d5, 878db3a727,
30c259cac8, 1cb7e22a95, 2640acb31c, 7dbd5dfe91, f8b981ae9a, 4967f81778, 0a6746898d, 5151cff293,
af96d9812d, 52a32e2b32, b907085709, 065a2fbbec, 0ff0457892, 6484ac89dc, f55c98a89f, ca7808f240,
52e778fff3, 9d8a817985, b59750a86a, 3f382a4f98, f17251bec6, c38e7d6599, eaf68c9b5b, cc6a8dcd1a,
a2d60aad0f, d8433c63fd, dd41b74549, 55f654db3d, 58c6ed541d, 234c3dc85f, 8908ee2628, 1105e0d139,
8938aa3f30, f16219e3aa
@@ -53,6 +53,16 @@ try:
     repo.stash(ident)
 except KeyError:
     print("nothing to stash") # noqa: T201
+except:
+    print("Could not stash, cleaning index and trying again.") # noqa: T201
+    repo.state_cleanup()
+    repo.index.read_tree(repo.head.peel().tree)
+    repo.index.write()
+    try:
+        repo.stash(ident)
+    except KeyError:
+        print("nothing to stash.") # noqa: T201
+
 backup_branch_name = 'backup_branch_{}'.format(datetime.today().strftime('%Y-%m-%d_%H_%M_%S'))
 print("creating backup branch: {}".format(backup_branch_name)) # noqa: T201
 try:
@@ -66,8 +76,10 @@ if branch is None:
     try:
         ref = repo.lookup_reference('refs/remotes/origin/master')
     except:
-        print("pulling.") # noqa: T201
-        pull(repo)
+        print("fetching.") # noqa: T201
+        for remote in repo.remotes:
+            if remote.name == "origin":
+                remote.fetch()
         ref = repo.lookup_reference('refs/remotes/origin/master')
     repo.checkout(ref)
     branch = repo.lookup_branch('master')
@@ -149,3 +161,4 @@ try:
     shutil.copy(stable_update_script, stable_update_script_to)
 except:
     pass
+
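The hunks above harden the updater in two ways: if a plain `repo.stash()` fails with anything other than `KeyError`, the script clears any in-progress repository state, resets the index to HEAD, and retries, and the old `pull(repo)` fallback is replaced by an explicit fetch of the `origin` remote. A minimal standalone sketch of the same pattern with pygit2 (the repository path and signature are placeholders, not values from the script):

```python
import pygit2

repo = pygit2.Repository(".")                      # placeholder path
ident = pygit2.Signature("comfyui", "comfy@ui")    # placeholder identity

def stash_with_cleanup(repo, ident):
    """Stash local changes; on unexpected errors, clean repo state and retry once."""
    try:
        repo.stash(ident)
    except KeyError:
        print("nothing to stash")
    except Exception:
        print("Could not stash, cleaning index and trying again.")
        repo.state_cleanup()                          # abort any in-progress merge/rebase state
        repo.index.read_tree(repo.head.peel().tree)   # reset the index to HEAD's tree
        repo.index.write()
        try:
            repo.stash(ident)
        except KeyError:
            print("nothing to stash.")

def fetch_origin(repo):
    """Fetch only the 'origin' remote instead of doing a full pull."""
    for remote in repo.remotes:
        if remote.name == "origin":
            remote.fetch()

stash_with_cleanup(repo, ident)
fetch_origin(repo)
```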
@@ -1,5 +1,5 @@
-As of the time of writing this you need this preview driver for best results:
-https://www.amd.com/en/resources/support-articles/release-notes/RN-AMDGPU-WINDOWS-PYTORCH-PREVIEW.html
+As of the time of writing this you need this driver for best results:
+https://www.amd.com/en/resources/support-articles/release-notes/RN-AMDGPU-WINDOWS-PYTORCH-7-1-1.html
 
 HOW TO RUN:
 
@@ -25,3 +25,4 @@ In the ComfyUI directory you will find a file: extra_model_paths.yaml.example
 Rename this file to: extra_model_paths.yaml and edit it with your favorite text editor.
 
 
+
.github/workflows/release-stable-all.yml (vendored, 4 lines changed)
@@ -65,11 +65,11 @@ jobs:
       contents: "write"
       packages: "write"
       pull-requests: "read"
-    name: "Release AMD ROCm 6.4.4"
+    name: "Release AMD ROCm 7.1.1"
     uses: ./.github/workflows/stable-release.yml
     with:
       git_tag: ${{ inputs.git_tag }}
-      cache_tag: "rocm644"
+      cache_tag: "rocm711"
       python_minor: "12"
       python_patch: "10"
       rel_name: "amd"
.github/workflows/test-ci.yml (vendored, 1 line changed)
@@ -5,6 +5,7 @@ on:
   push:
     branches:
       - master
+      - release/**
     paths-ignore:
       - 'app/**'
      - 'input/**'
.github/workflows/test-execution.yml (vendored, 4 lines changed)
@@ -2,9 +2,9 @@ name: Execution Tests
 
 on:
   push:
-    branches: [ main, master ]
+    branches: [ main, master, release/** ]
   pull_request:
-    branches: [ main, master ]
+    branches: [ main, master, release/** ]
 
 jobs:
   test:
.github/workflows/test-launch.yml (vendored, 4 lines changed)
@@ -2,9 +2,9 @@ name: Test server launches without errors
 
 on:
   push:
-    branches: [ main, master ]
+    branches: [ main, master, release/** ]
   pull_request:
-    branches: [ main, master ]
+    branches: [ main, master, release/** ]
 
 jobs:
   test:
.github/workflows/test-unit.yml (vendored, 4 lines changed)
@@ -2,9 +2,9 @@ name: Unit Tests
 
 on:
   push:
-    branches: [ main, master ]
+    branches: [ main, master, release/** ]
   pull_request:
-    branches: [ main, master ]
+    branches: [ main, master, release/** ]
 
 jobs:
   test:
.github/workflows/update-version.yml (vendored, 1 line changed)
@@ -6,6 +6,7 @@ on:
       - "pyproject.toml"
     branches:
       - master
+      - release/**
 
 jobs:
   update-version:
@@ -1,3 +1,2 @@
 # Admins
-* @comfyanonymous
-* @kosinkadink
+* @comfyanonymous @kosinkadink @guill
README.md (28 lines changed)
@@ -68,6 +68,7 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith
    - [Qwen Image](https://comfyanonymous.github.io/ComfyUI_examples/qwen_image/)
    - [Hunyuan Image 2.1](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_image/)
    - [Flux 2](https://comfyanonymous.github.io/ComfyUI_examples/flux2/)
+   - [Z Image](https://comfyanonymous.github.io/ComfyUI_examples/z_image/)
 - Image Editing Models
    - [Omnigen 2](https://comfyanonymous.github.io/ComfyUI_examples/omnigen/)
    - [Flux Kontext](https://comfyanonymous.github.io/ComfyUI_examples/flux/#flux-kontext-image-editing-model)
@@ -80,6 +81,7 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith
    - [Hunyuan Video](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_video/)
    - [Wan 2.1](https://comfyanonymous.github.io/ComfyUI_examples/wan/)
    - [Wan 2.2](https://comfyanonymous.github.io/ComfyUI_examples/wan22/)
+   - [Hunyuan Video 1.5](https://docs.comfy.org/tutorials/video/hunyuan/hunyuan-video-1-5)
 - Audio Models
    - [Stable Audio](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
    - [ACE Step](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
@@ -318,6 +320,32 @@ For models compatible with Iluvatar Extension for PyTorch. Here's a step-by-step
 1. Install the Iluvatar Corex Toolkit by adhering to the platform-specific instructions on the [Installation](https://support.iluvatar.com/#/DocumentCentre?id=1&nameCenter=2&productId=520117912052801536)
 2. Launch ComfyUI by running `python main.py`
 
+
+## [ComfyUI-Manager](https://github.com/Comfy-Org/ComfyUI-Manager/tree/manager-v4)
+
+**ComfyUI-Manager** is an extension that allows you to easily install, update, and manage custom nodes for ComfyUI.
+
+### Setup
+
+1. Install the manager dependencies:
+```bash
+pip install -r manager_requirements.txt
+```
+
+2. Enable the manager with the `--enable-manager` flag when running ComfyUI:
+```bash
+python main.py --enable-manager
+```
+
+### Command Line Options
+
+| Flag | Description |
+|------|-------------|
+| `--enable-manager` | Enable ComfyUI-Manager |
+| `--enable-manager-legacy-ui` | Use the legacy manager UI instead of the new UI (requires `--enable-manager`) |
+| `--disable-manager-ui` | Disable the manager UI and endpoints while keeping background features like security checks and scheduled installation completion (requires `--enable-manager`) |
+
+
 # Running
 
 ```python main.py```
@@ -58,8 +58,13 @@ class InternalRoutes:
             return web.json_response({"error": "Invalid directory type"}, status=400)
 
         directory = get_directory_by_type(directory_type)
+
+        def is_visible_file(entry: os.DirEntry) -> bool:
+            """Filter out hidden files (e.g., .DS_Store on macOS)."""
+            return entry.is_file() and not entry.name.startswith('.')
+
         sorted_files = sorted(
-            (entry for entry in os.scandir(directory) if entry.is_file()),
+            (entry for entry in os.scandir(directory) if is_visible_file(entry)),
             key=lambda entry: -entry.stat().st_mtime
         )
         return web.json_response([entry.name for entry in sorted_files], status=200)
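The hunk above filters dot-files out of the directory listing endpoint. A self-contained sketch of the same pattern, with the directory path as a placeholder:

```python
import os

def list_visible_files(directory: str) -> list[str]:
    """Return non-hidden files in `directory`, newest first (by mtime)."""
    def is_visible_file(entry: os.DirEntry) -> bool:
        # Hidden files such as .DS_Store on macOS start with a dot.
        return entry.is_file() and not entry.name.startswith('.')

    sorted_files = sorted(
        (entry for entry in os.scandir(directory) if is_visible_file(entry)),
        key=lambda entry: -entry.stat().st_mtime,
    )
    return [entry.name for entry in sorted_files]

print(list_visible_files("."))  # placeholder path
```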
@@ -59,6 +59,9 @@ class UserManager():
         user = "default"
         if args.multi_user and "comfy-user" in request.headers:
             user = request.headers["comfy-user"]
+            # Block System Users (use same error message to prevent probing)
+            if user.startswith(folder_paths.SYSTEM_USER_PREFIX):
+                raise KeyError("Unknown user: " + user)
 
         if user not in self.users:
             raise KeyError("Unknown user: " + user)
@@ -66,15 +69,16 @@ class UserManager():
         return user
 
     def get_request_user_filepath(self, request, file, type="userdata", create_dir=True):
-        user_directory = folder_paths.get_user_directory()
-
         if type == "userdata":
-            root_dir = user_directory
+            root_dir = folder_paths.get_user_directory()
         else:
             raise KeyError("Unknown filepath type:" + type)
 
         user = self.get_request_user_id(request)
-        path = user_root = os.path.abspath(os.path.join(root_dir, user))
+        user_root = folder_paths.get_public_user_directory(user)
+        if user_root is None:
+            return None
+        path = user_root
 
         # prevent leaving /{type}
         if os.path.commonpath((root_dir, user_root)) != root_dir:
@@ -101,7 +105,11 @@ class UserManager():
         name = name.strip()
         if not name:
             raise ValueError("username not provided")
+        if name.startswith(folder_paths.SYSTEM_USER_PREFIX):
+            raise ValueError("System User prefix not allowed")
         user_id = re.sub("[^a-zA-Z0-9-_]+", '-', name)
+        if user_id.startswith(folder_paths.SYSTEM_USER_PREFIX):
+            raise ValueError("System User prefix not allowed")
         user_id = user_id + "_" + str(uuid.uuid4())
 
         self.users[user_id] = name
@@ -132,7 +140,10 @@ class UserManager():
         if username in self.users.values():
             return web.json_response({"error": "Duplicate username."}, status=400)
 
-        user_id = self.add_user(username)
+        try:
+            user_id = self.add_user(username)
+        except ValueError as e:
+            return web.json_response({"error": str(e)}, status=400)
         return web.json_response(user_id)
 
     @routes.get("/userdata")
@@ -424,7 +435,7 @@ class UserManager():
             return source
 
         dest = get_user_data_path(request, check_exists=False, param="dest")
-        if not isinstance(source, str):
+        if not isinstance(dest, str):
             return dest
 
         overwrite = request.query.get("overwrite", 'true') != "false"
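The UserManager changes above reject user names and IDs that collide with reserved system users (using the same "Unknown user" message so the prefix cannot be probed) and surface `add_user` validation errors as HTTP 400 responses. A minimal sketch of the name-sanitization step; `SYSTEM_USER_PREFIX` is a placeholder here, its real value lives in `folder_paths`:

```python
import re
import uuid

SYSTEM_USER_PREFIX = "__system__"  # placeholder; the real prefix is defined in folder_paths

def make_user_id(name: str) -> str:
    """Turn a display name into a stored user id, rejecting reserved names."""
    name = name.strip()
    if not name:
        raise ValueError("username not provided")
    if name.startswith(SYSTEM_USER_PREFIX):
        raise ValueError("System User prefix not allowed")
    # Collapse anything outside [a-zA-Z0-9-_] into '-' before appending a UUID.
    user_id = re.sub("[^a-zA-Z0-9-_]+", '-', name)
    if user_id.startswith(SYSTEM_USER_PREFIX):
        raise ValueError("System User prefix not allowed")
    return user_id + "_" + str(uuid.uuid4())

print(make_user_id("alice"))
```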
@@ -97,6 +97,13 @@ class LatentPreviewMethod(enum.Enum):
     Latent2RGB = "latent2rgb"
     TAESD = "taesd"
 
+    @classmethod
+    def from_string(cls, value: str):
+        for member in cls:
+            if member.value == value:
+                return member
+        return None
+
 parser.add_argument("--preview-method", type=LatentPreviewMethod, default=LatentPreviewMethod.NoPreviews, help="Default preview method for sampler nodes.", action=EnumAction)
 
 parser.add_argument("--preview-size", type=int, default=512, help="Sets the maximum preview size for sampler nodes.")
@@ -121,6 +128,12 @@ upcast.add_argument("--force-upcast-attention", action="store_true", help="Force
 upcast.add_argument("--dont-upcast-attention", action="store_true", help="Disable all upcasting of attention. Should be unnecessary except for debugging.")
 
 
+parser.add_argument("--enable-manager", action="store_true", help="Enable the ComfyUI-Manager feature.")
+manager_group = parser.add_mutually_exclusive_group()
+manager_group.add_argument("--disable-manager-ui", action="store_true", help="Disables only the ComfyUI-Manager UI and endpoints. Scheduled installations and similar background tasks will still operate.")
+manager_group.add_argument("--enable-manager-legacy-ui", action="store_true", help="Enables the legacy UI of ComfyUI-Manager")
+
+
 vram_group = parser.add_mutually_exclusive_group()
 vram_group.add_argument("--gpu-only", action="store_true", help="Store and run everything (text encoders/CLIP models, etc... on the GPU).")
 vram_group.add_argument("--highvram", action="store_true", help="By default models will be unloaded to CPU memory after being used. This option keeps them in GPU memory.")
@@ -131,7 +144,8 @@ vram_group.add_argument("--cpu", action="store_true", help="To use the CPU for e
 
 parser.add_argument("--reserve-vram", type=float, default=None, help="Set the amount of vram in GB you want to reserve for use by your OS/other software. By default some amount is reserved depending on your OS.")
 
-parser.add_argument("--async-offload", action="store_true", help="Use async weight offloading.")
+parser.add_argument("--async-offload", nargs='?', const=2, type=int, default=None, metavar="NUM_STREAMS", help="Use async weight offloading. An optional argument controls the amount of offload streams. Default is 2. Enabled by default on Nvidia.")
+parser.add_argument("--disable-async-offload", action="store_true", help="Disable async weight offloading.")
 
 parser.add_argument("--force-non-blocking", action="store_true", help="Force ComfyUI to use non-blocking operations for all applicable tensors. This may improve performance on some non-Nvidia systems but can cause issues with some workflows.")
 
@@ -167,6 +181,7 @@ parser.add_argument("--multi-user", action="store_true", help="Enables per-user
 parser.add_argument("--verbose", default='INFO', const='DEBUG', nargs="?", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help='Set the logging level')
 parser.add_argument("--log-stdout", action="store_true", help="Send normal process output to stdout instead of stderr (default).")
 
+
 # The default built-in provider hosted under web/
 DEFAULT_VERSION_STRING = "comfyanonymous/ComfyUI@latest"
 
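Two details in the cli_args hunks are easy to miss: `LatentPreviewMethod.from_string` returns None for unknown strings instead of raising, and `--async-offload` now takes an optional stream count (`nargs='?'` with `const=2`), so a bare `--async-offload` means two offload streams while omitting the flag leaves the value None. A small sketch of both behaviors using a trimmed-down standalone parser and enum (only the members shown above are included, not ComfyUI's full set):

```python
import argparse
import enum

class LatentPreviewMethod(enum.Enum):
    Latent2RGB = "latent2rgb"
    TAESD = "taesd"

    @classmethod
    def from_string(cls, value: str):
        # Return the matching member, or None for unknown strings.
        for member in cls:
            if member.value == value:
                return member
        return None

parser = argparse.ArgumentParser()
parser.add_argument("--async-offload", nargs='?', const=2, type=int, default=None,
                    metavar="NUM_STREAMS", help="Optional number of offload streams.")

print(LatentPreviewMethod.from_string("taesd"))                   # LatentPreviewMethod.TAESD
print(LatentPreviewMethod.from_string("bogus"))                   # None
print(parser.parse_args([]).async_offload)                        # None (flag absent)
print(parser.parse_args(["--async-offload"]).async_offload)       # 2 (const)
print(parser.parse_args(["--async-offload", "4"]).async_offload)  # 4
```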
@@ -51,32 +51,43 @@ class ContextHandlerABC(ABC):
 
 
 class IndexListContextWindow(ContextWindowABC):
-    def __init__(self, index_list: list[int], dim: int=0):
+    def __init__(self, index_list: list[int], dim: int=0, total_frames: int=0):
         self.index_list = index_list
         self.context_length = len(index_list)
         self.dim = dim
+        self.total_frames = total_frames
+        self.center_ratio = (min(index_list) + max(index_list)) / (2 * total_frames)
 
-    def get_tensor(self, full: torch.Tensor, device=None, dim=None) -> torch.Tensor:
+    def get_tensor(self, full: torch.Tensor, device=None, dim=None, retain_index_list=[]) -> torch.Tensor:
         if dim is None:
             dim = self.dim
         if dim == 0 and full.shape[dim] == 1:
             return full
-        idx = [slice(None)] * dim + [self.index_list]
-        return full[idx].to(device)
+        idx = tuple([slice(None)] * dim + [self.index_list])
+        window = full[idx]
+        if retain_index_list:
+            idx = tuple([slice(None)] * dim + [retain_index_list])
+            window[idx] = full[idx]
+        return window.to(device)
 
     def add_window(self, full: torch.Tensor, to_add: torch.Tensor, dim=None) -> torch.Tensor:
         if dim is None:
             dim = self.dim
-        idx = [slice(None)] * dim + [self.index_list]
+        idx = tuple([slice(None)] * dim + [self.index_list])
         full[idx] += to_add
         return full
 
+    def get_region_index(self, num_regions: int) -> int:
+        region_idx = int(self.center_ratio * num_regions)
+        return min(max(region_idx, 0), num_regions - 1)
+
 
 class IndexListCallbacks:
     EVALUATE_CONTEXT_WINDOWS = "evaluate_context_windows"
     COMBINE_CONTEXT_WINDOW_RESULTS = "combine_context_window_results"
     EXECUTE_START = "execute_start"
     EXECUTE_CLEANUP = "execute_cleanup"
+    RESIZE_COND_ITEM = "resize_cond_item"
 
     def init_callbacks(self):
         return {}
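`IndexListContextWindow` now carries `total_frames` so each window knows where its center falls in the full sequence; `get_region_index` maps that center ratio onto one of N regions, which the handler later uses when splitting conditionings across windows, and the index expressions are wrapped in `tuple(...)` because indexing a tensor with a bare list of mixed slices and index lists is deprecated in newer PyTorch. A small illustration of both ideas, with made-up shapes and values:

```python
import torch

index_list = [8, 9, 10, 11]          # frames covered by this window
total_frames, dim = 16, 0

# Slice the window's frames out of a full tensor along `dim`;
# tuple(...) is required for mixed slice/list advanced indexing.
full = torch.arange(total_frames).float().unsqueeze(-1)   # shape (16, 1)
idx = tuple([slice(None)] * dim + [index_list])
window = full[idx]                                         # shape (4, 1)

# Map the window's center onto one of `num_regions` conditioning regions.
center_ratio = (min(index_list) + max(index_list)) / (2 * total_frames)
num_regions = 2
region = min(max(int(center_ratio * num_regions), 0), num_regions - 1)
print(window.shape, center_ratio, region)   # torch.Size([4, 1]) 0.59375 1
```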
@@ -94,7 +105,8 @@ class ContextFuseMethod:
 
 ContextResults = collections.namedtuple("ContextResults", ['window_idx', 'sub_conds_out', 'sub_conds', 'window'])
 class IndexListContextHandler(ContextHandlerABC):
-    def __init__(self, context_schedule: ContextSchedule, fuse_method: ContextFuseMethod, context_length: int=1, context_overlap: int=0, context_stride: int=1, closed_loop=False, dim=0):
+    def __init__(self, context_schedule: ContextSchedule, fuse_method: ContextFuseMethod, context_length: int=1, context_overlap: int=0, context_stride: int=1,
+                 closed_loop: bool=False, dim:int=0, freenoise: bool=False, cond_retain_index_list: list[int]=[], split_conds_to_windows: bool=False):
         self.context_schedule = context_schedule
         self.fuse_method = fuse_method
         self.context_length = context_length
@@ -103,13 +115,18 @@ class IndexListContextHandler(ContextHandlerABC):
         self.closed_loop = closed_loop
         self.dim = dim
         self._step = 0
+        self.freenoise = freenoise
+        self.cond_retain_index_list = [int(x.strip()) for x in cond_retain_index_list.split(",")] if cond_retain_index_list else []
+        self.split_conds_to_windows = split_conds_to_windows
 
         self.callbacks = {}
 
     def should_use_context(self, model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]) -> bool:
         # for now, assume first dim is batch - should have stored on BaseModel in actual implementation
         if x_in.size(self.dim) > self.context_length:
-            logging.info(f"Using context windows {self.context_length} for {x_in.size(self.dim)} frames.")
+            logging.info(f"Using context windows {self.context_length} with overlap {self.context_overlap} for {x_in.size(self.dim)} frames.")
+            if self.cond_retain_index_list:
+                logging.info(f"Retaining original cond for indexes: {self.cond_retain_index_list}")
             return True
         return False
 
@@ -123,6 +140,11 @@ class IndexListContextHandler(ContextHandlerABC):
             return None
         # reuse or resize cond items to match context requirements
         resized_cond = []
+        # if multiple conds, split based on primary region
+        if self.split_conds_to_windows and len(cond_in) > 1:
+            region = window.get_region_index(len(cond_in))
+            logging.info(f"Splitting conds to windows; using region {region} for window {window[0]}-{window[-1]} with center ratio {window.center_ratio:.3f}")
+            cond_in = [cond_in[region]]
         # cond object is a list containing a dict - outer list is irrelevant, so just loop through it
         for actual_cond in cond_in:
             resized_actual_cond = actual_cond.copy()
@@ -145,13 +167,32 @@ class IndexListContextHandler(ContextHandlerABC):
                 new_cond_item = cond_item.copy()
                 # when in dictionary, look for tensors and CONDCrossAttn [comfy/conds.py] (has cond attr that is a tensor)
                 for cond_key, cond_value in new_cond_item.items():
+                    # Allow callbacks to handle custom conditioning items
+                    handled = False
+                    for callback in comfy.patcher_extension.get_all_callbacks(
+                        IndexListCallbacks.RESIZE_COND_ITEM, self.callbacks
+                    ):
+                        result = callback(cond_key, cond_value, window, x_in, device, new_cond_item)
+                        if result is not None:
+                            new_cond_item[cond_key] = result
+                            handled = True
+                            break
+                    if handled:
+                        continue
                     if isinstance(cond_value, torch.Tensor):
-                        if cond_value.ndim < self.dim and cond_value.size(0) == x_in.size(self.dim):
+                        if (self.dim < cond_value.ndim and cond_value(self.dim) == x_in.size(self.dim)) or \
+                                (cond_value.ndim < self.dim and cond_value.size(0) == x_in.size(self.dim)):
                             new_cond_item[cond_key] = window.get_tensor(cond_value, device)
+                    # Handle audio_embed (temporal dim is 1)
+                    elif cond_key == "audio_embed" and hasattr(cond_value, "cond") and isinstance(cond_value.cond, torch.Tensor):
+                        audio_cond = cond_value.cond
+                        if audio_cond.ndim > 1 and audio_cond.size(1) == x_in.size(self.dim):
+                            new_cond_item[cond_key] = cond_value._copy_with(window.get_tensor(audio_cond, device, dim=1))
                     # if has cond that is a Tensor, check if needs to be subset
                     elif hasattr(cond_value, "cond") and isinstance(cond_value.cond, torch.Tensor):
-                        if cond_value.cond.ndim < self.dim and cond_value.cond.size(0) == x_in.size(self.dim):
-                            new_cond_item[cond_key] = cond_value._copy_with(window.get_tensor(cond_value.cond, device))
+                        if (self.dim < cond_value.cond.ndim and cond_value.cond.size(self.dim) == x_in.size(self.dim)) or \
+                                (cond_value.cond.ndim < self.dim and cond_value.cond.size(0) == x_in.size(self.dim)):
+                            new_cond_item[cond_key] = cond_value._copy_with(window.get_tensor(cond_value.cond, device, retain_index_list=self.cond_retain_index_list))
                     elif cond_key == "num_video_frames": # for SVD
                         new_cond_item[cond_key] = cond_value._copy_with(cond_value.cond)
                         new_cond_item[cond_key].cond = window.context_length
@@ -164,7 +205,7 @@ class IndexListContextHandler(ContextHandlerABC):
         return resized_cond
 
     def set_step(self, timestep: torch.Tensor, model_options: dict[str]):
-        mask = torch.isclose(model_options["transformer_options"]["sample_sigmas"], timestep, rtol=0.0001)
+        mask = torch.isclose(model_options["transformer_options"]["sample_sigmas"], timestep[0], rtol=0.0001)
         matches = torch.nonzero(mask)
         if torch.numel(matches) == 0:
             raise Exception("No sample_sigmas matched current timestep; something went wrong.")
@@ -173,7 +214,7 @@ class IndexListContextHandler(ContextHandlerABC):
     def get_context_windows(self, model: BaseModel, x_in: torch.Tensor, model_options: dict[str]) -> list[IndexListContextWindow]:
         full_length = x_in.size(self.dim) # TODO: choose dim based on model
         context_windows = self.context_schedule.func(full_length, self, model_options)
-        context_windows = [IndexListContextWindow(window, dim=self.dim) for window in context_windows]
+        context_windows = [IndexListContextWindow(window, dim=self.dim, total_frames=full_length) for window in context_windows]
         return context_windows
 
     def execute(self, calc_cond_batch: Callable, model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]):
@@ -250,8 +291,8 @@ class IndexListContextHandler(ContextHandlerABC):
                 prev_weight = (bias_total / (bias_total + bias))
                 new_weight = (bias / (bias_total + bias))
                 # account for dims of tensors
-                idx_window = [slice(None)] * self.dim + [idx]
-                pos_window = [slice(None)] * self.dim + [pos]
+                idx_window = tuple([slice(None)] * self.dim + [idx])
+                pos_window = tuple([slice(None)] * self.dim + [pos])
                 # apply new values
                 conds_final[i][idx_window] = conds_final[i][idx_window] * prev_weight + sub_conds_out[i][pos_window] * new_weight
                 biases_final[i][idx] = bias_total + bias
@@ -287,6 +328,28 @@ def create_prepare_sampling_wrapper(model: ModelPatcher):
     )
 
 
+def _sampler_sample_wrapper(executor, guider, sigmas, extra_args, callback, noise, *args, **kwargs):
+    model_options = extra_args.get("model_options", None)
+    if model_options is None:
+        raise Exception("model_options not found in sampler_sample_wrapper; this should never happen, something went wrong.")
+    handler: IndexListContextHandler = model_options.get("context_handler", None)
+    if handler is None:
+        raise Exception("context_handler not found in sampler_sample_wrapper; this should never happen, something went wrong.")
+    if not handler.freenoise:
+        return executor(guider, sigmas, extra_args, callback, noise, *args, **kwargs)
+    noise = apply_freenoise(noise, handler.dim, handler.context_length, handler.context_overlap, extra_args["seed"])
+
+    return executor(guider, sigmas, extra_args, callback, noise, *args, **kwargs)
+
+
+def create_sampler_sample_wrapper(model: ModelPatcher):
+    model.add_wrapper_with_key(
+        comfy.patcher_extension.WrappersMP.SAMPLER_SAMPLE,
+        "ContextWindows_sampler_sample",
+        _sampler_sample_wrapper
+    )
+
+
 def match_weights_to_dim(weights: list[float], x_in: torch.Tensor, dim: int, device=None) -> torch.Tensor:
     total_dims = len(x_in.shape)
     weights_tensor = torch.Tensor(weights).to(device=device)
@@ -538,3 +601,29 @@ def shift_window_to_end(window: list[int], num_frames: int):
     for i in range(len(window)):
         # 2) add end_delta to each val to slide windows to end
         window[i] = window[i] + end_delta
+
+
+# https://github.com/Kosinkadink/ComfyUI-AnimateDiff-Evolved/blob/90fb1331201a4b29488089e4fbffc0d82cc6d0a9/animatediff/sample_settings.py#L465
+def apply_freenoise(noise: torch.Tensor, dim: int, context_length: int, context_overlap: int, seed: int):
+    logging.info("Context windows: Applying FreeNoise")
+    generator = torch.Generator(device='cpu').manual_seed(seed)
+    latent_video_length = noise.shape[dim]
+    delta = context_length - context_overlap
+
+    for start_idx in range(0, latent_video_length - context_length, delta):
+        place_idx = start_idx + context_length
+
+        actual_delta = min(delta, latent_video_length - place_idx)
+        if actual_delta <= 0:
+            break
+
+        list_idx = torch.randperm(actual_delta, generator=generator, device='cpu') + start_idx
+
+        source_slice = [slice(None)] * noise.ndim
+        source_slice[dim] = list_idx
+        target_slice = [slice(None)] * noise.ndim
+        target_slice[dim] = slice(place_idx, place_idx + actual_delta)
+
+        noise[tuple(target_slice)] = noise[tuple(source_slice)]
+
+    return noise
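`apply_freenoise` copies already-generated noise frames, in shuffled order, into the non-overlapping tail of each context window, so overlapping windows see correlated noise rather than fully independent noise. A toy run of the same shuffle on a tiny 1-D "video"; frame count and window sizes here are made up:

```python
import torch

def apply_freenoise(noise, dim, context_length, context_overlap, seed):
    """Repeat earlier noise frames (in shuffled order) into later window positions."""
    generator = torch.Generator(device='cpu').manual_seed(seed)
    latent_video_length = noise.shape[dim]
    delta = context_length - context_overlap
    for start_idx in range(0, latent_video_length - context_length, delta):
        place_idx = start_idx + context_length
        actual_delta = min(delta, latent_video_length - place_idx)
        if actual_delta <= 0:
            break
        # pick `actual_delta` source frames starting at the current window, shuffled
        list_idx = torch.randperm(actual_delta, generator=generator, device='cpu') + start_idx
        source = [slice(None)] * noise.ndim
        source[dim] = list_idx
        target = [slice(None)] * noise.ndim
        target[dim] = slice(place_idx, place_idx + actual_delta)
        noise[tuple(target)] = noise[tuple(source)]
    return noise

noise = torch.randn(12, 4)   # 12 frames, 4 noise channels each
out = apply_freenoise(noise.clone(), dim=0, context_length=4, context_overlap=2, seed=0)
# Frames 4-5 are a permutation of frames 0-1 after the shuffle:
print(torch.equal(out[0:2].sort(0).values, out[4:6].sort(0).values))  # True
```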
@@ -1557,10 +1557,13 @@ def sample_er_sde(model, x, sigmas, extra_args=None, callback=None, disable=None
 
 
 @torch.no_grad()
-def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=0.5):
+def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=0.5, solver_type="phi_1"):
     """SEEDS-2 - Stochastic Explicit Exponential Derivative-free Solvers (VP Data Prediction) stage 2.
     arXiv: https://arxiv.org/abs/2305.14267 (NeurIPS 2023)
     """
+    if solver_type not in {"phi_1", "phi_2"}:
+        raise ValueError("solver_type must be 'phi_1' or 'phi_2'")
+
     extra_args = {} if extra_args is None else extra_args
     seed = extra_args.get("seed", None)
     noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
@@ -1600,8 +1603,14 @@ def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=Non
             denoised_2 = model(x_2, sigma_s_1 * s_in, **extra_args)
 
             # Step 2
-            denoised_d = torch.lerp(denoised, denoised_2, fac)
-            x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x - alpha_t * ei_h_phi_1(-h_eta) * denoised_d
+            if solver_type == "phi_1":
+                denoised_d = torch.lerp(denoised, denoised_2, fac)
+                x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x - alpha_t * ei_h_phi_1(-h_eta) * denoised_d
+            elif solver_type == "phi_2":
+                b2 = ei_h_phi_2(-h_eta) / r
+                b1 = ei_h_phi_1(-h_eta) - b2
+                x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x - alpha_t * (b1 * denoised + b2 * denoised_2)
+
             if inject_noise:
                 segment_factor = (r - 1) * h * eta
                 sde_noise = sde_noise * segment_factor.exp()
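The new `phi_2` branch replaces the linear interpolation of the two denoiser evaluations with second-order exponential-integrator weights, b2 = phi_2(-h_eta)/r and b1 = phi_1(-h_eta) - b2, so b1 + b2 still equals phi_1(-h_eta) and the combination behaves like a corrected data estimate. A sketch of the coefficient arithmetic only, assuming the usual definitions phi_1(z) = (exp(z) - 1)/z and phi_2(z) = (phi_1(z) - 1)/z for the file's `ei_h_phi_1`/`ei_h_phi_2` helpers (that correspondence is an assumption, not confirmed by the diff):

```python
import torch

def phi_1(z: torch.Tensor) -> torch.Tensor:
    # assumed form of ei_h_phi_1: (exp(z) - 1) / z
    return torch.expm1(z) / z

def phi_2(z: torch.Tensor) -> torch.Tensor:
    # assumed form of ei_h_phi_2: (phi_1(z) - 1) / z
    return (phi_1(z) - 1.0) / z

h_eta = torch.tensor(0.3)   # made-up value of the h_eta step quantity used in the sampler
r = 0.5                     # intermediate point, the sampler's default

b2 = phi_2(-h_eta) / r
b1 = phi_1(-h_eta) - b2
print(b1 + b2, phi_1(-h_eta))   # identical: the two weights sum to phi_1(-h_eta)
```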
@@ -6,6 +6,7 @@ class LatentFormat:
     latent_dimensions = 2
     latent_rgb_factors = None
     latent_rgb_factors_bias = None
+    latent_rgb_factors_reshape = None
     taesd_decoder_name = None
 
     def process_in(self, latent):
@@ -181,6 +182,45 @@ class Flux(SD3):
 class Flux2(LatentFormat):
     latent_channels = 128
 
+    def __init__(self):
+        self.latent_rgb_factors =[
+            [0.0058, 0.0113, 0.0073],
+            [0.0495, 0.0443, 0.0836],
+            [-0.0099, 0.0096, 0.0644],
+            [0.2144, 0.3009, 0.3652],
+            [0.0166, -0.0039, -0.0054],
+            [0.0157, 0.0103, -0.0160],
+            [-0.0398, 0.0902, -0.0235],
+            [-0.0052, 0.0095, 0.0109],
+            [-0.3527, -0.2712, -0.1666],
+            [-0.0301, -0.0356, -0.0180],
+            [-0.0107, 0.0078, 0.0013],
+            [0.0746, 0.0090, -0.0941],
+            [0.0156, 0.0169, 0.0070],
+            [-0.0034, -0.0040, -0.0114],
+            [0.0032, 0.0181, 0.0080],
+            [-0.0939, -0.0008, 0.0186],
+            [0.0018, 0.0043, 0.0104],
+            [0.0284, 0.0056, -0.0127],
+            [-0.0024, -0.0022, -0.0030],
+            [0.1207, -0.0026, 0.0065],
+            [0.0128, 0.0101, 0.0142],
+            [0.0137, -0.0072, -0.0007],
+            [0.0095, 0.0092, -0.0059],
+            [0.0000, -0.0077, -0.0049],
+            [-0.0465, -0.0204, -0.0312],
+            [0.0095, 0.0012, -0.0066],
+            [0.0290, -0.0034, 0.0025],
+            [0.0220, 0.0169, -0.0048],
+            [-0.0332, -0.0457, -0.0468],
+            [-0.0085, 0.0389, 0.0609],
+            [-0.0076, 0.0003, -0.0043],
+            [-0.0111, -0.0460, -0.0614],
+        ]
+
+        self.latent_rgb_factors_bias = [-0.0329, -0.0718, -0.0851]
+        self.latent_rgb_factors_reshape = lambda t: t.reshape(t.shape[0], 32, 2, 2, t.shape[-2], t.shape[-1]).permute(0, 1, 4, 2, 5, 3).reshape(t.shape[0], 32, t.shape[-2] * 2, t.shape[-1] * 2)
+
     def process_in(self, latent):
         return latent
 
@@ -391,6 +431,7 @@ class HunyuanVideo(LatentFormat):
     ]
 
     latent_rgb_factors_bias = [ 0.0259, -0.0192, -0.0761]
+    taesd_decoder_name = "taehv"
 
 class Cosmos1CV8x8x8(LatentFormat):
     latent_channels = 16
@@ -454,7 +495,7 @@ class Wan21(LatentFormat):
         ]).view(1, self.latent_channels, 1, 1, 1)
 
 
-        self.taesd_decoder_name = None #TODO
+        self.taesd_decoder_name = "lighttaew2_1"
 
     def process_in(self, latent):
         latents_mean = self.latents_mean.to(latent.device, latent.dtype)
@@ -525,6 +566,7 @@ class Wan22(Wan21):
 
     def __init__(self):
         self.scale_factor = 1.0
+        self.taesd_decoder_name = "lighttaew2_2"
         self.latents_mean = torch.tensor([
             -0.2289, -0.0052, -0.1323, -0.2339, -0.2799, 0.0174, 0.1838, 0.1557,
             -0.1382, 0.0542, 0.2813, 0.0891, 0.1570, -0.0098, 0.0375, -0.1825,
@@ -679,6 +721,7 @@ class HunyuanVideo15(LatentFormat):
     latent_channels = 32
     latent_dimensions = 3
     scale_factor = 1.03682
+    taesd_decoder_name = "lighttaehy1_5"
 
 class Hunyuan3Dv2(LatentFormat):
     latent_channels = 64
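The new `latent_rgb_factors_reshape` lambda for Flux 2 unpacks the 128-channel latent back to 32 channels at twice the spatial resolution before the RGB projection is applied; the reshape implies that each 2x2 patch of a 32-channel latent was packed into the 128 channels. A shape-only sketch of that unpacking, with arbitrary batch and spatial sizes:

```python
import torch

# Packed Flux 2 latent: (batch, 128, H, W), where 128 = 32 channels * 2 * 2 patch.
t = torch.randn(1, 128, 8, 8)

# Same operation as latent_rgb_factors_reshape: split 128 -> (32, 2, 2),
# then interleave the 2x2 factors back into the spatial dimensions.
unpacked = (
    t.reshape(t.shape[0], 32, 2, 2, t.shape[-2], t.shape[-1])
     .permute(0, 1, 4, 2, 5, 3)
     .reshape(t.shape[0], 32, t.shape[-2] * 2, t.shape[-1] * 2)
)
print(unpacked.shape)  # torch.Size([1, 32, 16, 16])
```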
@@ -40,7 +40,8 @@ class ChromaParams:
     out_dim: int
     hidden_dim: int
     n_layers: int
+    txt_ids_dims: list
+    vec_in_dim: int
 
 
 
@@ -37,7 +37,7 @@ class ChromaRadianceParams(ChromaParams):
     nerf_final_head_type: str
     # None means use the same dtype as the model.
     nerf_embedder_dtype: Optional[torch.dtype]
+    use_x0: bool
 
 class ChromaRadiance(Chroma):
     """
@@ -159,6 +159,9 @@ class ChromaRadiance(Chroma):
         self.skip_dit = []
         self.lite = False
 
+        if params.use_x0:
+            self.register_buffer("__x0__", torch.tensor([]))
+
     @property
     def _nerf_final_layer(self) -> nn.Module:
         if self.params.nerf_final_head_type == "linear":
@@ -276,6 +279,12 @@ class ChromaRadiance(Chroma):
         params_dict |= overrides
         return params.__class__(**params_dict)
 
+    def _apply_x0_residual(self, predicted, noisy, timesteps):
+
+        # non zero during training to prevent 0 div
+        eps = 0.0
+        return (noisy - predicted) / (timesteps.view(-1,1,1,1) + eps)
+
     def _forward(
         self,
         x: Tensor,
@@ -316,4 +325,11 @@ class ChromaRadiance(Chroma):
             transformer_options,
             attn_mask=kwargs.get("attention_mask", None),
         )
-        return self.forward_nerf(img, img_out, params)[:, :, :h, :w]
+
+        out = self.forward_nerf(img, img_out, params)[:, :, :h, :w]
+
+        # If x0 variant → v-pred, just return this instead
+        if hasattr(self, "__x0__"):
+            out = self._apply_x0_residual(out, img, timestep)
+        return out
+
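The x0 variant of Chroma Radiance predicts the clean image directly, while the surrounding pipeline expects the model's usual velocity-style output, so `_apply_x0_residual` converts the prediction as (noisy - predicted) / t. A numeric sanity check of that conversion with toy tensors, assuming the interpolation convention noisy = x0 + t * v (the diff itself only shows the division):

```python
import torch

def apply_x0_residual(predicted, noisy, timesteps, eps=0.0):
    # Convert an x0 (clean-image) prediction into the residual/velocity form
    # the rest of the sampler expects: (noisy - x0) / t.
    return (noisy - predicted) / (timesteps.view(-1, 1, 1, 1) + eps)

t = torch.tensor([0.5])                 # current timestep for a batch of 1
x0 = torch.zeros(1, 3, 4, 4)            # pretend the model predicts a clean image of zeros
noisy = torch.ones(1, 3, 4, 4) * 0.5    # the noisy input at that timestep

v = apply_x0_residual(x0, noisy, t)
print(v.mean().item())                  # 1.0, so noisy == x0 + t * v for this toy case
```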
@@ -57,6 +57,35 @@ class MLPEmbedder(nn.Module):
     def forward(self, x: Tensor) -> Tensor:
         return self.out_layer(self.silu(self.in_layer(x)))
 
+class YakMLP(nn.Module):
+    def __init__(self, hidden_size: int, intermediate_size: int, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.gate_proj = operations.Linear(self.hidden_size, self.intermediate_size, bias=True, dtype=dtype, device=device)
+        self.up_proj = operations.Linear(self.hidden_size, self.intermediate_size, bias=True, dtype=dtype, device=device)
+        self.down_proj = operations.Linear(self.intermediate_size, self.hidden_size, bias=True, dtype=dtype, device=device)
+        self.act_fn = nn.SiLU()
+
+    def forward(self, x: Tensor) -> Tensor:
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+
+def build_mlp(hidden_size, mlp_hidden_dim, mlp_silu_act=False, yak_mlp=False, dtype=None, device=None, operations=None):
+    if yak_mlp:
+        return YakMLP(hidden_size, mlp_hidden_dim, dtype=dtype, device=device, operations=operations)
+    if mlp_silu_act:
+        return nn.Sequential(
+            operations.Linear(hidden_size, mlp_hidden_dim * 2, bias=False, dtype=dtype, device=device),
+            SiLUActivation(),
+            operations.Linear(mlp_hidden_dim, hidden_size, bias=False, dtype=dtype, device=device),
+        )
+    else:
+        return nn.Sequential(
+            operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
+            nn.GELU(approximate="tanh"),
+            operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
+        )
+
 class RMSNorm(torch.nn.Module):
     def __init__(self, dim: int, dtype=None, device=None, operations=None):
@@ -140,7 +169,7 @@ class SiLUActivation(nn.Module):
 
 
 class DoubleStreamBlock(nn.Module):
-    def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, modulation=True, mlp_silu_act=False, proj_bias=True, dtype=None, device=None, operations=None):
+    def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, modulation=True, mlp_silu_act=False, proj_bias=True, yak_mlp=False, dtype=None, device=None, operations=None):
         super().__init__()
 
         mlp_hidden_dim = int(hidden_size * mlp_ratio)
@@ -156,18 +185,7 @@ class DoubleStreamBlock(nn.Module):
 
         self.img_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
 
-        if mlp_silu_act:
-            self.img_mlp = nn.Sequential(
-                operations.Linear(hidden_size, mlp_hidden_dim * 2, bias=False, dtype=dtype, device=device),
-                SiLUActivation(),
-                operations.Linear(mlp_hidden_dim, hidden_size, bias=False, dtype=dtype, device=device),
-            )
-        else:
-            self.img_mlp = nn.Sequential(
-                operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
-                nn.GELU(approximate="tanh"),
-                operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
-            )
+        self.img_mlp = build_mlp(hidden_size, mlp_hidden_dim, mlp_silu_act=mlp_silu_act, yak_mlp=yak_mlp, dtype=dtype, device=device, operations=operations)
 
         if self.modulation:
             self.txt_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations)
@@ -177,18 +195,7 @@ class DoubleStreamBlock(nn.Module):
 
         self.txt_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
 
-        if mlp_silu_act:
-            self.txt_mlp = nn.Sequential(
-                operations.Linear(hidden_size, mlp_hidden_dim * 2, bias=False, dtype=dtype, device=device),
-                SiLUActivation(),
-                operations.Linear(mlp_hidden_dim, hidden_size, bias=False, dtype=dtype, device=device),
-            )
-        else:
-            self.txt_mlp = nn.Sequential(
-                operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
-                nn.GELU(approximate="tanh"),
-                operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
-            )
+        self.txt_mlp = build_mlp(hidden_size, mlp_hidden_dim, mlp_silu_act=mlp_silu_act, yak_mlp=yak_mlp, dtype=dtype, device=device, operations=operations)
 
         self.flipped_img_txt = flipped_img_txt
 
@@ -275,6 +282,7 @@ class SingleStreamBlock(nn.Module):
         modulation=True,
         mlp_silu_act=False,
         bias=True,
+        yak_mlp=False,
         dtype=None,
         device=None,
         operations=None
@@ -288,12 +296,17 @@ class SingleStreamBlock(nn.Module):
         self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
 
         self.mlp_hidden_dim_first = self.mlp_hidden_dim
+        self.yak_mlp = yak_mlp
         if mlp_silu_act:
             self.mlp_hidden_dim_first = int(hidden_size * mlp_ratio * 2)
             self.mlp_act = SiLUActivation()
         else:
             self.mlp_act = nn.GELU(approximate="tanh")
 
+        if self.yak_mlp:
+            self.mlp_hidden_dim_first *= 2
+            self.mlp_act = nn.SiLU()
+
         # qkv and mlp_in
         self.linear1 = operations.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim_first, bias=bias, dtype=dtype, device=device)
         # proj and mlp_out
@@ -325,7 +338,10 @@ class SingleStreamBlock(nn.Module):
         attn = attention(q, k, v, pe=pe, mask=attn_mask, transformer_options=transformer_options)
         del q, k, v
         # compute activation in mlp stream, cat again and run second linear layer
-        mlp = self.mlp_act(mlp)
+        if self.yak_mlp:
+            mlp = self.mlp_act(mlp[..., self.mlp_hidden_dim_first // 2:]) * mlp[..., :self.mlp_hidden_dim_first // 2]
+        else:
+            mlp = self.mlp_act(mlp)
         output = self.linear2(torch.cat((attn, mlp), 2))
         x += apply_mod(output, mod.gate, None, modulation_dims)
         if x.dtype == torch.float16:
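`YakMLP` is a gated feed-forward block of the SwiGLU family: two parallel projections (gate and up), a SiLU on the gate output, an elementwise product, then a down projection. The fused single-stream variant packs both projections into one wide matmul and splits the result at `mlp_hidden_dim_first // 2` before gating. A plain nn.Linear sketch of both views; the real code routes its layers through `operations`, and the made-up sizes are for illustration only:

```python
import torch
import torch.nn as nn

hidden, inter = 8, 16

class TinyYakMLP(nn.Module):
    """Gated MLP: down( SiLU(gate(x)) * up(x) )."""
    def __init__(self):
        super().__init__()
        self.gate_proj = nn.Linear(hidden, inter, bias=True)
        self.up_proj = nn.Linear(hidden, inter, bias=True)
        self.down_proj = nn.Linear(inter, hidden, bias=True)
        self.act_fn = nn.SiLU()

    def forward(self, x):
        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))

x = torch.randn(2, hidden)
print(TinyYakMLP()(x).shape)            # torch.Size([2, 8])

# Fused form used in the single-stream block: one projection of width 2*inter,
# then gate one half with SiLU and multiply by the other half.
fused = nn.Linear(hidden, 2 * inter)(x)
mlp = nn.SiLU()(fused[..., inter:]) * fused[..., :inter]
print(mlp.shape)                        # torch.Size([2, 16])
```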
@@ -15,7 +15,8 @@ from .layers import (
     MLPEmbedder,
     SingleStreamBlock,
     timestep_embedding,
-    Modulation
+    Modulation,
+    RMSNorm
 )
 
 @dataclass
@@ -34,11 +35,14 @@ class FluxParams:
     patch_size: int
     qkv_bias: bool
     guidance_embed: bool
+    txt_ids_dims: list
     global_modulation: bool = False
     mlp_silu_act: bool = False
     ops_bias: bool = True
     default_ref_method: str = "offset"
     ref_index_scale: float = 1.0
+    yak_mlp: bool = False
+    txt_norm: bool = False
 
 
 class Flux(nn.Module):
@@ -76,6 +80,11 @@ class Flux(nn.Module):
         )
         self.txt_in = operations.Linear(params.context_in_dim, self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device)
 
+        if params.txt_norm:
+            self.txt_norm = RMSNorm(params.context_in_dim, dtype=dtype, device=device, operations=operations)
+        else:
+            self.txt_norm = None
+
         self.double_blocks = nn.ModuleList(
             [
                 DoubleStreamBlock(
@@ -86,6 +95,7 @@ class Flux(nn.Module):
                     modulation=params.global_modulation is False,
                     mlp_silu_act=params.mlp_silu_act,
                     proj_bias=params.ops_bias,
+                    yak_mlp=params.yak_mlp,
                     dtype=dtype, device=device, operations=operations
                 )
                 for _ in range(params.depth)
@@ -94,7 +104,7 @@ class Flux(nn.Module):
 
         self.single_blocks = nn.ModuleList(
             [
-                SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, modulation=params.global_modulation is False, mlp_silu_act=params.mlp_silu_act, bias=params.ops_bias, dtype=dtype, device=device, operations=operations)
+                SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, modulation=params.global_modulation is False, mlp_silu_act=params.mlp_silu_act, bias=params.ops_bias, yak_mlp=params.yak_mlp, dtype=dtype, device=device, operations=operations)
                 for _ in range(params.depth_single_blocks)
             ]
         )
@@ -150,6 +160,8 @@ class Flux(nn.Module):
             y = torch.zeros((img.shape[0], self.params.vec_in_dim), device=img.device, dtype=img.dtype)
         vec = vec + self.vector_in(y[:, :self.params.vec_in_dim])
 
+        if self.txt_norm is not None:
+            txt = self.txt_norm(txt)
         txt = self.txt_in(txt)
 
         vec_orig = vec
@@ -171,7 +183,10 @@ class Flux(nn.Module):
             pe = None
 
         blocks_replace = patches_replace.get("dit", {})
+        transformer_options["total_blocks"] = len(self.double_blocks)
+        transformer_options["block_type"] = "double"
         for i, block in enumerate(self.double_blocks):
+            transformer_options["block_index"] = i
             if ("double_block", i) in blocks_replace:
                 def block_wrap(args):
                     out = {}
@@ -215,7 +230,10 @@ class Flux(nn.Module):
         if self.params.global_modulation:
            vec, _ = self.single_stream_modulation(vec_orig)
 
+        transformer_options["total_blocks"] = len(self.single_blocks)
|
||||||
|
transformer_options["block_type"] = "single"
|
||||||
for i, block in enumerate(self.single_blocks):
|
for i, block in enumerate(self.single_blocks):
|
||||||
|
transformer_options["block_index"] = i
|
||||||
if ("single_block", i) in blocks_replace:
|
if ("single_block", i) in blocks_replace:
|
||||||
def block_wrap(args):
|
def block_wrap(args):
|
||||||
out = {}
|
out = {}
|
||||||
@ -326,8 +344,9 @@ class Flux(nn.Module):
|
|||||||
|
|
||||||
txt_ids = torch.zeros((bs, context.shape[1], len(self.params.axes_dim)), device=x.device, dtype=torch.float32)
|
txt_ids = torch.zeros((bs, context.shape[1], len(self.params.axes_dim)), device=x.device, dtype=torch.float32)
|
||||||
|
|
||||||
if len(self.params.axes_dim) == 4: # Flux 2
|
if len(self.params.txt_ids_dims) > 0:
|
||||||
txt_ids[:, :, 3] = torch.linspace(0, context.shape[1] - 1, steps=context.shape[1], device=x.device, dtype=torch.float32)
|
for i in self.params.txt_ids_dims:
|
||||||
|
txt_ids[:, :, i] = torch.linspace(0, context.shape[1] - 1, steps=context.shape[1], device=x.device, dtype=torch.float32)
|
||||||
|
|
||||||
out = self.forward_orig(img, img_ids, context, txt_ids, timestep, y, guidance, control, transformer_options, attn_mask=kwargs.get("attention_mask", None))
|
out = self.forward_orig(img, img_ids, context, txt_ids, timestep, y, guidance, control, transformer_options, attn_mask=kwargs.get("attention_mask", None))
|
||||||
out = out[:, :img_tokens]
|
out = out[:, :img_tokens]
|
||||||
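The txt_ids_dims change above generalizes the old Flux-2-only special case: every axis listed in txt_ids_dims now receives a per-token index ramp in the text positional ids. A small sketch with made-up shapes:

import torch

bs, seq_len, n_axes = 2, 5, 4
txt_ids_dims = [3]                       # e.g. only the last axis carries text position
txt_ids = torch.zeros((bs, seq_len, n_axes))
for i in txt_ids_dims:
    txt_ids[:, :, i] = torch.linspace(0, seq_len - 1, steps=seq_len)
print(txt_ids[0, :, 3])                  # tensor([0., 1., 2., 3., 4.])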
@@ -43,6 +43,7 @@ class HunyuanVideoParams:
     meanflow: bool
     use_cond_type_embedding: bool
     vision_in_dim: int
+    meanflow_sum: bool


 class SelfAttentionRef(nn.Module):
@@ -317,7 +318,7 @@ class HunyuanVideo(nn.Module):
             timesteps_r = transformer_options['sample_sigmas'][w[0] + 1]
             timesteps_r = timesteps_r.unsqueeze(0).to(device=timesteps.device, dtype=timesteps.dtype)
             vec_r = self.time_r_in(timestep_embedding(timesteps_r, 256, time_factor=1000.0).to(img.dtype))
-            vec = (vec + vec_r) / 2
+            vec = (vec + vec_r) if self.params.meanflow_sum else (vec + vec_r) / 2

         if ref_latent is not None:
             ref_latent_ids = self.img_ids(ref_latent)
@@ -1,7 +1,8 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from comfy.ldm.hunyuan_video.vae_refiner import RMS_norm, ResnetBlock, VideoConv3d
+from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, VideoConv3d
+from comfy.ldm.hunyuan_video.vae_refiner import RMS_norm
 import model_management, model_patcher

 class SRResidualCausalBlock3D(nn.Module):
@@ -1,42 +1,12 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, AttnBlock, VideoConv3d, Normalize
+from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, AttnBlock, CarriedConv3d, Normalize, conv_carry_causal_3d, torch_cat_if_needed
 import comfy.ops
 import comfy.ldm.models.autoencoder
 import comfy.model_management
 ops = comfy.ops.disable_weight_init

-class NoPadConv3d(nn.Module):
-    def __init__(self, n_channels, out_channels, kernel_size, stride=1, dilation=1, padding=0, **kwargs):
-        super().__init__()
-        self.conv = ops.Conv3d(n_channels, out_channels, kernel_size, stride=stride, dilation=dilation, **kwargs)
-
-    def forward(self, x):
-        return self.conv(x)
-
-
-def conv_carry_causal_3d(xl, op, conv_carry_in=None, conv_carry_out=None):
-
-    x = xl[0]
-    xl.clear()
-
-    if conv_carry_out is not None:
-        to_push = x[:, :, -2:, :, :].clone()
-        conv_carry_out.append(to_push)
-
-    if isinstance(op, NoPadConv3d):
-        if conv_carry_in is None:
-            x = torch.nn.functional.pad(x, (1, 1, 1, 1, 2, 0), mode = 'replicate')
-        else:
-            carry_len = conv_carry_in[0].shape[2]
-            x = torch.cat([conv_carry_in.pop(0), x], dim=2)
-            x = torch.nn.functional.pad(x, (1, 1, 1, 1, 2 - carry_len, 0), mode = 'replicate')
-
-    out = op(x)
-
-    return out
-
-
 class RMS_norm(nn.Module):
     def __init__(self, dim):
@@ -49,7 +19,7 @@ class RMS_norm(nn.Module):
         return F.normalize(x, dim=1) * self.scale * comfy.model_management.cast_to(self.gamma, dtype=x.dtype, device=x.device)

 class DnSmpl(nn.Module):
-    def __init__(self, ic, oc, tds=True, refiner_vae=True, op=VideoConv3d):
+    def __init__(self, ic, oc, tds, refiner_vae, op):
         super().__init__()
         fct = 2 * 2 * 2 if tds else 1 * 2 * 2
         assert oc % fct == 0
@@ -109,7 +79,7 @@ class DnSmpl(nn.Module):


 class UpSmpl(nn.Module):
-    def __init__(self, ic, oc, tus=True, refiner_vae=True, op=VideoConv3d):
+    def __init__(self, ic, oc, tus, refiner_vae, op):
         super().__init__()
         fct = 2 * 2 * 2 if tus else 1 * 2 * 2
         self.conv = op(ic, oc * fct, kernel_size=3, stride=1, padding=1)
@@ -163,23 +133,6 @@ class UpSmpl(nn.Module):

         return h + x

-class HunyuanRefinerResnetBlock(ResnetBlock):
-    def __init__(self, in_channels, out_channels, conv_op=NoPadConv3d, norm_op=RMS_norm):
-        super().__init__(in_channels=in_channels, out_channels=out_channels, temb_channels=0, conv_op=conv_op, norm_op=norm_op)
-
-    def forward(self, x, conv_carry_in=None, conv_carry_out=None):
-        h = x
-        h = [ self.swish(self.norm1(x)) ]
-        h = conv_carry_causal_3d(h, self.conv1, conv_carry_in=conv_carry_in, conv_carry_out=conv_carry_out)
-
-        h = [ self.dropout(self.swish(self.norm2(h))) ]
-        h = conv_carry_causal_3d(h, self.conv2, conv_carry_in=conv_carry_in, conv_carry_out=conv_carry_out)
-
-        if self.in_channels != self.out_channels:
-            x = self.nin_shortcut(x)
-
-        return x+h
-
 class Encoder(nn.Module):
     def __init__(self, in_channels, z_channels, block_out_channels, num_res_blocks,
                  ffactor_spatial, ffactor_temporal, downsample_match_channel=True, refiner_vae=True, **_):
@@ -191,7 +144,7 @@ class Encoder(nn.Module):

         self.refiner_vae = refiner_vae
         if self.refiner_vae:
-            conv_op = NoPadConv3d
+            conv_op = CarriedConv3d
             norm_op = RMS_norm
         else:
             conv_op = ops.Conv3d
@@ -206,9 +159,10 @@ class Encoder(nn.Module):

         for i, tgt in enumerate(block_out_channels):
             stage = nn.Module()
-            stage.block = nn.ModuleList([HunyuanRefinerResnetBlock(in_channels=ch if j == 0 else tgt,
+            stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt,
                                                      out_channels=tgt,
-                                                     conv_op=conv_op, norm_op=norm_op)
+                                                     temb_channels=0,
+                                                     conv_op=conv_op, norm_op=norm_op)
                                          for j in range(num_res_blocks)])
             ch = tgt
             if i < depth:
@@ -218,9 +172,9 @@ class Encoder(nn.Module):
             self.down.append(stage)

         self.mid = nn.Module()
-        self.mid.block_1 = HunyuanRefinerResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
+        self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
         self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=norm_op)
-        self.mid.block_2 = HunyuanRefinerResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
+        self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)

         self.norm_out = norm_op(ch)
         self.conv_out = conv_op(ch, z_channels << 1, 3, 1, 1)
@@ -246,22 +200,20 @@ class Encoder(nn.Module):
             conv_carry_out = []
             if i == len(x) - 1:
                 conv_carry_out = None

             x1 = [ x1 ]
             x1 = conv_carry_causal_3d(x1, self.conv_in, conv_carry_in, conv_carry_out)

             for stage in self.down:
                 for blk in stage.block:
-                    x1 = blk(x1, conv_carry_in, conv_carry_out)
+                    x1 = blk(x1, None, conv_carry_in, conv_carry_out)
                 if hasattr(stage, 'downsample'):
                     x1 = stage.downsample(x1, conv_carry_in, conv_carry_out)

             out.append(x1)
             conv_carry_in = conv_carry_out

-        if len(out) > 1:
-            out = torch.cat(out, dim=2)
-        else:
-            out = out[0]
+        out = torch_cat_if_needed(out, dim=2)

         x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(out)))
         del out
@@ -288,7 +240,7 @@ class Decoder(nn.Module):

         self.refiner_vae = refiner_vae
         if self.refiner_vae:
-            conv_op = NoPadConv3d
+            conv_op = CarriedConv3d
             norm_op = RMS_norm
         else:
             conv_op = ops.Conv3d
@@ -298,9 +250,9 @@ class Decoder(nn.Module):
         self.conv_in = conv_op(z_channels, ch, kernel_size=3, stride=1, padding=1)

         self.mid = nn.Module()
-        self.mid.block_1 = HunyuanRefinerResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
+        self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
         self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=norm_op)
-        self.mid.block_2 = HunyuanRefinerResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
+        self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)

         self.up = nn.ModuleList()
         depth = (ffactor_spatial >> 1).bit_length()
@@ -308,9 +260,10 @@ class Decoder(nn.Module):

         for i, tgt in enumerate(block_out_channels):
             stage = nn.Module()
-            stage.block = nn.ModuleList([HunyuanRefinerResnetBlock(in_channels=ch if j == 0 else tgt,
+            stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt,
                                                      out_channels=tgt,
-                                                     conv_op=conv_op, norm_op=norm_op)
+                                                     temb_channels=0,
+                                                     conv_op=conv_op, norm_op=norm_op)
                                          for j in range(num_res_blocks + 1)])
             ch = tgt
             if i < depth:
@@ -340,7 +293,7 @@ class Decoder(nn.Module):
                 conv_carry_out = None
             for stage in self.up:
                 for blk in stage.block:
-                    x1 = blk(x1, conv_carry_in, conv_carry_out)
+                    x1 = blk(x1, None, conv_carry_in, conv_carry_out)
                 if hasattr(stage, 'upsample'):
                     x1 = stage.upsample(x1, conv_carry_in, conv_carry_out)

@@ -350,10 +303,7 @@ class Decoder(nn.Module):
             conv_carry_in = conv_carry_out
             del x

-        if len(out) > 1:
-            out = torch.cat(out, dim=2)
-        else:
-            out = out[0]
+        out = torch_cat_if_needed(out, dim=2)

         if not self.refiner_vae:
             if z.shape[-3] == 1:
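The refiner VAE now reuses the shared CarriedConv3d / conv_carry_causal_3d helpers instead of its local NoPadConv3d copy. The idea, reconstructed from the code removed above, is to cache the last two temporal frames of each chunk and prepend them to the next chunk so a causal 3D convolution stays continuous across chunk boundaries. The helper below is a hypothetical sketch, not the ComfyUI implementation:

import torch
import torch.nn.functional as F

def carried_causal_conv3d(x, conv, carry_in=None):
    # remember the last 2 frames so the next chunk can be convolved causally
    carry_out = x[:, :, -2:].clone()
    if carry_in is None:
        x = F.pad(x, (1, 1, 1, 1, 2, 0), mode='replicate')   # causal padding in time
    else:
        x = torch.cat([carry_in, x], dim=2)                   # prepend cached frames
        x = F.pad(x, (1, 1, 1, 1, 0, 0), mode='replicate')    # only spatial padding needed
    return conv(x), carry_out

conv = torch.nn.Conv3d(4, 4, kernel_size=3)                   # no implicit padding, like NoPadConv3d
chunk1, chunk2 = torch.randn(1, 4, 4, 8, 8), torch.randn(1, 4, 4, 8, 8)
y1, carry = carried_causal_conv3d(chunk1, conv)
y2, _ = carried_causal_conv3d(chunk2, conv, carry_in=carry)   # same output shape per chunk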
comfy/ldm/kandinsky5/model.py  (new file, 413 lines)
@@ -0,0 +1,413 @@
import torch
|
||||||
|
from torch import nn
|
||||||
|
import math
|
||||||
|
|
||||||
|
import comfy.ldm.common_dit
|
||||||
|
from comfy.ldm.modules.attention import optimized_attention
|
||||||
|
from comfy.ldm.flux.math import apply_rope1
|
||||||
|
from comfy.ldm.flux.layers import EmbedND
|
||||||
|
|
||||||
|
def attention(q, k, v, heads, transformer_options={}):
|
||||||
|
return optimized_attention(
|
||||||
|
q.transpose(1, 2),
|
||||||
|
k.transpose(1, 2),
|
||||||
|
v.transpose(1, 2),
|
||||||
|
heads=heads,
|
||||||
|
skip_reshape=True,
|
||||||
|
transformer_options=transformer_options
|
||||||
|
)
|
||||||
|
|
||||||
|
def apply_scale_shift_norm(norm, x, scale, shift):
|
||||||
|
return torch.addcmul(shift, norm(x), scale + 1.0)
|
||||||
|
|
||||||
|
def apply_gate_sum(x, out, gate):
|
||||||
|
return torch.addcmul(x, gate, out)
|
||||||
|
|
||||||
|
def get_shift_scale_gate(params):
|
||||||
|
shift, scale, gate = torch.chunk(params, 3, dim=-1)
|
||||||
|
return tuple(x.unsqueeze(1) for x in (shift, scale, gate))
|
||||||
|
|
||||||
|
def get_freqs(dim, max_period=10000.0):
|
||||||
|
return torch.exp(-math.log(max_period) * torch.arange(start=0, end=dim, dtype=torch.float32) / dim)
|
||||||
|
|
||||||
|
|
||||||
|
class TimeEmbeddings(nn.Module):
|
||||||
|
def __init__(self, model_dim, time_dim, max_period=10000.0, operation_settings=None):
|
||||||
|
super().__init__()
|
||||||
|
assert model_dim % 2 == 0
|
||||||
|
self.model_dim = model_dim
|
||||||
|
self.max_period = max_period
|
||||||
|
self.register_buffer("freqs", get_freqs(model_dim // 2, max_period), persistent=False)
|
||||||
|
operations = operation_settings.get("operations")
|
||||||
|
self.in_layer = operations.Linear(model_dim, time_dim, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||||
|
self.activation = nn.SiLU()
|
||||||
|
self.out_layer = operations.Linear(time_dim, time_dim, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||||
|
|
||||||
|
def forward(self, timestep, dtype):
|
||||||
|
args = torch.outer(timestep, self.freqs.to(device=timestep.device))
|
||||||
|
time_embed = torch.cat([torch.cos(args), torch.sin(args)], dim=-1).to(dtype)
|
||||||
|
time_embed = self.out_layer(self.activation(self.in_layer(time_embed)))
|
||||||
|
return time_embed
|
||||||
|
|
||||||
|
|
||||||
|
class TextEmbeddings(nn.Module):
|
||||||
|
def __init__(self, text_dim, model_dim, operation_settings=None):
|
||||||
|
super().__init__()
|
||||||
|
operations = operation_settings.get("operations")
|
||||||
|
self.in_layer = operations.Linear(text_dim, model_dim, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||||
|
self.norm = operations.LayerNorm(model_dim, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||||
|
|
||||||
|
def forward(self, text_embed):
|
||||||
|
text_embed = self.in_layer(text_embed)
|
||||||
|
return self.norm(text_embed).type_as(text_embed)
|
||||||
|
|
||||||
|
|
||||||
|
class VisualEmbeddings(nn.Module):
|
||||||
|
def __init__(self, visual_dim, model_dim, patch_size, operation_settings=None):
|
||||||
|
super().__init__()
|
||||||
|
self.patch_size = patch_size
|
||||||
|
operations = operation_settings.get("operations")
|
||||||
|
self.in_layer = operations.Linear(visual_dim, model_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
x = x.movedim(1, -1) # B C T H W -> B T H W C
|
||||||
|
B, T, H, W, dim = x.shape
|
||||||
|
pt, ph, pw = self.patch_size
|
||||||
|
|
||||||
|
x = x.view(
|
||||||
|
B,
|
||||||
|
T // pt, pt,
|
||||||
|
H // ph, ph,
|
||||||
|
W // pw, pw,
|
||||||
|
dim,
|
||||||
|
).permute(0, 1, 3, 5, 2, 4, 6, 7).flatten(4, 7)
|
||||||
|
|
||||||
|
return self.in_layer(x)
|
||||||
|
|
||||||
|
|
||||||
|
class Modulation(nn.Module):
|
||||||
|
def __init__(self, time_dim, model_dim, num_params, operation_settings=None):
|
||||||
|
super().__init__()
|
||||||
|
self.activation = nn.SiLU()
|
||||||
|
self.out_layer = operation_settings.get("operations").Linear(time_dim, num_params * model_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
return self.out_layer(self.activation(x))
|
||||||
|
|
||||||
|
|
||||||
|
class SelfAttention(nn.Module):
|
||||||
|
def __init__(self, num_channels, head_dim, operation_settings=None):
|
||||||
|
super().__init__()
|
||||||
|
assert num_channels % head_dim == 0
|
||||||
|
self.num_heads = num_channels // head_dim
|
||||||
|
self.head_dim = head_dim
|
||||||
|
|
||||||
|
operations = operation_settings.get("operations")
|
||||||
|
self.to_query = operations.Linear(num_channels, num_channels, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||||
|
self.to_key = operations.Linear(num_channels, num_channels, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||||
|
self.to_value = operations.Linear(num_channels, num_channels, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||||
|
self.query_norm = operations.RMSNorm(head_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||||
|
self.key_norm = operations.RMSNorm(head_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||||
|
|
||||||
|
self.out_layer = operations.Linear(num_channels, num_channels, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||||
|
self.num_chunks = 2
|
||||||
|
|
||||||
|
def _compute_qk(self, x, freqs, proj_fn, norm_fn):
|
||||||
|
result = proj_fn(x).view(*x.shape[:-1], self.num_heads, -1)
|
||||||
|
return apply_rope1(norm_fn(result), freqs)
|
||||||
|
|
||||||
|
def _forward(self, x, freqs, transformer_options={}):
|
||||||
|
q = self._compute_qk(x, freqs, self.to_query, self.query_norm)
|
||||||
|
k = self._compute_qk(x, freqs, self.to_key, self.key_norm)
|
||||||
|
v = self.to_value(x).view(*x.shape[:-1], self.num_heads, -1)
|
||||||
|
out = attention(q, k, v, self.num_heads, transformer_options=transformer_options)
|
||||||
|
return self.out_layer(out)
|
||||||
|
|
||||||
|
def _forward_chunked(self, x, freqs, transformer_options={}):
|
||||||
|
def process_chunks(proj_fn, norm_fn):
|
||||||
|
x_chunks = torch.chunk(x, self.num_chunks, dim=1)
|
||||||
|
freqs_chunks = torch.chunk(freqs, self.num_chunks, dim=1)
|
||||||
|
chunks = []
|
||||||
|
for x_chunk, freqs_chunk in zip(x_chunks, freqs_chunks):
|
||||||
|
chunks.append(self._compute_qk(x_chunk, freqs_chunk, proj_fn, norm_fn))
|
||||||
|
return torch.cat(chunks, dim=1)
|
||||||
|
|
||||||
|
q = process_chunks(self.to_query, self.query_norm)
|
||||||
|
k = process_chunks(self.to_key, self.key_norm)
|
||||||
|
v = self.to_value(x).view(*x.shape[:-1], self.num_heads, -1)
|
||||||
|
out = attention(q, k, v, self.num_heads, transformer_options=transformer_options)
|
||||||
|
return self.out_layer(out)
|
||||||
|
|
||||||
|
def forward(self, x, freqs, transformer_options={}):
|
||||||
|
if x.shape[1] > 8192:
|
||||||
|
return self._forward_chunked(x, freqs, transformer_options=transformer_options)
|
||||||
|
else:
|
||||||
|
return self._forward(x, freqs, transformer_options=transformer_options)
|
||||||
|
|
||||||
|
|
||||||
|
class CrossAttention(SelfAttention):
|
||||||
|
def get_qkv(self, x, context):
|
||||||
|
q = self.to_query(x).view(*x.shape[:-1], self.num_heads, -1)
|
||||||
|
k = self.to_key(context).view(*context.shape[:-1], self.num_heads, -1)
|
||||||
|
v = self.to_value(context).view(*context.shape[:-1], self.num_heads, -1)
|
||||||
|
return q, k, v
|
||||||
|
|
||||||
|
def forward(self, x, context, transformer_options={}):
|
||||||
|
q, k, v = self.get_qkv(x, context)
|
||||||
|
out = attention(self.query_norm(q), self.key_norm(k), v, self.num_heads, transformer_options=transformer_options)
|
||||||
|
return self.out_layer(out)
|
||||||
|
|
||||||
|
|
||||||
|
class FeedForward(nn.Module):
|
||||||
|
def __init__(self, dim, ff_dim, operation_settings=None):
|
||||||
|
super().__init__()
|
||||||
|
operations = operation_settings.get("operations")
|
||||||
|
self.in_layer = operations.Linear(dim, ff_dim, bias=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||||
|
self.activation = nn.GELU()
|
||||||
|
self.out_layer = operations.Linear(ff_dim, dim, bias=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||||
|
self.num_chunks = 4
|
||||||
|
|
||||||
|
def _forward(self, x):
|
||||||
|
return self.out_layer(self.activation(self.in_layer(x)))
|
||||||
|
|
||||||
|
def _forward_chunked(self, x):
|
||||||
|
chunks = torch.chunk(x, self.num_chunks, dim=1)
|
||||||
|
output_chunks = []
|
||||||
|
for chunk in chunks:
|
||||||
|
output_chunks.append(self._forward(chunk))
|
||||||
|
return torch.cat(output_chunks, dim=1)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
if x.shape[1] > 8192:
|
||||||
|
return self._forward_chunked(x)
|
||||||
|
else:
|
||||||
|
return self._forward(x)
|
||||||
|
|
||||||
|
|
||||||
|
class OutLayer(nn.Module):
|
||||||
|
def __init__(self, model_dim, time_dim, visual_dim, patch_size, operation_settings=None):
|
||||||
|
super().__init__()
|
||||||
|
self.patch_size = patch_size
|
||||||
|
self.modulation = Modulation(time_dim, model_dim, 2, operation_settings=operation_settings)
|
||||||
|
operations = operation_settings.get("operations")
|
||||||
|
self.norm = operations.LayerNorm(model_dim, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||||
|
self.out_layer = operations.Linear(model_dim, math.prod(patch_size) * visual_dim, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||||
|
|
||||||
|
def forward(self, visual_embed, time_embed):
|
||||||
|
B, T, H, W, _ = visual_embed.shape
|
||||||
|
shift, scale = torch.chunk(self.modulation(time_embed), 2, dim=-1)
|
||||||
|
scale = scale[:, None, None, None, :]
|
||||||
|
shift = shift[:, None, None, None, :]
|
||||||
|
visual_embed = apply_scale_shift_norm(self.norm, visual_embed, scale, shift)
|
||||||
|
x = self.out_layer(visual_embed)
|
||||||
|
|
||||||
|
out_dim = x.shape[-1] // (self.patch_size[0] * self.patch_size[1] * self.patch_size[2])
|
||||||
|
x = x.view(
|
||||||
|
B, T, H, W,
|
||||||
|
out_dim,
|
||||||
|
self.patch_size[0], self.patch_size[1], self.patch_size[2]
|
||||||
|
)
|
||||||
|
return x.permute(0, 4, 1, 5, 2, 6, 3, 7).flatten(2, 3).flatten(3, 4).flatten(4, 5)
|
||||||
|
|
||||||
|
|
||||||
|
class TransformerEncoderBlock(nn.Module):
|
||||||
|
def __init__(self, model_dim, time_dim, ff_dim, head_dim, operation_settings=None):
|
||||||
|
super().__init__()
|
||||||
|
self.text_modulation = Modulation(time_dim, model_dim, 6, operation_settings=operation_settings)
|
||||||
|
operations = operation_settings.get("operations")
|
||||||
|
|
||||||
|
self.self_attention_norm = operations.LayerNorm(model_dim, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||||
|
self.self_attention = SelfAttention(model_dim, head_dim, operation_settings=operation_settings)
|
||||||
|
|
||||||
|
self.feed_forward_norm = operations.LayerNorm(model_dim, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||||
|
self.feed_forward = FeedForward(model_dim, ff_dim, operation_settings=operation_settings)
|
||||||
|
|
||||||
|
def forward(self, x, time_embed, freqs, transformer_options={}):
|
||||||
|
self_attn_params, ff_params = torch.chunk(self.text_modulation(time_embed), 2, dim=-1)
|
||||||
|
shift, scale, gate = get_shift_scale_gate(self_attn_params)
|
||||||
|
out = apply_scale_shift_norm(self.self_attention_norm, x, scale, shift)
|
||||||
|
out = self.self_attention(out, freqs, transformer_options=transformer_options)
|
||||||
|
x = apply_gate_sum(x, out, gate)
|
||||||
|
|
||||||
|
shift, scale, gate = get_shift_scale_gate(ff_params)
|
||||||
|
out = apply_scale_shift_norm(self.feed_forward_norm, x, scale, shift)
|
||||||
|
out = self.feed_forward(out)
|
||||||
|
x = apply_gate_sum(x, out, gate)
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
class TransformerDecoderBlock(nn.Module):
|
||||||
|
def __init__(self, model_dim, time_dim, ff_dim, head_dim, operation_settings=None):
|
||||||
|
super().__init__()
|
||||||
|
self.visual_modulation = Modulation(time_dim, model_dim, 9, operation_settings=operation_settings)
|
||||||
|
|
||||||
|
operations = operation_settings.get("operations")
|
||||||
|
self.self_attention_norm = operations.LayerNorm(model_dim, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||||
|
self.self_attention = SelfAttention(model_dim, head_dim, operation_settings=operation_settings)
|
||||||
|
|
||||||
|
self.cross_attention_norm = operations.LayerNorm(model_dim, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||||
|
self.cross_attention = CrossAttention(model_dim, head_dim, operation_settings=operation_settings)
|
||||||
|
|
||||||
|
self.feed_forward_norm = operations.LayerNorm(model_dim, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||||
|
self.feed_forward = FeedForward(model_dim, ff_dim, operation_settings=operation_settings)
|
||||||
|
|
||||||
|
def forward(self, visual_embed, text_embed, time_embed, freqs, transformer_options={}):
|
||||||
|
self_attn_params, cross_attn_params, ff_params = torch.chunk(self.visual_modulation(time_embed), 3, dim=-1)
|
||||||
|
# self attention
|
||||||
|
shift, scale, gate = get_shift_scale_gate(self_attn_params)
|
||||||
|
visual_out = apply_scale_shift_norm(self.self_attention_norm, visual_embed, scale, shift)
|
||||||
|
visual_out = self.self_attention(visual_out, freqs, transformer_options=transformer_options)
|
||||||
|
visual_embed = apply_gate_sum(visual_embed, visual_out, gate)
|
||||||
|
# cross attention
|
||||||
|
shift, scale, gate = get_shift_scale_gate(cross_attn_params)
|
||||||
|
visual_out = apply_scale_shift_norm(self.cross_attention_norm, visual_embed, scale, shift)
|
||||||
|
visual_out = self.cross_attention(visual_out, text_embed, transformer_options=transformer_options)
|
||||||
|
visual_embed = apply_gate_sum(visual_embed, visual_out, gate)
|
||||||
|
# feed forward
|
||||||
|
shift, scale, gate = get_shift_scale_gate(ff_params)
|
||||||
|
visual_out = apply_scale_shift_norm(self.feed_forward_norm, visual_embed, scale, shift)
|
||||||
|
visual_out = self.feed_forward(visual_out)
|
||||||
|
visual_embed = apply_gate_sum(visual_embed, visual_out, gate)
|
||||||
|
return visual_embed
|
||||||
|
|
||||||
|
|
||||||
|
class Kandinsky5(nn.Module):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
in_visual_dim=16, out_visual_dim=16, in_text_dim=3584, in_text_dim2=768, time_dim=512,
|
||||||
|
model_dim=1792, ff_dim=7168, visual_embed_dim=132, patch_size=(1, 2, 2), num_text_blocks=2, num_visual_blocks=32,
|
||||||
|
axes_dims=(16, 24, 24), rope_scale_factor=(1.0, 2.0, 2.0),
|
||||||
|
dtype=None, device=None, operations=None, **kwargs
|
||||||
|
):
|
||||||
|
super().__init__()
|
||||||
|
head_dim = sum(axes_dims)
|
||||||
|
self.rope_scale_factor = rope_scale_factor
|
||||||
|
self.in_visual_dim = in_visual_dim
|
||||||
|
self.model_dim = model_dim
|
||||||
|
self.patch_size = patch_size
|
||||||
|
self.visual_embed_dim = visual_embed_dim
|
||||||
|
self.dtype = dtype
|
||||||
|
self.device = device
|
||||||
|
operation_settings = {"operations": operations, "device": device, "dtype": dtype}
|
||||||
|
|
||||||
|
self.time_embeddings = TimeEmbeddings(model_dim, time_dim, operation_settings=operation_settings)
|
||||||
|
self.text_embeddings = TextEmbeddings(in_text_dim, model_dim, operation_settings=operation_settings)
|
||||||
|
self.pooled_text_embeddings = TextEmbeddings(in_text_dim2, time_dim, operation_settings=operation_settings)
|
||||||
|
self.visual_embeddings = VisualEmbeddings(visual_embed_dim, model_dim, patch_size, operation_settings=operation_settings)
|
||||||
|
|
||||||
|
self.text_transformer_blocks = nn.ModuleList(
|
||||||
|
[TransformerEncoderBlock(model_dim, time_dim, ff_dim, head_dim, operation_settings=operation_settings) for _ in range(num_text_blocks)]
|
||||||
|
)
|
||||||
|
|
||||||
|
self.visual_transformer_blocks = nn.ModuleList(
|
||||||
|
[TransformerDecoderBlock(model_dim, time_dim, ff_dim, head_dim, operation_settings=operation_settings) for _ in range(num_visual_blocks)]
|
||||||
|
)
|
||||||
|
|
||||||
|
self.out_layer = OutLayer(model_dim, time_dim, out_visual_dim, patch_size, operation_settings=operation_settings)
|
||||||
|
|
||||||
|
self.rope_embedder_3d = EmbedND(dim=head_dim, theta=10000.0, axes_dim=axes_dims)
|
||||||
|
self.rope_embedder_1d = EmbedND(dim=head_dim, theta=10000.0, axes_dim=[head_dim])
|
||||||
|
|
||||||
|
def rope_encode_1d(self, seq_len, seq_start=0, steps=None, device=None, dtype=None, transformer_options={}):
|
||||||
|
steps = seq_len if steps is None else steps
|
||||||
|
seq_ids = torch.linspace(seq_start, seq_start + (seq_len - 1), steps=steps, device=device, dtype=dtype)
|
||||||
|
seq_ids = seq_ids.reshape(-1, 1).unsqueeze(0) # Shape: (1, steps, 1)
|
||||||
|
freqs = self.rope_embedder_1d(seq_ids).movedim(1, 2)
|
||||||
|
return freqs
|
||||||
|
|
||||||
|
def rope_encode_3d(self, t, h, w, t_start=0, steps_t=None, steps_h=None, steps_w=None, device=None, dtype=None, transformer_options={}):
|
||||||
|
|
||||||
|
patch_size = self.patch_size
|
||||||
|
t_len = ((t + (patch_size[0] // 2)) // patch_size[0])
|
||||||
|
h_len = ((h + (patch_size[1] // 2)) // patch_size[1])
|
||||||
|
w_len = ((w + (patch_size[2] // 2)) // patch_size[2])
|
||||||
|
|
||||||
|
if steps_t is None:
|
||||||
|
steps_t = t_len
|
||||||
|
if steps_h is None:
|
||||||
|
steps_h = h_len
|
||||||
|
if steps_w is None:
|
||||||
|
steps_w = w_len
|
||||||
|
|
||||||
|
h_start = 0
|
||||||
|
w_start = 0
|
||||||
|
rope_options = transformer_options.get("rope_options", None)
|
||||||
|
if rope_options is not None:
|
||||||
|
t_len = (t_len - 1.0) * rope_options.get("scale_t", 1.0) + 1.0
|
||||||
|
h_len = (h_len - 1.0) * rope_options.get("scale_y", 1.0) + 1.0
|
||||||
|
w_len = (w_len - 1.0) * rope_options.get("scale_x", 1.0) + 1.0
|
||||||
|
|
||||||
|
t_start += rope_options.get("shift_t", 0.0)
|
||||||
|
h_start += rope_options.get("shift_y", 0.0)
|
||||||
|
w_start += rope_options.get("shift_x", 0.0)
|
||||||
|
else:
|
||||||
|
rope_scale_factor = self.rope_scale_factor
|
||||||
|
if self.model_dim == 4096: # pro video model uses different rope scaling at higher resolutions
|
||||||
|
if h * w >= 14080:
|
||||||
|
rope_scale_factor = (1.0, 3.16, 3.16)
|
||||||
|
|
||||||
|
t_len = (t_len - 1.0) / rope_scale_factor[0] + 1.0
|
||||||
|
h_len = (h_len - 1.0) / rope_scale_factor[1] + 1.0
|
||||||
|
w_len = (w_len - 1.0) / rope_scale_factor[2] + 1.0
|
||||||
|
|
||||||
|
img_ids = torch.zeros((steps_t, steps_h, steps_w, 3), device=device, dtype=dtype)
|
||||||
|
img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(t_start, t_start + (t_len - 1), steps=steps_t, device=device, dtype=dtype).reshape(-1, 1, 1)
|
||||||
|
img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(h_start, h_start + (h_len - 1), steps=steps_h, device=device, dtype=dtype).reshape(1, -1, 1)
|
||||||
|
img_ids[:, :, :, 2] = img_ids[:, :, :, 2] + torch.linspace(w_start, w_start + (w_len - 1), steps=steps_w, device=device, dtype=dtype).reshape(1, 1, -1)
|
||||||
|
img_ids = img_ids.reshape(1, -1, img_ids.shape[-1])
|
||||||
|
|
||||||
|
freqs = self.rope_embedder_3d(img_ids).movedim(1, 2)
|
||||||
|
return freqs
|
||||||
|
|
||||||
|
def forward_orig(self, x, timestep, context, y, freqs, freqs_text, transformer_options={}, **kwargs):
|
||||||
|
patches_replace = transformer_options.get("patches_replace", {})
|
||||||
|
context = self.text_embeddings(context)
|
||||||
|
time_embed = self.time_embeddings(timestep, x.dtype) + self.pooled_text_embeddings(y)
|
||||||
|
|
||||||
|
for block in self.text_transformer_blocks:
|
||||||
|
context = block(context, time_embed, freqs_text, transformer_options=transformer_options)
|
||||||
|
|
||||||
|
visual_embed = self.visual_embeddings(x)
|
||||||
|
visual_shape = visual_embed.shape[:-1]
|
||||||
|
visual_embed = visual_embed.flatten(1, -2)
|
||||||
|
|
||||||
|
blocks_replace = patches_replace.get("dit", {})
|
||||||
|
transformer_options["total_blocks"] = len(self.visual_transformer_blocks)
|
||||||
|
transformer_options["block_type"] = "double"
|
||||||
|
for i, block in enumerate(self.visual_transformer_blocks):
|
||||||
|
transformer_options["block_index"] = i
|
||||||
|
if ("double_block", i) in blocks_replace:
|
||||||
|
def block_wrap(args):
|
||||||
|
return block(x=args["x"], context=args["context"], time_embed=args["time_embed"], freqs=args["freqs"], transformer_options=args.get("transformer_options"))
|
||||||
|
visual_embed = blocks_replace[("double_block", i)]({"x": visual_embed, "context": context, "time_embed": time_embed, "freqs": freqs, "transformer_options": transformer_options}, {"original_block": block_wrap})["x"]
|
||||||
|
else:
|
||||||
|
visual_embed = block(visual_embed, context, time_embed, freqs=freqs, transformer_options=transformer_options)
|
||||||
|
|
||||||
|
visual_embed = visual_embed.reshape(*visual_shape, -1)
|
||||||
|
return self.out_layer(visual_embed, time_embed)
|
||||||
|
|
||||||
|
def _forward(self, x, timestep, context, y, time_dim_replace=None, transformer_options={}, **kwargs):
|
||||||
|
original_dims = x.ndim
|
||||||
|
if original_dims == 4:
|
||||||
|
x = x.unsqueeze(2)
|
||||||
|
bs, c, t_len, h, w = x.shape
|
||||||
|
x = comfy.ldm.common_dit.pad_to_patch_size(x, self.patch_size)
|
||||||
|
|
||||||
|
if time_dim_replace is not None:
|
||||||
|
time_dim_replace = comfy.ldm.common_dit.pad_to_patch_size(time_dim_replace, self.patch_size)
|
||||||
|
x[:, :time_dim_replace.shape[1], :time_dim_replace.shape[2]] = time_dim_replace
|
||||||
|
|
||||||
|
freqs = self.rope_encode_3d(t_len, h, w, device=x.device, dtype=x.dtype, transformer_options=transformer_options)
|
||||||
|
freqs_text = self.rope_encode_1d(context.shape[1], device=x.device, dtype=x.dtype, transformer_options=transformer_options)
|
||||||
|
|
||||||
|
out = self.forward_orig(x, timestep, context, y, freqs, freqs_text, transformer_options=transformer_options, **kwargs)
|
||||||
|
if original_dims == 4:
|
||||||
|
out = out.squeeze(2)
|
||||||
|
return out
|
||||||
|
|
||||||
|
def forward(self, x, timestep, context, y, time_dim_replace=None, transformer_options={}, **kwargs):
|
||||||
|
return comfy.patcher_extension.WrapperExecutor.new_class_executor(
|
||||||
|
self._forward,
|
||||||
|
self,
|
||||||
|
comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
|
||||||
|
).execute(x, timestep, context, y, time_dim_replace=time_dim_replace, transformer_options=transformer_options, **kwargs)
|
||||||
comfy/ldm/lumina/controlnet.py  (new file, 160 lines)
@@ -0,0 +1,160 @@
import torch
|
||||||
|
from torch import nn
|
||||||
|
|
||||||
|
from .model import JointTransformerBlock
|
||||||
|
|
||||||
|
class ZImageControlTransformerBlock(JointTransformerBlock):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
layer_id: int,
|
||||||
|
dim: int,
|
||||||
|
n_heads: int,
|
||||||
|
n_kv_heads: int,
|
||||||
|
multiple_of: int,
|
||||||
|
ffn_dim_multiplier: float,
|
||||||
|
norm_eps: float,
|
||||||
|
qk_norm: bool,
|
||||||
|
modulation=True,
|
||||||
|
block_id=0,
|
||||||
|
operation_settings=None,
|
||||||
|
):
|
||||||
|
super().__init__(layer_id, dim, n_heads, n_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, qk_norm, modulation, z_image_modulation=True, operation_settings=operation_settings)
|
||||||
|
self.block_id = block_id
|
||||||
|
if block_id == 0:
|
||||||
|
self.before_proj = operation_settings.get("operations").Linear(self.dim, self.dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||||
|
self.after_proj = operation_settings.get("operations").Linear(self.dim, self.dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||||
|
|
||||||
|
def forward(self, c, x, **kwargs):
|
||||||
|
if self.block_id == 0:
|
||||||
|
c = self.before_proj(c) + x
|
||||||
|
c = super().forward(c, **kwargs)
|
||||||
|
c_skip = self.after_proj(c)
|
||||||
|
return c_skip, c
|
||||||
|
|
||||||
|
class ZImage_Control(torch.nn.Module):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
dim: int = 3840,
|
||||||
|
n_heads: int = 30,
|
||||||
|
n_kv_heads: int = 30,
|
||||||
|
multiple_of: int = 256,
|
||||||
|
ffn_dim_multiplier: float = (8.0 / 3.0),
|
||||||
|
norm_eps: float = 1e-5,
|
||||||
|
qk_norm: bool = True,
|
||||||
|
n_control_layers=6,
|
||||||
|
control_in_dim=16,
|
||||||
|
additional_in_dim=0,
|
||||||
|
broken=False,
|
||||||
|
refiner_control=False,
|
||||||
|
dtype=None,
|
||||||
|
device=None,
|
||||||
|
operations=None,
|
||||||
|
**kwargs
|
||||||
|
):
|
||||||
|
super().__init__()
|
||||||
|
operation_settings = {"operations": operations, "device": device, "dtype": dtype}
|
||||||
|
|
||||||
|
self.broken = broken
|
||||||
|
self.additional_in_dim = additional_in_dim
|
||||||
|
self.control_in_dim = control_in_dim
|
||||||
|
n_refiner_layers = 2
|
||||||
|
self.n_control_layers = n_control_layers
|
||||||
|
self.control_layers = nn.ModuleList(
|
||||||
|
[
|
||||||
|
ZImageControlTransformerBlock(
|
||||||
|
i,
|
||||||
|
dim,
|
||||||
|
n_heads,
|
||||||
|
n_kv_heads,
|
||||||
|
multiple_of,
|
||||||
|
ffn_dim_multiplier,
|
||||||
|
norm_eps,
|
||||||
|
qk_norm,
|
||||||
|
block_id=i,
|
||||||
|
operation_settings=operation_settings,
|
||||||
|
)
|
||||||
|
for i in range(self.n_control_layers)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
all_x_embedder = {}
|
||||||
|
patch_size = 2
|
||||||
|
f_patch_size = 1
|
||||||
|
x_embedder = operations.Linear(f_patch_size * patch_size * patch_size * (self.control_in_dim + self.additional_in_dim), dim, bias=True, device=device, dtype=dtype)
|
||||||
|
all_x_embedder[f"{patch_size}-{f_patch_size}"] = x_embedder
|
||||||
|
|
||||||
|
self.refiner_control = refiner_control
|
||||||
|
|
||||||
|
self.control_all_x_embedder = nn.ModuleDict(all_x_embedder)
|
||||||
|
if self.refiner_control:
|
||||||
|
self.control_noise_refiner = nn.ModuleList(
|
||||||
|
[
|
||||||
|
ZImageControlTransformerBlock(
|
||||||
|
layer_id,
|
||||||
|
dim,
|
||||||
|
n_heads,
|
||||||
|
n_kv_heads,
|
||||||
|
multiple_of,
|
||||||
|
ffn_dim_multiplier,
|
||||||
|
norm_eps,
|
||||||
|
qk_norm,
|
||||||
|
block_id=layer_id,
|
||||||
|
operation_settings=operation_settings,
|
||||||
|
)
|
||||||
|
for layer_id in range(n_refiner_layers)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
self.control_noise_refiner = nn.ModuleList(
|
||||||
|
[
|
||||||
|
JointTransformerBlock(
|
||||||
|
layer_id,
|
||||||
|
dim,
|
||||||
|
n_heads,
|
||||||
|
n_kv_heads,
|
||||||
|
multiple_of,
|
||||||
|
ffn_dim_multiplier,
|
||||||
|
norm_eps,
|
||||||
|
qk_norm,
|
||||||
|
modulation=True,
|
||||||
|
z_image_modulation=True,
|
||||||
|
operation_settings=operation_settings,
|
||||||
|
)
|
||||||
|
for layer_id in range(n_refiner_layers)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
def forward(self, cap_feats, control_context, x_freqs_cis, adaln_input):
|
||||||
|
patch_size = 2
|
||||||
|
f_patch_size = 1
|
||||||
|
pH = pW = patch_size
|
||||||
|
B, C, H, W = control_context.shape
|
||||||
|
control_context = self.control_all_x_embedder[f"{patch_size}-{f_patch_size}"](control_context.view(B, C, H // pH, pH, W // pW, pW).permute(0, 2, 4, 3, 5, 1).flatten(3).flatten(1, 2))
|
||||||
|
|
||||||
|
x_attn_mask = None
|
||||||
|
if not self.refiner_control:
|
||||||
|
for layer in self.control_noise_refiner:
|
||||||
|
control_context = layer(control_context, x_attn_mask, x_freqs_cis[:control_context.shape[0], :control_context.shape[1]], adaln_input)
|
||||||
|
|
||||||
|
return control_context
|
||||||
|
|
||||||
|
def forward_noise_refiner_block(self, layer_id, control_context, x, x_attn_mask, x_freqs_cis, adaln_input):
|
||||||
|
if self.refiner_control:
|
||||||
|
if self.broken:
|
||||||
|
if layer_id == 0:
|
||||||
|
return self.control_layers[layer_id](control_context, x, x_mask=x_attn_mask, freqs_cis=x_freqs_cis[:control_context.shape[0], :control_context.shape[1]], adaln_input=adaln_input)
|
||||||
|
if layer_id > 0:
|
||||||
|
out = None
|
||||||
|
for i in range(1, len(self.control_layers)):
|
||||||
|
o, control_context = self.control_layers[i](control_context, x, x_mask=x_attn_mask, freqs_cis=x_freqs_cis[:control_context.shape[0], :control_context.shape[1]], adaln_input=adaln_input)
|
||||||
|
if out is None:
|
||||||
|
out = o
|
||||||
|
|
||||||
|
return (out, control_context)
|
||||||
|
else:
|
||||||
|
return self.control_noise_refiner[layer_id](control_context, x, x_mask=x_attn_mask, freqs_cis=x_freqs_cis[:control_context.shape[0], :control_context.shape[1]], adaln_input=adaln_input)
|
||||||
|
else:
|
||||||
|
return (None, control_context)
|
||||||
|
|
||||||
|
def forward_control_block(self, layer_id, control_context, x, x_attn_mask, x_freqs_cis, adaln_input):
|
||||||
|
return self.control_layers[layer_id](control_context, x, x_mask=x_attn_mask, freqs_cis=x_freqs_cis[:control_context.shape[0], :control_context.shape[1]], adaln_input=adaln_input)
|
||||||
@@ -22,6 +22,10 @@ def modulate(x, scale):
 # Core NextDiT Model #
 #############################################################################

+def clamp_fp16(x):
+    if x.dtype == torch.float16:
+        return torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504)
+    return x

 class JointAttention(nn.Module):
     """Multi-head attention module."""
@@ -169,7 +173,7 @@ class FeedForward(nn.Module):

     # @torch.compile
     def _forward_silu_gating(self, x1, x3):
-        return F.silu(x1) * x3
+        return clamp_fp16(F.silu(x1) * x3)

     def forward(self, x):
         return self.w2(self._forward_silu_gating(self.w1(x), self.w3(x)))
@@ -273,27 +277,27 @@ class JointTransformerBlock(nn.Module):
             scale_msa, gate_msa, scale_mlp, gate_mlp = self.adaLN_modulation(adaln_input).chunk(4, dim=1)

             x = x + gate_msa.unsqueeze(1).tanh() * self.attention_norm2(
-                self.attention(
+                clamp_fp16(self.attention(
                     modulate(self.attention_norm1(x), scale_msa),
                     x_mask,
                     freqs_cis,
                     transformer_options=transformer_options,
-                )
+                ))
             )
             x = x + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2(
-                self.feed_forward(
+                clamp_fp16(self.feed_forward(
                     modulate(self.ffn_norm1(x), scale_mlp),
-                )
+                ))
             )
         else:
             assert adaln_input is None
             x = x + self.attention_norm2(
-                self.attention(
+                clamp_fp16(self.attention(
                     self.attention_norm1(x),
                     x_mask,
                     freqs_cis,
                     transformer_options=transformer_options,
-                )
+                ))
             )
             x = x + self.ffn_norm2(
                 self.feed_forward(
@@ -373,6 +377,7 @@ class NextDiT(nn.Module):
         z_image_modulation=False,
         time_scale=1.0,
         pad_tokens_multiple=None,
+        clip_text_dim=None,
         image_model=None,
         device=None,
         dtype=None,
@@ -443,6 +448,31 @@ class NextDiT(nn.Module):
             ),
         )

+        self.clip_text_pooled_proj = None
+
+        if clip_text_dim is not None:
+            self.clip_text_dim = clip_text_dim
+            self.clip_text_pooled_proj = nn.Sequential(
+                operation_settings.get("operations").RMSNorm(clip_text_dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")),
+                operation_settings.get("operations").Linear(
+                    clip_text_dim,
+                    clip_text_dim,
+                    bias=True,
+                    device=operation_settings.get("device"),
+                    dtype=operation_settings.get("dtype"),
+                ),
+            )
+            self.time_text_embed = nn.Sequential(
+                nn.SiLU(),
+                operation_settings.get("operations").Linear(
+                    min(dim, 1024) + clip_text_dim,
+                    min(dim, 1024),
+                    bias=True,
+                    device=operation_settings.get("device"),
+                    dtype=operation_settings.get("dtype"),
+                ),
+            )
+
         self.layers = nn.ModuleList(
             [
                 JointTransformerBlock(
@@ -506,10 +536,11 @@ class NextDiT(nn.Module):
         bsz = len(x)
         pH = pW = self.patch_size
         device = x[0].device
+        orig_x = x

         if self.pad_tokens_multiple is not None:
             pad_extra = (-cap_feats.shape[1]) % self.pad_tokens_multiple
-            cap_feats = torch.cat((cap_feats, self.cap_pad_token.to(device=cap_feats.device, dtype=cap_feats.dtype).unsqueeze(0).repeat(cap_feats.shape[0], pad_extra, 1)), dim=1)
+            cap_feats = torch.cat((cap_feats, self.cap_pad_token.to(device=cap_feats.device, dtype=cap_feats.dtype, copy=True).unsqueeze(0).repeat(cap_feats.shape[0], pad_extra, 1)), dim=1)

         cap_pos_ids = torch.zeros(bsz, cap_feats.shape[1], 3, dtype=torch.float32, device=device)
         cap_pos_ids[:, :, 0] = torch.arange(cap_feats.shape[1], dtype=torch.float32, device=device) + 1.0
@@ -517,26 +548,46 @@ class NextDiT(nn.Module):
         B, C, H, W = x.shape
         x = self.x_embedder(x.view(B, C, H // pH, pH, W // pW, pW).permute(0, 2, 4, 3, 5, 1).flatten(3).flatten(1, 2))

+        rope_options = transformer_options.get("rope_options", None)
+        h_scale = 1.0
+        w_scale = 1.0
+        h_start = 0
+        w_start = 0
+        if rope_options is not None:
+            h_scale = rope_options.get("scale_y", 1.0)
+            w_scale = rope_options.get("scale_x", 1.0)
+
+            h_start = rope_options.get("shift_y", 0.0)
+            w_start = rope_options.get("shift_x", 0.0)
+
         H_tokens, W_tokens = H // pH, W // pW
         x_pos_ids = torch.zeros((bsz, x.shape[1], 3), dtype=torch.float32, device=device)
         x_pos_ids[:, :, 0] = cap_feats.shape[1] + 1
-        x_pos_ids[:, :, 1] = torch.arange(H_tokens, dtype=torch.float32, device=device).view(-1, 1).repeat(1, W_tokens).flatten()
-        x_pos_ids[:, :, 2] = torch.arange(W_tokens, dtype=torch.float32, device=device).view(1, -1).repeat(H_tokens, 1).flatten()
+        x_pos_ids[:, :, 1] = (torch.arange(H_tokens, dtype=torch.float32, device=device) * h_scale + h_start).view(-1, 1).repeat(1, W_tokens).flatten()
+        x_pos_ids[:, :, 2] = (torch.arange(W_tokens, dtype=torch.float32, device=device) * w_scale + w_start).view(1, -1).repeat(H_tokens, 1).flatten()

         if self.pad_tokens_multiple is not None:
             pad_extra = (-x.shape[1]) % self.pad_tokens_multiple
-            x = torch.cat((x, self.x_pad_token.to(device=x.device, dtype=x.dtype).unsqueeze(0).repeat(x.shape[0], pad_extra, 1)), dim=1)
+            x = torch.cat((x, self.x_pad_token.to(device=x.device, dtype=x.dtype, copy=True).unsqueeze(0).repeat(x.shape[0], pad_extra, 1)), dim=1)
             x_pos_ids = torch.nn.functional.pad(x_pos_ids, (0, 0, 0, pad_extra))

         freqs_cis = self.rope_embedder(torch.cat((cap_pos_ids, x_pos_ids), dim=1)).movedim(1, 2)

+        patches = transformer_options.get("patches", {})
+
         # refine context
         for layer in self.context_refiner:
             cap_feats = layer(cap_feats, cap_mask, freqs_cis[:, :cap_pos_ids.shape[1]], transformer_options=transformer_options)

         padded_img_mask = None
-        for layer in self.noise_refiner:
+        x_input = x
+        for i, layer in enumerate(self.noise_refiner):
             x = layer(x, padded_img_mask, freqs_cis[:, cap_pos_ids.shape[1]:], t, transformer_options=transformer_options)
+            if "noise_refiner" in patches:
+                for p in patches["noise_refiner"]:
+                    out = p({"img": x, "img_input": x_input, "txt": cap_feats, "pe": freqs_cis[:, cap_pos_ids.shape[1]:], "vec": t, "x": orig_x, "block_index": i, "transformer_options": transformer_options, "block_type": "noise_refiner"})
+                    if "img" in out:
+                        x = out["img"]

         padded_full_embed = torch.cat((cap_feats, x), dim=1)
         mask = None
@@ -552,7 +603,7 @@ class NextDiT(nn.Module):
         ).execute(x, timesteps, context, num_tokens, attention_mask, **kwargs)

     # def forward(self, x, t, cap_feats, cap_mask):
-    def _forward(self, x, timesteps, context, num_tokens, attention_mask=None, **kwargs):
+    def _forward(self, x, timesteps, context, num_tokens, attention_mask=None, transformer_options={}, **kwargs):
         t = 1.0 - timesteps
         cap_feats = context
         cap_mask = attention_mask
@@ -569,16 +620,36 @@ class NextDiT(nn.Module):

         cap_feats = self.cap_embedder(cap_feats)  # (N, L, D)  # todo check if able to batchify w.o. redundant compute

-        transformer_options = kwargs.get("transformer_options", {})
+        if self.clip_text_pooled_proj is not None:
+            pooled = kwargs.get("clip_text_pooled", None)
+            if pooled is not None:
+                pooled = self.clip_text_pooled_proj(pooled)
+            else:
+                pooled = torch.zeros((1, self.clip_text_dim), device=x.device, dtype=x.dtype)
+
+            adaln_input = self.time_text_embed(torch.cat((t, pooled), dim=-1))
+
+        patches = transformer_options.get("patches", {})
         x_is_tensor = isinstance(x, torch.Tensor)
-        x, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, t, num_tokens, transformer_options=transformer_options)
-        freqs_cis = freqs_cis.to(x.device)
+        img, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, adaln_input, num_tokens, transformer_options=transformer_options)
+        freqs_cis = freqs_cis.to(img.device)

-        for layer in self.layers:
-            x = layer(x, mask, freqs_cis, adaln_input, transformer_options=transformer_options)
+        transformer_options["total_blocks"] = len(self.layers)
+        transformer_options["block_type"] = "double"
+        img_input = img
+        for i, layer in enumerate(self.layers):
+            transformer_options["block_index"] = i
+            img = layer(img, mask, freqs_cis, adaln_input, transformer_options=transformer_options)
+            if "double_block" in patches:
+                for p in patches["double_block"]:
+                    out = p({"img": img[:, cap_size[0]:], "img_input": img_input[:, cap_size[0]:], "txt": img[:, :cap_size[0]], "pe": freqs_cis[:, cap_size[0]:], "vec": adaln_input, "x": x, "block_index": i, "transformer_options": transformer_options})
|
||||||
|
if "img" in out:
|
||||||
|
img[:, cap_size[0]:] = out["img"]
|
||||||
|
if "txt" in out:
|
||||||
|
img[:, :cap_size[0]] = out["txt"]
|
||||||
|
|
||||||
x = self.final_layer(x, adaln_input)
|
img = self.final_layer(img, adaln_input)
|
||||||
x = self.unpatchify(x, img_size, cap_size, return_tensor=x_is_tensor)[:,:,:h,:w]
|
img = self.unpatchify(img, img_size, cap_size, return_tensor=x_is_tensor)[:, :, :h, :w]
|
||||||
|
|
||||||
return -x
|
return -img
|
||||||
|
|
||||||
|
|||||||
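Note on the rope_options change above: a patch can now scale and shift the image token position ids through transformer_options before RoPE is applied. A minimal standalone sketch of the id computation, assuming the same scale_x/scale_y/shift_x/shift_y keys (not a drop-in for the model code):

# Sketch only: mirrors how the patched NextDiT builds x_pos_ids[..., 1] and x_pos_ids[..., 2].
import torch

def scaled_image_pos_ids(h_tokens, w_tokens, rope_options=None, device="cpu"):
    opts = rope_options or {}
    h_scale, w_scale = opts.get("scale_y", 1.0), opts.get("scale_x", 1.0)
    h_start, w_start = opts.get("shift_y", 0.0), opts.get("shift_x", 0.0)
    rows = torch.arange(h_tokens, dtype=torch.float32, device=device) * h_scale + h_start
    cols = torch.arange(w_tokens, dtype=torch.float32, device=device) * w_scale + w_start
    pos = torch.zeros(h_tokens * w_tokens, 3, device=device)
    pos[:, 1] = rows.view(-1, 1).repeat(1, w_tokens).flatten()
    pos[:, 2] = cols.view(1, -1).repeat(h_tokens, 1).flatten()
    return pos

# Example: halve the positional density along both axes.
ids = scaled_image_pos_ids(4, 4, {"scale_x": 0.5, "scale_y": 0.5})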
@@ -517,6 +517,7 @@ def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_resha

@wrap_attn
def attention_sage(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False, **kwargs):
+ exception_fallback = False
if skip_reshape:
b, _, _, dim_head = q.shape
tensor_layout = "HND"

@@ -541,6 +542,8 @@ def attention_sage(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=
out = sageattn(q, k, v, attn_mask=mask, is_causal=False, tensor_layout=tensor_layout)
except Exception as e:
logging.error("Error running sage attention: {}, using pytorch attention instead.".format(e))
+ exception_fallback = True
+ if exception_fallback:
if tensor_layout == "NHD":
q, k, v = map(
lambda t: t.transpose(1, 2),
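The hunk above records the failure in an exception_fallback flag and runs the PyTorch path after the except block instead of inside it, which is likely meant to let the failed sageattn call and its traceback go out of scope before the fallback allocates new tensors. A minimal sketch of the same pattern with hypothetical primary/fallback callables:

import logging

def attention_with_fallback(primary, fallback, *args, **kwargs):
    # Flag-based fallback: the recovery path runs outside the exception handler.
    use_fallback = False
    try:
        return primary(*args, **kwargs)
    except Exception as e:
        logging.error("primary attention failed: %s, using fallback", e)
        use_fallback = True
    if use_fallback:
        return fallback(*args, **kwargs)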
@@ -13,6 +13,12 @@ if model_management.xformers_enabled_vae():
import xformers
import xformers.ops

+ def torch_cat_if_needed(xl, dim):
+ if len(xl) > 1:
+ return torch.cat(xl, dim)
+ else:
+ return xl[0]

def get_timestep_embedding(timesteps, embedding_dim):
"""
This matches the implementation in Denoising Diffusion Probabilistic Models:

@@ -43,6 +49,37 @@ def Normalize(in_channels, num_groups=32):
return ops.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True)

+ class CarriedConv3d(nn.Module):
+ def __init__(self, n_channels, out_channels, kernel_size, stride=1, dilation=1, padding=0, **kwargs):
+ super().__init__()
+ self.conv = ops.Conv3d(n_channels, out_channels, kernel_size, stride=stride, dilation=dilation, **kwargs)

+ def forward(self, x):
+ return self.conv(x)

+ def conv_carry_causal_3d(xl, op, conv_carry_in=None, conv_carry_out=None):
+ x = xl[0]
+ xl.clear()

+ if isinstance(op, CarriedConv3d):
+ if conv_carry_in is None:
+ x = torch.nn.functional.pad(x, (1, 1, 1, 1, 2, 0), mode = 'replicate')
+ else:
+ carry_len = conv_carry_in[0].shape[2]
+ x = torch.nn.functional.pad(x, (1, 1, 1, 1, 2 - carry_len, 0), mode = 'replicate')
+ x = torch.cat([conv_carry_in.pop(0), x], dim=2)

+ if conv_carry_out is not None:
+ to_push = x[:, :, -2:, :, :].clone()
+ conv_carry_out.append(to_push)

+ out = op(x)
+ return out
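CarriedConv3d plus conv_carry_causal_3d above implement a causally padded 3D convolution that can run over a video in temporal chunks: each chunk hands its last two (already spatially padded) frames to the next one. A self-contained sketch, under the assumption of kernel size 3, stride 1 and no dilation, showing that the chunked result matches convolving the whole clip at once:

import torch
import torch.nn.functional as F

conv = torch.nn.Conv3d(4, 4, kernel_size=3)  # stand-in for the wrapped ops.Conv3d

def causal_conv_full(x):
    return conv(F.pad(x, (1, 1, 1, 1, 2, 0), mode="replicate"))

def causal_conv_chunked(chunks):
    carry, outs = None, []
    for x in chunks:
        if carry is None:
            x = F.pad(x, (1, 1, 1, 1, 2, 0), mode="replicate")
        else:
            x = torch.cat([carry, F.pad(x, (1, 1, 1, 1, 0, 0), mode="replicate")], dim=2)
        carry = x[:, :, -2:, :, :].clone()  # last 2 padded frames feed the next chunk
        outs.append(conv(x))
    return torch.cat(outs, dim=2)

x = torch.randn(1, 4, 8, 16, 16)
assert torch.allclose(causal_conv_full(x), causal_conv_chunked(list(torch.split(x, 4, dim=2))), atol=1e-4)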
class VideoConv3d(nn.Module):
def __init__(self, n_channels, out_channels, kernel_size, stride=1, dilation=1, padding_mode='replicate', padding=1, **kwargs):
super().__init__()

@@ -89,29 +126,24 @@ class Upsample(nn.Module):
stride=1,
padding=1)

- def forward(self, x):
+ def forward(self, x, conv_carry_in=None, conv_carry_out=None):
scale_factor = self.scale_factor
if isinstance(scale_factor, (int, float)):
scale_factor = (scale_factor,) * (x.ndim - 2)

if x.ndim == 5 and scale_factor[0] > 1.0:
- t = x.shape[2]
- if t > 1:
- a, b = x.split((1, t - 1), dim=2)
- del x
- b = interpolate_up(b, scale_factor)
- else:
- a = x
- a = interpolate_up(a.squeeze(2), scale_factor=scale_factor[1:]).unsqueeze(2)
- if t > 1:
- x = torch.cat((a, b), dim=2)
- else:
- x = a
+ results = []
+ if conv_carry_in is None:
+ first = x[:, :, :1, :, :]
+ results.append(interpolate_up(first.squeeze(2), scale_factor=scale_factor[1:]).unsqueeze(2))
+ x = x[:, :, 1:, :, :]
+ if x.shape[2] > 0:
+ results.append(interpolate_up(x, scale_factor))
+ x = torch_cat_if_needed(results, dim=2)
else:
x = interpolate_up(x, scale_factor)
if self.with_conv:
- x = self.conv(x)
+ x = conv_carry_causal_3d([x], self.conv, conv_carry_in, conv_carry_out)
return x

@@ -127,17 +159,20 @@ class Downsample(nn.Module):
stride=stride,
padding=0)

- def forward(self, x):
+ def forward(self, x, conv_carry_in=None, conv_carry_out=None):
if self.with_conv:
- if x.ndim == 4:
+ if isinstance(self.conv, CarriedConv3d):
+ x = conv_carry_causal_3d([x], self.conv, conv_carry_in, conv_carry_out)
+ elif x.ndim == 4:
pad = (0, 1, 0, 1)
mode = "constant"
x = torch.nn.functional.pad(x, pad, mode=mode, value=0)
+ x = self.conv(x)
elif x.ndim == 5:
pad = (1, 1, 1, 1, 2, 0)
mode = "replicate"
x = torch.nn.functional.pad(x, pad, mode=mode)
x = self.conv(x)
else:
x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
return x
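In the rewritten Upsample.forward above, only the first frame of the clip (or of the first chunk when no carry is supplied) is upsampled purely spatially, so T input frames become 1 + (T - 1) * scale_t output frames. A small sketch of that shape behaviour, using plain F.interpolate in place of interpolate_up:

import torch
import torch.nn.functional as F

def causal_temporal_upsample(x, scale=(2.0, 2.0, 2.0)):
    # First frame: spatial only; remaining frames: temporal and spatial.
    results = [F.interpolate(x[:, :, 0], scale_factor=scale[1:], mode="nearest").unsqueeze(2)]
    rest = x[:, :, 1:]
    if rest.shape[2] > 0:
        results.append(F.interpolate(rest, scale_factor=scale, mode="nearest"))
    return torch.cat(results, dim=2) if len(results) > 1 else results[0]

y = causal_temporal_upsample(torch.randn(1, 4, 5, 8, 8))
assert y.shape[2] == 1 + (5 - 1) * 2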
@@ -183,23 +218,23 @@ class ResnetBlock(nn.Module):
stride=1,
padding=0)

- def forward(self, x, temb=None):
+ def forward(self, x, temb=None, conv_carry_in=None, conv_carry_out=None):
h = x
h = self.norm1(h)
- h = self.swish(h)
- h = self.conv1(h)
+ h = [ self.swish(h) ]
+ h = conv_carry_causal_3d(h, self.conv1, conv_carry_in=conv_carry_in, conv_carry_out=conv_carry_out)

if temb is not None:
h = h + self.temb_proj(self.swish(temb))[:,:,None,None]

h = self.norm2(h)
h = self.swish(h)
- h = self.dropout(h)
- h = self.conv2(h)
+ h = [ self.dropout(h) ]
+ h = conv_carry_causal_3d(h, self.conv2, conv_carry_in=conv_carry_in, conv_carry_out=conv_carry_out)

if self.in_channels != self.out_channels:
if self.use_conv_shortcut:
- x = self.conv_shortcut(x)
+ x = conv_carry_causal_3d([x], self.conv_shortcut, conv_carry_in=conv_carry_in, conv_carry_out=conv_carry_out)
else:
x = self.nin_shortcut(x)

@@ -279,6 +314,7 @@ def pytorch_attention(q, k, v):
orig_shape = q.shape
B = orig_shape[0]
C = orig_shape[1]
+ oom_fallback = False
q, k, v = map(
lambda t: t.view(B, 1, C, -1).transpose(2, 3).contiguous(),
(q, k, v),

@@ -289,6 +325,8 @@ def pytorch_attention(q, k, v):
out = out.transpose(2, 3).reshape(orig_shape)
except model_management.OOM_EXCEPTION:
logging.warning("scaled_dot_product_attention OOMed: switched to slice attention")
+ oom_fallback = True
+ if oom_fallback:
out = slice_attention(q.view(B, -1, C), k.view(B, -1, C).transpose(1, 2), v.view(B, -1, C).transpose(1, 2)).reshape(orig_shape)
return out
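ResnetBlock now wraps the pre-convolution activation in a one-element list before calling conv_carry_causal_3d, which pops the tensor and clears the list. As far as the calling convention suggests, the point is that the caller stops holding a reference, so the intermediate activation can be freed as soon as the convolution is done with it. A tiny sketch of the idiom:

import torch

def consume_and_conv(xl, op):
    # Mirrors conv_carry_causal_3d's convention: take the tensor, drop the caller's reference.
    x = xl[0]
    xl.clear()
    return op(x)

conv = torch.nn.Conv2d(8, 8, 3, padding=1)
h = [torch.relu(torch.randn(1, 8, 32, 32))]
h = consume_and_conv(h, conv)  # the intermediate inside the list is no longer referenced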
@@ -517,9 +555,14 @@ class Encoder(nn.Module):
self.num_res_blocks = num_res_blocks
self.resolution = resolution
self.in_channels = in_channels
+ self.carried = False

if conv3d:
- conv_op = VideoConv3d
+ if not attn_resolutions:
+ conv_op = CarriedConv3d
+ self.carried = True
+ else:
+ conv_op = VideoConv3d
mid_attn_conv_op = ops.Conv3d
else:
conv_op = ops.Conv2d

@@ -532,6 +575,7 @@ class Encoder(nn.Module):
stride=1,
padding=1)

+ self.time_compress = 1
curr_res = resolution
in_ch_mult = (1,)+tuple(ch_mult)
self.in_ch_mult = in_ch_mult

@@ -558,10 +602,15 @@ class Encoder(nn.Module):
if time_compress is not None:
if (self.num_resolutions - 1 - i_level) > math.log2(time_compress):
stride = (1, 2, 2)
+ else:
+ self.time_compress *= 2
down.downsample = Downsample(block_in, resamp_with_conv, stride=stride, conv_op=conv_op)
curr_res = curr_res // 2
self.down.append(down)

+ if time_compress is not None:
+ self.time_compress = time_compress

# middle
self.mid = nn.Module()
self.mid.block_1 = ResnetBlock(in_channels=block_in,

@@ -587,15 +636,42 @@ class Encoder(nn.Module):
def forward(self, x):
# timestep embedding
temb = None
- # downsampling
- h = self.conv_in(x)
- for i_level in range(self.num_resolutions):
- for i_block in range(self.num_res_blocks):
- h = self.down[i_level].block[i_block](h, temb)
- if len(self.down[i_level].attn) > 0:
- h = self.down[i_level].attn[i_block](h)
- if i_level != self.num_resolutions-1:
- h = self.down[i_level].downsample(h)
+ if self.carried:
+ xl = [x[:, :, :1, :, :]]
+ if x.shape[2] > self.time_compress:
+ tc = self.time_compress
+ xl += torch.split(x[:, :, 1: 1 + ((x.shape[2] - 1) // tc) * tc, :, :], tc * 2, dim = 2)
+ x = xl
+ else:
+ x = [x]
+ out = []

+ conv_carry_in = None

+ for i, x1 in enumerate(x):
+ conv_carry_out = []
+ if i == len(x) - 1:
+ conv_carry_out = None

+ # downsampling
+ x1 = [ x1 ]
+ h1 = conv_carry_causal_3d(x1, self.conv_in, conv_carry_in, conv_carry_out)

+ for i_level in range(self.num_resolutions):
+ for i_block in range(self.num_res_blocks):
+ h1 = self.down[i_level].block[i_block](h1, temb, conv_carry_in, conv_carry_out)
+ if len(self.down[i_level].attn) > 0:
+ assert i == 0 #carried should not happen if attn exists
+ h1 = self.down[i_level].attn[i_block](h1)
+ if i_level != self.num_resolutions-1:
+ h1 = self.down[i_level].downsample(h1, conv_carry_in, conv_carry_out)

+ out.append(h1)
+ conv_carry_in = conv_carry_out

+ h = torch_cat_if_needed(out, dim=2)
+ del out

# middle
h = self.mid.block_1(h, temb)
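When self.carried is set, Encoder.forward above splits the clip into a leading single frame plus groups of time_compress * 2 frames so each group downsamples to whole latent frames. A standalone sketch of that split (frames beyond the last full compression group are simply not included here, matching the slicing expression in the hunk):

import torch

def split_for_carried_encode(x, time_compress):
    chunks = [x[:, :, :1]]
    if x.shape[2] > time_compress:
        tc = time_compress
        usable = 1 + ((x.shape[2] - 1) // tc) * tc
        chunks += list(torch.split(x[:, :, 1:usable], tc * 2, dim=2))
    return chunks

x = torch.randn(1, 3, 17, 8, 8)  # 1 + 16 frames, e.g. time_compress = 4
print([c.shape[2] for c in split_for_carried_encode(x, 4)])  # [1, 8, 8]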
@@ -604,15 +680,15 @@ class Encoder(nn.Module):

# end
h = self.norm_out(h)
- h = nonlinearity(h)
- h = self.conv_out(h)
+ h = [ nonlinearity(h) ]
+ h = conv_carry_causal_3d(h, self.conv_out)
return h


class Decoder(nn.Module):
def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
- resolution, z_channels, give_pre_end=False, tanh_out=False, use_linear_attn=False,
+ resolution, z_channels, tanh_out=False, use_linear_attn=False,
conv_out_op=ops.Conv2d,
resnet_op=ResnetBlock,
attn_op=AttnBlock,

@@ -626,12 +702,18 @@ class Decoder(nn.Module):
self.num_res_blocks = num_res_blocks
self.resolution = resolution
self.in_channels = in_channels
- self.give_pre_end = give_pre_end
self.tanh_out = tanh_out
+ self.carried = False

if conv3d:
- conv_op = VideoConv3d
- conv_out_op = VideoConv3d
+ if not attn_resolutions and resnet_op == ResnetBlock:
+ conv_op = CarriedConv3d
+ conv_out_op = CarriedConv3d
+ self.carried = True
+ else:
+ conv_op = VideoConv3d
+ conv_out_op = VideoConv3d

mid_attn_conv_op = ops.Conv3d
else:
conv_op = ops.Conv2d

@@ -706,29 +788,43 @@ class Decoder(nn.Module):
temb = None

# z to block_in
- h = self.conv_in(z)
+ h = conv_carry_causal_3d([z], self.conv_in)

# middle
h = self.mid.block_1(h, temb, **kwargs)
h = self.mid.attn_1(h, **kwargs)
h = self.mid.block_2(h, temb, **kwargs)

+ if self.carried:
+ h = torch.split(h, 2, dim=2)
+ else:
+ h = [ h ]
+ out = []

+ conv_carry_in = None

# upsampling
- for i_level in reversed(range(self.num_resolutions)):
- for i_block in range(self.num_res_blocks+1):
- h = self.up[i_level].block[i_block](h, temb, **kwargs)
- if len(self.up[i_level].attn) > 0:
- h = self.up[i_level].attn[i_block](h, **kwargs)
- if i_level != 0:
- h = self.up[i_level].upsample(h)
+ for i, h1 in enumerate(h):
+ conv_carry_out = []
+ if i == len(h) - 1:
+ conv_carry_out = None
+ for i_level in reversed(range(self.num_resolutions)):
+ for i_block in range(self.num_res_blocks+1):
+ h1 = self.up[i_level].block[i_block](h1, temb, conv_carry_in, conv_carry_out, **kwargs)
+ if len(self.up[i_level].attn) > 0:
+ assert i == 0 #carried should not happen if attn exists
+ h1 = self.up[i_level].attn[i_block](h1, **kwargs)
+ if i_level != 0:
+ h1 = self.up[i_level].upsample(h1, conv_carry_in, conv_carry_out)

- # end
- if self.give_pre_end:
- return h
+ h1 = self.norm_out(h1)
+ h1 = [ nonlinearity(h1) ]
+ h1 = conv_carry_causal_3d(h1, self.conv_out, conv_carry_in, conv_carry_out)
+ if self.tanh_out:
+ h1 = torch.tanh(h1)
+ out.append(h1)
+ conv_carry_in = conv_carry_out

- h = self.norm_out(h)
- h = nonlinearity(h)
- h = self.conv_out(h, **kwargs)
- if self.tanh_out:
- h = torch.tanh(h)
- return h
+ out = torch_cat_if_needed(out, dim=2)

+ return out
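The carried Decoder above splits the mid-block output into 2-frame groups and decodes them one at a time, threading each group's conv_carry_out into the next group's conv_carry_in (the final group passes None so nothing is recorded). A minimal sketch of that carry-threading loop with a hypothetical per-chunk step function:

def decode_in_chunks(chunks, step):
    out, carry_in = [], None
    for i, chunk in enumerate(chunks):
        carry_out = [] if i < len(chunks) - 1 else None
        out.append(step(chunk, carry_in, carry_out))
        carry_in = carry_out
    return out

parts = decode_in_chunks([1, 2, 3], lambda c, carry_in, carry_out: c * 10)  # [10, 20, 30]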
@@ -218,9 +218,24 @@ class QwenImageTransformerBlock(nn.Module):
operations=operations,
)

+ def _apply_gate(self, x, y, gate, timestep_zero_index=None):
+ if timestep_zero_index is not None:
+ return y + torch.cat((x[:, :timestep_zero_index] * gate[0], x[:, timestep_zero_index:] * gate[1]), dim=1)
+ else:
+ return torch.addcmul(y, gate, x)

- def _modulate(self, x: torch.Tensor, mod_params: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+ def _modulate(self, x: torch.Tensor, mod_params: torch.Tensor, timestep_zero_index=None) -> Tuple[torch.Tensor, torch.Tensor]:
shift, scale, gate = torch.chunk(mod_params, 3, dim=-1)
- return torch.addcmul(shift.unsqueeze(1), x, 1 + scale.unsqueeze(1)), gate.unsqueeze(1)
+ if timestep_zero_index is not None:
+ actual_batch = shift.size(0) // 2
+ shift, shift_0 = shift[:actual_batch], shift[actual_batch:]
+ scale, scale_0 = scale[:actual_batch], scale[actual_batch:]
+ gate, gate_0 = gate[:actual_batch], gate[actual_batch:]
+ reg = torch.addcmul(shift.unsqueeze(1), x[:, :timestep_zero_index], 1 + scale.unsqueeze(1))
+ zero = torch.addcmul(shift_0.unsqueeze(1), x[:, timestep_zero_index:], 1 + scale_0.unsqueeze(1))
+ return torch.cat((reg, zero), dim=1), (gate.unsqueeze(1), gate_0.unsqueeze(1))
+ else:
+ return torch.addcmul(shift.unsqueeze(1), x, 1 + scale.unsqueeze(1)), gate.unsqueeze(1)

def forward(
self,

@@ -229,14 +244,19 @@ class QwenImageTransformerBlock(nn.Module):
encoder_hidden_states_mask: torch.Tensor,
temb: torch.Tensor,
image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ timestep_zero_index=None,
transformer_options={},
) -> Tuple[torch.Tensor, torch.Tensor]:
img_mod_params = self.img_mod(temb)

+ if timestep_zero_index is not None:
+ temb = temb.chunk(2, dim=0)[0]

txt_mod_params = self.txt_mod(temb)
img_mod1, img_mod2 = img_mod_params.chunk(2, dim=-1)
txt_mod1, txt_mod2 = txt_mod_params.chunk(2, dim=-1)

- img_modulated, img_gate1 = self._modulate(self.img_norm1(hidden_states), img_mod1)
+ img_modulated, img_gate1 = self._modulate(self.img_norm1(hidden_states), img_mod1, timestep_zero_index)
del img_mod1
txt_modulated, txt_gate1 = self._modulate(self.txt_norm1(encoder_hidden_states), txt_mod1)
del txt_mod1

@@ -251,15 +271,15 @@ class QwenImageTransformerBlock(nn.Module):
del img_modulated
del txt_modulated

- hidden_states = hidden_states + img_gate1 * img_attn_output
+ hidden_states = self._apply_gate(img_attn_output, hidden_states, img_gate1, timestep_zero_index)
encoder_hidden_states = encoder_hidden_states + txt_gate1 * txt_attn_output
del img_attn_output
del txt_attn_output
del img_gate1
del txt_gate1

- img_modulated2, img_gate2 = self._modulate(self.img_norm2(hidden_states), img_mod2)
+ img_modulated2, img_gate2 = self._modulate(self.img_norm2(hidden_states), img_mod2, timestep_zero_index)
- hidden_states = torch.addcmul(hidden_states, img_gate2, self.img_mlp(img_modulated2))
+ hidden_states = self._apply_gate(self.img_mlp(img_modulated2), hidden_states, img_gate2, timestep_zero_index)

txt_modulated2, txt_gate2 = self._modulate(self.txt_norm2(encoder_hidden_states), txt_mod2)
encoder_hidden_states = torch.addcmul(encoder_hidden_states, txt_gate2, self.txt_mlp(txt_modulated2))
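With timestep_zero_index set, _modulate above assumes the modulation parameters were computed for a doubled batch (regular timestep stacked with timestep zero) and splits the token axis at that index: ordinary image tokens get the regular half, appended reference tokens get the zero-timestep half. A shape-level sketch:

import torch

def split_modulate(x, shift, scale, gate, zero_index):
    b = shift.size(0) // 2  # first half: regular timestep, second half: timestep zero
    reg = torch.addcmul(shift[:b].unsqueeze(1), x[:, :zero_index], 1 + scale[:b].unsqueeze(1))
    zero = torch.addcmul(shift[b:].unsqueeze(1), x[:, zero_index:], 1 + scale[b:].unsqueeze(1))
    return torch.cat((reg, zero), dim=1), (gate[:b].unsqueeze(1), gate[b:].unsqueeze(1))

x = torch.randn(2, 10, 8)   # 2 images, 6 regular + 4 reference tokens
mod = torch.randn(4, 8)     # doubled modulation batch
out, gates = split_modulate(x, mod, mod, mod, zero_index=6)
assert out.shape == x.shape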
@@ -302,6 +322,7 @@ class QwenImageTransformer2DModel(nn.Module):
pooled_projection_dim: int = 768,
guidance_embeds: bool = False,
axes_dims_rope: Tuple[int, int, int] = (16, 56, 56),
+ default_ref_method="index",
image_model=None,
final_layer=True,
dtype=None,

@@ -314,6 +335,7 @@ class QwenImageTransformer2DModel(nn.Module):
self.in_channels = in_channels
self.out_channels = out_channels or in_channels
self.inner_dim = num_attention_heads * attention_head_dim
+ self.default_ref_method = default_ref_method

self.pe_embedder = EmbedND(dim=attention_head_dim, theta=10000, axes_dim=list(axes_dims_rope))

@@ -391,11 +413,14 @@ class QwenImageTransformer2DModel(nn.Module):
hidden_states, img_ids, orig_shape = self.process_img(x)
num_embeds = hidden_states.shape[1]

+ timestep_zero_index = None
if ref_latents is not None:
h = 0
w = 0
index = 0
- index_ref_method = kwargs.get("ref_latents_method", "index") == "index"
+ ref_method = kwargs.get("ref_latents_method", self.default_ref_method)
+ index_ref_method = (ref_method == "index") or (ref_method == "index_timestep_zero")
+ timestep_zero = ref_method == "index_timestep_zero"
for ref in ref_latents:
if index_ref_method:
index += 1

@@ -415,6 +440,10 @@ class QwenImageTransformer2DModel(nn.Module):
kontext, kontext_ids, _ = self.process_img(ref, index=index, h_offset=h_offset, w_offset=w_offset)
hidden_states = torch.cat([hidden_states, kontext], dim=1)
img_ids = torch.cat([img_ids, kontext_ids], dim=1)
+ if timestep_zero:
+ if index > 0:
+ timestep = torch.cat([timestep, timestep * 0], dim=0)
+ timestep_zero_index = num_embeds

txt_start = round(max(((x.shape[-1] + (self.patch_size // 2)) // self.patch_size) // 2, ((x.shape[-2] + (self.patch_size // 2)) // self.patch_size) // 2))
txt_ids = torch.arange(txt_start, txt_start + context.shape[1], device=x.device).reshape(1, -1, 1).repeat(x.shape[0], 1, 3)

@@ -446,7 +475,7 @@ class QwenImageTransformer2DModel(nn.Module):
if ("double_block", i) in blocks_replace:
def block_wrap(args):
out = {}
- out["txt"], out["img"] = block(hidden_states=args["img"], encoder_hidden_states=args["txt"], encoder_hidden_states_mask=encoder_hidden_states_mask, temb=args["vec"], image_rotary_emb=args["pe"], transformer_options=args["transformer_options"])
+ out["txt"], out["img"] = block(hidden_states=args["img"], encoder_hidden_states=args["txt"], encoder_hidden_states_mask=encoder_hidden_states_mask, temb=args["vec"], image_rotary_emb=args["pe"], timestep_zero_index=timestep_zero_index, transformer_options=args["transformer_options"])
return out
out = blocks_replace[("double_block", i)]({"img": hidden_states, "txt": encoder_hidden_states, "vec": temb, "pe": image_rotary_emb, "transformer_options": transformer_options}, {"original_block": block_wrap})
hidden_states = out["img"]

@@ -458,6 +487,7 @@ class QwenImageTransformer2DModel(nn.Module):
encoder_hidden_states_mask=encoder_hidden_states_mask,
temb=temb,
image_rotary_emb=image_rotary_emb,
+ timestep_zero_index=timestep_zero_index,
transformer_options=transformer_options,
)

@@ -474,6 +504,9 @@ class QwenImageTransformer2DModel(nn.Module):
if add is not None:
hidden_states[:, :add.shape[1]] += add

+ if timestep_zero_index is not None:
+ temb = temb.chunk(2, dim=0)[0]

hidden_states = self.norm_out(hidden_states, temb)
hidden_states = self.proj_out(hidden_states)
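The new ref_latents_method value index_timestep_zero drives the machinery above: reference latents are appended after the regular image tokens, the timestep batch is doubled with a zeroed second half, and timestep_zero_index marks where the reference tokens start. A short sketch of that bookkeeping:

import torch

def prepare_timesteps(timestep, num_image_tokens, ref_method):
    # Sketch: the doubled batch's second half is all zeros; reference tokens
    # appended after num_image_tokens are modulated with that half.
    if ref_method == "index_timestep_zero":
        return torch.cat([timestep, timestep * 0], dim=0), num_image_tokens
    return timestep, None

ts, zero_index = prepare_timesteps(torch.full((2,), 0.7), num_image_tokens=4096, ref_method="index_timestep_zero")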
@@ -568,7 +568,10 @@ class WanModel(torch.nn.Module):

patches_replace = transformer_options.get("patches_replace", {})
blocks_replace = patches_replace.get("dit", {})
+ transformer_options["total_blocks"] = len(self.blocks)
+ transformer_options["block_type"] = "double"
for i, block in enumerate(self.blocks):
+ transformer_options["block_index"] = i
if ("double_block", i) in blocks_replace:
def block_wrap(args):
out = {}

@@ -763,7 +766,10 @@ class VaceWanModel(WanModel):

patches_replace = transformer_options.get("patches_replace", {})
blocks_replace = patches_replace.get("dit", {})
+ transformer_options["total_blocks"] = len(self.blocks)
+ transformer_options["block_type"] = "double"
for i, block in enumerate(self.blocks):
+ transformer_options["block_index"] = i
if ("double_block", i) in blocks_replace:
def block_wrap(args):
out = {}

@@ -862,7 +868,10 @@ class CameraWanModel(WanModel):

patches_replace = transformer_options.get("patches_replace", {})
blocks_replace = patches_replace.get("dit", {})
+ transformer_options["total_blocks"] = len(self.blocks)
+ transformer_options["block_type"] = "double"
for i, block in enumerate(self.blocks):
+ transformer_options["block_index"] = i
if ("double_block", i) in blocks_replace:
def block_wrap(args):
out = {}

@@ -1326,16 +1335,19 @@ class WanModel_S2V(WanModel):

patches_replace = transformer_options.get("patches_replace", {})
blocks_replace = patches_replace.get("dit", {})
+ transformer_options["total_blocks"] = len(self.blocks)
+ transformer_options["block_type"] = "double"
for i, block in enumerate(self.blocks):
+ transformer_options["block_index"] = i
if ("double_block", i) in blocks_replace:
def block_wrap(args):
out = {}
- out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"])
+ out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], transformer_options=args["transformer_options"])
return out
- out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs}, {"original_block": block_wrap})
+ out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs, "transformer_options": transformer_options}, {"original_block": block_wrap})
x = out["img"]
else:
- x = block(x, e=e0, freqs=freqs, context=context)
+ x = block(x, e=e0, freqs=freqs, context=context, transformer_options=transformer_options)
if audio_emb is not None:
x = self.audio_injector(x, i, audio_emb, audio_emb_global, seq_len)
# head

@@ -1574,7 +1586,10 @@ class HumoWanModel(WanModel):

patches_replace = transformer_options.get("patches_replace", {})
blocks_replace = patches_replace.get("dit", {})
+ transformer_options["total_blocks"] = len(self.blocks)
+ transformer_options["block_type"] = "double"
for i, block in enumerate(self.blocks):
+ transformer_options["block_index"] = i
if ("double_block", i) in blocks_replace:
def block_wrap(args):
out = {}

@@ -523,7 +523,10 @@ class AnimateWanModel(WanModel):

patches_replace = transformer_options.get("patches_replace", {})
blocks_replace = patches_replace.get("dit", {})
+ transformer_options["total_blocks"] = len(self.blocks)
+ transformer_options["block_type"] = "double"
for i, block in enumerate(self.blocks):
+ transformer_options["block_index"] = i
if ("double_block", i) in blocks_replace:
def block_wrap(args):
out = {}
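All of the Wan variants above now publish total_blocks, block_type and block_index in transformer_options before running each block, so replacement callbacks registered under patches_replace["dit"] can tell where they are in the stack. A hypothetical callback using that metadata (the calling convention mirrors the block_wrap pattern in the hunks):

def depth_aware_double_block(args, extra):
    opts = args["transformer_options"]
    i, n = opts.get("block_index", 0), opts.get("total_blocks", 1)
    out = extra["original_block"](args)
    if i == n - 1:
        pass  # e.g. capture or modify features from the last double block here
    return out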
@@ -313,6 +313,23 @@ def model_lora_keys_unet(model, key_map={}):
key_map["transformer.{}".format(key_lora)] = k
key_map["lycoris_{}".format(key_lora.replace(".", "_"))] = k #SimpleTuner lycoris format

+ if isinstance(model, comfy.model_base.Lumina2):
+ diffusers_keys = comfy.utils.z_image_to_diffusers(model.model_config.unet_config, output_prefix="diffusion_model.")
+ for k in diffusers_keys:
+ if k.endswith(".weight"):
+ to = diffusers_keys[k]
+ key_lora = k[:-len(".weight")]
+ key_map["diffusion_model.{}".format(key_lora)] = to
+ key_map["transformer.{}".format(key_lora)] = to
+ key_map["lycoris_{}".format(key_lora.replace(".", "_"))] = to

+ if isinstance(model, comfy.model_base.Kandinsky5):
+ for k in sdk:
+ if k.startswith("diffusion_model.") and k.endswith(".weight"):
+ key_lora = k[len("diffusion_model."):-len(".weight")]
+ key_map["{}".format(key_lora)] = k
+ key_map["transformer.{}".format(key_lora)] = k

return key_map
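The Lumina2 branch above registers every mapped weight under three LoRA naming conventions. An illustration of the aliases one hypothetical diffusers_keys entry (k mapped to to) would produce:

key_map = {}
k, to = "layers.0.attention.qkv.weight", "hypothetical.target.key"  # stand-ins for one diffusers_keys item
key_lora = k[:-len(".weight")]
key_map["diffusion_model.{}".format(key_lora)] = to
key_map["transformer.{}".format(key_lora)] = to                       # diffusers-style prefix
key_map["lycoris_{}".format(key_lora.replace(".", "_"))] = to         # SimpleTuner lycoris format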
@@ -47,6 +47,7 @@ import comfy.ldm.chroma_radiance.model
import comfy.ldm.ace.model
import comfy.ldm.omnigen.omnigen2
import comfy.ldm.qwen_image.model
+ import comfy.ldm.kandinsky5.model

import comfy.model_management
import comfy.patcher_extension

@@ -134,7 +135,7 @@ class BaseModel(torch.nn.Module):
if not unet_config.get("disable_unet_model_creation", False):
if model_config.custom_operations is None:
fp8 = model_config.optimizations.get("fp8", False)
- operations = comfy.ops.pick_operations(unet_config.get("dtype", None), self.manual_cast_dtype, fp8_optimizations=fp8, scaled_fp8=model_config.scaled_fp8, model_config=model_config)
+ operations = comfy.ops.pick_operations(unet_config.get("dtype", None), self.manual_cast_dtype, fp8_optimizations=fp8, model_config=model_config)
else:
operations = model_config.custom_operations
self.diffusion_model = unet_model(**unet_config, device=device, operations=operations)

@@ -329,18 +330,6 @@ class BaseModel(torch.nn.Module):
extra_sds.append(self.model_config.process_clip_vision_state_dict_for_saving(clip_vision_state_dict))

unet_state_dict = self.diffusion_model.state_dict()

- if self.model_config.scaled_fp8 is not None:
- unet_state_dict["scaled_fp8"] = torch.tensor([], dtype=self.model_config.scaled_fp8)

- # Save mixed precision metadata
- if hasattr(self.model_config, 'layer_quant_config') and self.model_config.layer_quant_config:
- metadata = {
- "format_version": "1.0",
- "layers": self.model_config.layer_quant_config
- }
- unet_state_dict["_quantization_metadata"] = metadata

unet_state_dict = self.model_config.process_unet_state_dict_for_saving(unet_state_dict)

if self.model_type == ModelType.V_PREDICTION:

@@ -1121,6 +1110,10 @@ class Lumina2(BaseModel):
if 'num_tokens' not in out:
out['num_tokens'] = comfy.conds.CONDConstant(cross_attn.shape[1])

+ clip_text_pooled = kwargs["pooled_output"] # Newbie
+ if clip_text_pooled is not None:
+ out['clip_text_pooled'] = comfy.conds.CONDRegular(clip_text_pooled)

return out

class WAN21(BaseModel):

@@ -1642,3 +1635,49 @@ class HunyuanVideo15_SR_Distilled(HunyuanVideo15):
out = super().extra_conds(**kwargs)
out['disable_time_r'] = comfy.conds.CONDConstant(False)
return out

+ class Kandinsky5(BaseModel):
+ def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
+ super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.kandinsky5.model.Kandinsky5)

+ def encode_adm(self, **kwargs):
+ return kwargs["pooled_output"]

+ def concat_cond(self, **kwargs):
+ noise = kwargs.get("noise", None)
+ device = kwargs["device"]
+ image = torch.zeros_like(noise)

+ mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
+ if mask is None:
+ mask = torch.zeros_like(noise)[:, :1]
+ else:
+ mask = 1.0 - mask
+ mask = utils.common_upscale(mask.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")
+ if mask.shape[-3] < noise.shape[-3]:
+ mask = torch.nn.functional.pad(mask, (0, 0, 0, 0, 0, noise.shape[-3] - mask.shape[-3]), mode='constant', value=0)
+ mask = utils.resize_to_batch_size(mask, noise.shape[0])

+ return torch.cat((image, mask), dim=1)

+ def extra_conds(self, **kwargs):
+ out = super().extra_conds(**kwargs)
+ attention_mask = kwargs.get("attention_mask", None)
+ if attention_mask is not None:
+ out['attention_mask'] = comfy.conds.CONDRegular(attention_mask)
+ cross_attn = kwargs.get("cross_attn", None)
+ if cross_attn is not None:
+ out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)

+ time_dim_replace = kwargs.get("time_dim_replace", None)
+ if time_dim_replace is not None:
+ out['time_dim_replace'] = comfy.conds.CONDRegular(self.process_latent_in(time_dim_replace))

+ return out

+ class Kandinsky5Image(Kandinsky5):
+ def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
+ super().__init__(model_config, model_type, device=device)

+ def concat_cond(self, **kwargs):
+ return None
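Kandinsky5.concat_cond above conditions the model on a zero image plus a single mask channel derived from the denoise mask (inverted, resized to the latent grid and padded along time). A simplified shape-level sketch that skips the resizing and padding steps:

import torch

def sketch_concat_cond(noise, denoise_mask=None):
    image = torch.zeros_like(noise)
    if denoise_mask is None:
        mask = torch.zeros_like(noise)[:, :1]
    else:
        mask = 1.0 - denoise_mask[:, :1]  # simplified: the real code also rescales and pads the time dim
    return torch.cat((image, mask), dim=1)

c = sketch_concat_cond(torch.randn(1, 16, 4, 32, 32))  # 16 latent channels + 1 mask channel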
@@ -6,20 +6,6 @@ import math
import logging
import torch

- def detect_layer_quantization(metadata):
- quant_key = "_quantization_metadata"
- if metadata is not None and quant_key in metadata:
- quant_metadata = metadata.pop(quant_key)
- quant_metadata = json.loads(quant_metadata)
- if isinstance(quant_metadata, dict) and "layers" in quant_metadata:
- logging.info(f"Found quantization metadata (version {quant_metadata.get('format_version', 'unknown')})")
- return quant_metadata["layers"]
- else:
- raise ValueError("Invalid quantization metadata format")
- return None

def count_blocks(state_dict_keys, prefix_string):
count = 0
while True:

@@ -194,8 +180,10 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
dit_config["use_cond_type_embedding"] = False
if '{}vision_in.proj.0.weight'.format(key_prefix) in state_dict_keys:
dit_config["vision_in_dim"] = state_dict['{}vision_in.proj.0.weight'.format(key_prefix)].shape[0]
+ dit_config["meanflow_sum"] = True
else:
dit_config["vision_in_dim"] = None
+ dit_config["meanflow_sum"] = False
return dit_config

if '{}double_blocks.0.img_attn.norm.key_norm.scale'.format(key_prefix) in state_dict_keys and ('{}img_in.weight'.format(key_prefix) in state_dict_keys or f"{key_prefix}distilled_guidance_layer.norms.0.scale" in state_dict_keys): #Flux, Chroma or Chroma Radiance (has no img_in.weight)

@@ -208,12 +196,12 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
dit_config["theta"] = 2000
dit_config["out_channels"] = 128
dit_config["global_modulation"] = True
- dit_config["vec_in_dim"] = None
dit_config["mlp_silu_act"] = True
dit_config["qkv_bias"] = False
dit_config["ops_bias"] = False
dit_config["default_ref_method"] = "index"
dit_config["ref_index_scale"] = 10.0
+ dit_config["txt_ids_dims"] = [3]
patch_size = 1
else:
dit_config["image_model"] = "flux"

@@ -223,6 +211,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
dit_config["theta"] = 10000
dit_config["out_channels"] = 16
dit_config["qkv_bias"] = True
+ dit_config["txt_ids_dims"] = []
patch_size = 2

dit_config["in_channels"] = 16

@@ -245,6 +234,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
vec_in_key = '{}vector_in.in_layer.weight'.format(key_prefix)
if vec_in_key in state_dict_keys:
dit_config["vec_in_dim"] = state_dict[vec_in_key].shape[1]
+ else:
+ dit_config["vec_in_dim"] = None

dit_config["depth"] = count_blocks(state_dict_keys, '{}double_blocks.'.format(key_prefix) + '{}.')
dit_config["depth_single_blocks"] = count_blocks(state_dict_keys, '{}single_blocks.'.format(key_prefix) + '{}.')

@@ -268,8 +259,17 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
dit_config["nerf_tile_size"] = 512
dit_config["nerf_final_head_type"] = "conv" if f"{key_prefix}nerf_final_layer_conv.norm.scale" in state_dict_keys else "linear"
dit_config["nerf_embedder_dtype"] = torch.float32
+ if "__x0__" in state_dict_keys: # x0 pred
+ dit_config["use_x0"] = True
+ else:
+ dit_config["use_x0"] = False
else:
dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys
+ dit_config["yak_mlp"] = '{}double_blocks.0.img_mlp.gate_proj.weight'.format(key_prefix) in state_dict_keys
+ dit_config["txt_norm"] = "{}txt_norm.scale".format(key_prefix) in state_dict_keys
+ if dit_config["yak_mlp"] and dit_config["txt_norm"]: # Ovis model
+ dit_config["txt_ids_dims"] = [1, 2]

return dit_config

if '{}t5_yproj.weight'.format(key_prefix) in state_dict_keys: #Genmo mochi preview

@@ -429,6 +429,9 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
dit_config["axes_lens"] = [300, 512, 512]
dit_config["rope_theta"] = 10000.0
dit_config["ffn_dim_multiplier"] = 4.0
+ ctd_weight = state_dict.get('{}clip_text_pooled_proj.0.weight'.format(key_prefix), None)
+ if ctd_weight is not None:
+ dit_config["clip_text_dim"] = ctd_weight.shape[0]
elif dit_config["dim"] == 3840: # Z image
dit_config["n_heads"] = 30
dit_config["n_kv_heads"] = 30

@@ -617,6 +620,24 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
dit_config["num_layers"] = count_blocks(state_dict_keys, '{}transformer_blocks.'.format(key_prefix) + '{}.')
return dit_config

+ if '{}visual_transformer_blocks.0.cross_attention.key_norm.weight'.format(key_prefix) in state_dict_keys: # Kandinsky 5
+ dit_config = {}
+ model_dim = state_dict['{}visual_embeddings.in_layer.bias'.format(key_prefix)].shape[0]
+ dit_config["model_dim"] = model_dim
+ if model_dim in [4096, 2560]: # pro video and lite image
+ dit_config["axes_dims"] = (32, 48, 48)
+ if model_dim == 2560: # lite image
+ dit_config["rope_scale_factor"] = (1.0, 1.0, 1.0)
+ elif model_dim == 1792: # lite video
+ dit_config["axes_dims"] = (16, 24, 24)
+ dit_config["time_dim"] = state_dict['{}time_embeddings.in_layer.bias'.format(key_prefix)].shape[0]
+ dit_config["image_model"] = "kandinsky5"
+ dit_config["ff_dim"] = state_dict['{}visual_transformer_blocks.0.feed_forward.in_layer.weight'.format(key_prefix)].shape[0]
+ dit_config["visual_embed_dim"] = state_dict['{}visual_embeddings.in_layer.weight'.format(key_prefix)].shape[1]
+ dit_config["num_text_blocks"] = count_blocks(state_dict_keys, '{}text_transformer_blocks.'.format(key_prefix) + '{}.')
+ dit_config["num_visual_blocks"] = count_blocks(state_dict_keys, '{}visual_transformer_blocks.'.format(key_prefix) + '{}.')
+ return dit_config

if '{}input_blocks.0.0.weight'.format(key_prefix) not in state_dict_keys:
return None

@@ -759,22 +780,11 @@ def model_config_from_unet(state_dict, unet_key_prefix, use_base_if_no_match=Fal
if model_config is None and use_base_if_no_match:
model_config = comfy.supported_models_base.BASE(unet_config)

- scaled_fp8_key = "{}scaled_fp8".format(unet_key_prefix)
- if scaled_fp8_key in state_dict:
- scaled_fp8_weight = state_dict.pop(scaled_fp8_key)
- model_config.scaled_fp8 = scaled_fp8_weight.dtype
- if model_config.scaled_fp8 == torch.float32:
- model_config.scaled_fp8 = torch.float8_e4m3fn
- if scaled_fp8_weight.nelement() == 2:
- model_config.optimizations["fp8"] = False
- else:
- model_config.optimizations["fp8"] = True

# Detect per-layer quantization (mixed precision)
- layer_quant_config = detect_layer_quantization(metadata)
+ quant_config = comfy.utils.detect_layer_quantization(state_dict, unet_key_prefix)
- if layer_quant_config:
+ if quant_config:
- model_config.layer_quant_config = layer_quant_config
+ model_config.quant_config = quant_config
- logging.info(f"Detected mixed precision quantization: {len(layer_quant_config)} layers quantized")
+ logging.info("Detected mixed precision quantization")

return model_config
@@ -689,7 +689,7 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu
         loaded_memory = loaded_model.model_loaded_memory()
         current_free_mem = get_free_memory(torch_dev) + loaded_memory

-        lowvram_model_memory = max(128 * 1024 * 1024, (current_free_mem - minimum_memory_required), min(current_free_mem * MIN_WEIGHT_MEMORY_RATIO, current_free_mem - minimum_inference_memory()))
+        lowvram_model_memory = max(0, (current_free_mem - minimum_memory_required), min(current_free_mem * MIN_WEIGHT_MEMORY_RATIO, current_free_mem - minimum_inference_memory()))
         lowvram_model_memory = lowvram_model_memory - loaded_memory

         if lowvram_model_memory == 0:
@@ -1012,9 +1012,18 @@ def force_channels_last():


 STREAMS = {}
-NUM_STREAMS = 1
-if args.async_offload:
-    NUM_STREAMS = 2
+NUM_STREAMS = 0
+if args.async_offload is not None:
+    NUM_STREAMS = args.async_offload
+else:
+    # Enable by default on Nvidia
+    if is_nvidia():
+        NUM_STREAMS = 2
+
+if args.disable_async_offload:
+    NUM_STREAMS = 0
+
+if NUM_STREAMS > 0:
     logging.info("Using async weight offloading with {} streams".format(NUM_STREAMS))

 def current_stream(device):
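The hunk above turns the old on/off switch into a stream count: an explicit `--async-offload N` value wins, Nvidia devices now default to two offload streams, and `--disable-async-offload` forces the feature off. A condensed sketch of that precedence (not part of the diff; the helper name is made up for illustration):

    def resolve_num_streams(async_offload, disable_async_offload, on_nvidia):
        # Explicit CLI value wins; otherwise Nvidia defaults to 2 streams, others to 0.
        num_streams = async_offload if async_offload is not None else (2 if on_nvidia else 0)
        # The disable flag always turns the feature off again.
        return 0 if disable_async_offload else num_streams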
@@ -1030,7 +1039,10 @@ def current_stream(device):
 stream_counters = {}
 def get_offload_stream(device):
     stream_counter = stream_counters.get(device, 0)
-    if NUM_STREAMS <= 1:
+    if NUM_STREAMS == 0:
+        return None
+
+    if torch.compiler.is_compiling():
         return None

     if device in STREAMS:
@@ -1043,7 +1055,9 @@ def get_offload_stream(device):
     elif is_device_cuda(device):
         ss = []
         for k in range(NUM_STREAMS):
-            ss.append(torch.cuda.Stream(device=device, priority=0))
+            s1 = torch.cuda.Stream(device=device, priority=0)
+            s1.as_context = torch.cuda.stream
+            ss.append(s1)
         STREAMS[device] = ss
         s = ss[stream_counter]
         stream_counters[device] = stream_counter
@@ -1051,7 +1065,9 @@ def get_offload_stream(device):
     elif is_device_xpu(device):
         ss = []
         for k in range(NUM_STREAMS):
-            ss.append(torch.xpu.Stream(device=device, priority=0))
+            s1 = torch.xpu.Stream(device=device, priority=0)
+            s1.as_context = torch.xpu.stream
+            ss.append(s1)
         STREAMS[device] = ss
         s = ss[stream_counter]
         stream_counters[device] = stream_counter
@@ -1069,12 +1085,19 @@ def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, str
             if dtype is None or weight.dtype == dtype:
                 return weight
         if stream is not None:
-            with stream:
+            wf_context = stream
+            if hasattr(wf_context, "as_context"):
+                wf_context = wf_context.as_context(stream)
+            with wf_context:
                 return weight.to(dtype=dtype, copy=copy)
         return weight.to(dtype=dtype, copy=copy)

+
     if stream is not None:
-        with stream:
+        wf_context = stream
+        if hasattr(wf_context, "as_context"):
+            wf_context = wf_context.as_context(stream)
+        with wf_context:
             r = torch.empty_like(weight, dtype=dtype, device=device)
             r.copy_(weight, non_blocking=non_blocking)
     else:
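The `as_context` attribute attached to each stream in the previous hunks is what `cast_to` uses here: instead of relying on the stream object itself behaving as a context manager, the copy is wrapped in `torch.cuda.stream(...)` (or `torch.xpu.stream(...)`). A minimal standalone sketch of the same pattern, assuming a CUDA build of PyTorch (the helper name is illustrative, not from the diff):

    import contextlib
    import torch

    def copy_on_stream(t, dtype, stream=None):
        # torch.cuda.stream(s) makes `s` the current stream for the enclosed ops,
        # so the dtype/device copy is issued on the side (offload) stream.
        ctx = torch.cuda.stream(stream) if stream is not None else contextlib.nullcontext()
        with ctx:
            return t.to(dtype=dtype, copy=True)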
@@ -1469,6 +1492,20 @@ def extended_fp16_support():

     return True

+LORA_COMPUTE_DTYPES = {}
+def lora_compute_dtype(device):
+    dtype = LORA_COMPUTE_DTYPES.get(device, None)
+    if dtype is not None:
+        return dtype
+
+    if should_use_fp16(device):
+        dtype = torch.float16
+    else:
+        dtype = torch.float32
+
+    LORA_COMPUTE_DTYPES[device] = dtype
+    return dtype
+
 def soft_empty_cache(force=False):
     global cpu_state
     if cpu_state == CPUState.MPS:
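`lora_compute_dtype` memoizes, per device, the dtype used for the temporary weights that LoRA patching works on (see the `patch_weight_to_device` hunk further down, which replaces the hard-coded `torch.float32` with this value). On fp16-capable GPUs that roughly halves the scratch copy: a 2-billion-parameter weight set needs about 8 GB of temporaries in float32 but about 4 GB in float16. A toy sketch of the memoization pattern (illustrative only, not the diff):

    import torch

    _CACHE = {}

    def cached_compute_dtype(device, prefers_fp16):
        # Decide once per device, then reuse the answer for every later patch.
        if device not in _CACHE:
            _CACHE[device] = torch.float16 if prefers_fp16 else torch.float32
        return _CACHE[device]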
@@ -35,6 +35,7 @@ import comfy.model_management
 import comfy.patcher_extension
 import comfy.utils
 from comfy.comfy_types import UnetWrapperFunction
+from comfy.quant_ops import QuantizedTensor
 from comfy.patcher_extension import CallbacksMP, PatcherInjection, WrappersMP


@@ -126,27 +127,23 @@ class LowVramPatch:
     def __init__(self, key, patches, convert_func=None, set_func=None):
         self.key = key
         self.patches = patches
-        self.convert_func = convert_func
+        self.convert_func = convert_func # TODO: remove
         self.set_func = set_func

     def __call__(self, weight):
-        intermediate_dtype = weight.dtype
-        if self.convert_func is not None:
-            weight = self.convert_func(weight, inplace=False)
+        return comfy.lora.calculate_weight(self.patches[self.key], weight, self.key, intermediate_dtype=weight.dtype)

-        if intermediate_dtype not in [torch.float32, torch.float16, torch.bfloat16]: #intermediate_dtype has to be one that is supported in math ops
-            intermediate_dtype = torch.float32
-            out = comfy.lora.calculate_weight(self.patches[self.key], weight.to(intermediate_dtype), self.key, intermediate_dtype=intermediate_dtype)
-            if self.set_func is None:
-                return comfy.float.stochastic_rounding(out, weight.dtype, seed=string_to_seed(self.key))
-            else:
-                return self.set_func(out, seed=string_to_seed(self.key), return_weight=True)
+LOWVRAM_PATCH_ESTIMATE_MATH_FACTOR = 2

-        out = comfy.lora.calculate_weight(self.patches[self.key], weight, self.key, intermediate_dtype=intermediate_dtype)
-        if self.set_func is not None:
-            return self.set_func(out, seed=string_to_seed(self.key), return_weight=True).to(dtype=intermediate_dtype)
-        else:
-            return out
+def low_vram_patch_estimate_vram(model, key):
+    weight, set_func, convert_func = get_key_weight(model, key)
+    if weight is None:
+        return 0
+    model_dtype = getattr(model, "manual_cast_dtype", torch.float32)
+    if model_dtype is None:
+        model_dtype = weight.dtype
+
+    return weight.numel() * model_dtype.itemsize * LOWVRAM_PATCH_ESTIMATE_MATH_FACTOR

 def get_key_weight(model, key):
     set_func = None
@@ -269,6 +266,9 @@ class ModelPatcher:
         if not hasattr(self.model, 'current_weight_patches_uuid'):
             self.model.current_weight_patches_uuid = None

+        if not hasattr(self.model, 'model_offload_buffer_memory'):
+            self.model.model_offload_buffer_memory = 0
+
     def model_size(self):
         if self.size > 0:
             return self.size
@@ -454,6 +454,9 @@ class ModelPatcher:
     def set_model_post_input_patch(self, patch):
         self.set_model_patch(patch, "post_input")

+    def set_model_noise_refiner_patch(self, patch):
+        self.set_model_patch(patch, "noise_refiner")
+
     def set_model_rope_options(self, scale_x, shift_x, scale_y, shift_y, scale_t, shift_t, **kwargs):
         rope_options = self.model_options["transformer_options"].get("rope_options", {})
         rope_options["scale_x"] = scale_x
@@ -618,10 +621,11 @@ class ModelPatcher:
         if key not in self.backup:
             self.backup[key] = collections.namedtuple('Dimension', ['weight', 'inplace_update'])(weight.to(device=self.offload_device, copy=inplace_update), inplace_update)

+        temp_dtype = comfy.model_management.lora_compute_dtype(device_to)
         if device_to is not None:
-            temp_weight = comfy.model_management.cast_to_device(weight, device_to, torch.float32, copy=True)
+            temp_weight = comfy.model_management.cast_to_device(weight, device_to, temp_dtype, copy=True)
         else:
-            temp_weight = weight.to(torch.float32, copy=True)
+            temp_weight = weight.to(temp_dtype, copy=True)
         if convert_func is not None:
             temp_weight = convert_func(temp_weight, inplace=True)

@@ -662,7 +666,22 @@ class ModelPatcher:
                     skip = True # skip random weights in non leaf modules
                     break
             if not skip and (hasattr(m, "comfy_cast_weights") or len(params) > 0):
-                loading.append((comfy.model_management.module_size(m), n, m, params))
+                module_mem = comfy.model_management.module_size(m)
+                module_offload_mem = module_mem
+                if hasattr(m, "comfy_cast_weights"):
+                    def check_module_offload_mem(key):
+                        if key in self.patches:
+                            return low_vram_patch_estimate_vram(self.model, key)
+                        model_dtype = getattr(self.model, "manual_cast_dtype", None)
+                        weight, _, _ = get_key_weight(self.model, key)
+                        if model_dtype is None or weight is None:
+                            return 0
+                        if (weight.dtype != model_dtype or isinstance(weight, QuantizedTensor)):
+                            return weight.numel() * model_dtype.itemsize
+                        return 0
+                    module_offload_mem += check_module_offload_mem("{}.weight".format(n))
+                    module_offload_mem += check_module_offload_mem("{}.bias".format(n))
+                loading.append((module_offload_mem, module_mem, n, m, params))
         return loading

     def load(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False, full_load=False):
@@ -676,20 +695,22 @@ class ModelPatcher:

         load_completely = []
         offloaded = []
+        offload_buffer = 0
         loading.sort(reverse=True)
-        for x in loading:
-            n = x[1]
-            m = x[2]
-            params = x[3]
-            module_mem = x[0]
+        for i, x in enumerate(loading):
+            module_offload_mem, module_mem, n, m, params = x

             lowvram_weight = False

+            potential_offload = max(offload_buffer, module_offload_mem + sum([ x1[1] for x1 in loading[i+1:i+1+comfy.model_management.NUM_STREAMS]]))
+            lowvram_fits = mem_counter + module_mem + potential_offload < lowvram_model_memory
+
             weight_key = "{}.weight".format(n)
             bias_key = "{}.bias".format(n)

             if not full_load and hasattr(m, "comfy_cast_weights"):
-                if mem_counter + module_mem >= lowvram_model_memory:
+                if not lowvram_fits:
+                    offload_buffer = potential_offload
                     lowvram_weight = True
                     lowvram_counter += 1
                     lowvram_mem_counter += module_mem
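The `potential_offload` term above reserves room for weights the async offload streams may still be holding in flight: it is the larger of the current reserve and this module's estimated offload cost plus the sizes of the next `NUM_STREAMS` modules in the size-sorted loading list, and a module only stays resident if its own size plus that reserve still fits the budget. A worked check with assumed numbers (illustrative, not from the diff):

    # NUM_STREAMS = 2, this module's offload estimate = 300 MB,
    # next two module sizes = 200 MB and 150 MB, no existing reserve, 1200 MB budget left
    potential_offload = max(0, 300 + (200 + 150))   # = 650 MB
    lowvram_fits = 0 + 300 + 650 < 1200             # True -> module is kept resident, reserve stays 0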
@@ -723,9 +744,11 @@ class ModelPatcher:
             if hasattr(m, "comfy_cast_weights"):
                 wipe_lowvram_weight(m)

-            if full_load or mem_counter + module_mem < lowvram_model_memory:
+            if full_load or lowvram_fits:
                 mem_counter += module_mem
                 load_completely.append((module_mem, n, m, params))
+            else:
+                offload_buffer = potential_offload

             if cast_weight and hasattr(m, "comfy_cast_weights"):
                 m.prev_comfy_cast_weights = m.comfy_cast_weights
@@ -752,6 +775,8 @@ class ModelPatcher:
                 key = "{}.{}".format(n, param)
                 self.unpin_weight(key)
                 self.patch_weight_to_device(key, device_to=device_to)
+                if comfy.model_management.is_device_cuda(device_to):
+                    torch.cuda.synchronize()

             logging.debug("lowvram: loaded module regularly {} {}".format(n, m))
             m.comfy_patched_weights = True
@@ -766,7 +791,7 @@ class ModelPatcher:
                     self.pin_weight_to_device("{}.{}".format(n, param))

         if lowvram_counter > 0:
-            logging.info("loaded partially; {:.2f} MB usable, {:.2f} MB loaded, {:.2f} MB offloaded, lowvram patches: {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), lowvram_mem_counter / (1024 * 1024), patch_counter))
+            logging.info("loaded partially; {:.2f} MB usable, {:.2f} MB loaded, {:.2f} MB offloaded, {:.2f} MB buffer reserved, lowvram patches: {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), lowvram_mem_counter / (1024 * 1024), offload_buffer / (1024 * 1024), patch_counter))
             self.model.model_lowvram = True
         else:
             logging.info("loaded completely; {:.2f} MB usable, {:.2f} MB loaded, full load: {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), full_load))
@@ -778,6 +803,7 @@ class ModelPatcher:
         self.model.lowvram_patch_counter += patch_counter
         self.model.device = device_to
         self.model.model_loaded_weight_memory = mem_counter
+        self.model.model_offload_buffer_memory = offload_buffer
         self.model.current_weight_patches_uuid = self.patches_uuid

         for callback in self.get_all_callbacks(CallbacksMP.ON_LOAD):
@@ -831,6 +857,7 @@ class ModelPatcher:
             self.model.to(device_to)
             self.model.device = device_to
             self.model.model_loaded_weight_memory = 0
+            self.model.model_offload_buffer_memory = 0

             for m in self.model.modules():
                 if hasattr(m, "comfy_patched_weights"):
@@ -849,13 +876,18 @@ class ModelPatcher:
         patch_counter = 0
         unload_list = self._load_list()
         unload_list.sort()
+
+        offload_buffer = self.model.model_offload_buffer_memory
+        if len(unload_list) > 0:
+            NS = comfy.model_management.NUM_STREAMS
+            offload_weight_factor = [ min(offload_buffer / (NS + 1), unload_list[0][1]) ] * NS
+
         for unload in unload_list:
-            if memory_to_free < memory_freed:
+            if memory_to_free + offload_buffer - self.model.model_offload_buffer_memory < memory_freed:
                 break
-            module_mem = unload[0]
-            n = unload[1]
-            m = unload[2]
-            params = unload[3]
+            module_offload_mem, module_mem, n, m, params = unload
+
+            potential_offload = module_offload_mem + sum(offload_weight_factor)

             lowvram_possible = hasattr(m, "comfy_cast_weights")
             if hasattr(m, "comfy_patched_weights") and m.comfy_patched_weights == True:
@@ -901,20 +933,25 @@ class ModelPatcher:
                         patch_counter += 1
                     cast_weight = True

-                if cast_weight:
+                if cast_weight and hasattr(m, "comfy_cast_weights"):
                     m.prev_comfy_cast_weights = m.comfy_cast_weights
                     m.comfy_cast_weights = True
                 m.comfy_patched_weights = False
                 memory_freed += module_mem
+                offload_buffer = max(offload_buffer, potential_offload)
+                offload_weight_factor.append(module_mem)
+                offload_weight_factor.pop(0)
                 logging.debug("freed {}".format(n))

             for param in params:
                 self.pin_weight_to_device("{}.{}".format(n, param))

+
         self.model.model_lowvram = True
         self.model.lowvram_patch_counter += patch_counter
         self.model.model_loaded_weight_memory -= memory_freed
-        logging.info("loaded partially: {:.2f} MB loaded, lowvram patches: {}".format(self.model.model_loaded_weight_memory / (1024 * 1024), self.model.lowvram_patch_counter))
+        self.model.model_offload_buffer_memory = offload_buffer
+        logging.info("Unloaded partially: {:.2f} MB freed, {:.2f} MB remains loaded, {:.2f} MB buffer reserved, lowvram patches: {}".format(memory_freed / (1024 * 1024), self.model.model_loaded_weight_memory / (1024 * 1024), offload_buffer / (1024 * 1024), self.model.lowvram_patch_counter))
         return memory_freed

     def partially_load(self, device_to, extra_memory=0, force_patch_weights=False):
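`offload_weight_factor` in the unload path plays the same role as the lookahead in `load()`: it is a sliding window of the last `NUM_STREAMS` module sizes (seeded with a conservative share of the existing reserve), so `potential_offload` approximates how much memory the offload streams could still be holding while a module is evicted, and `offload_buffer` keeps the maximum seen. A stripped-down sketch of that bookkeeping (illustrative, not the diff itself):

    def update_reserve(offload_buffer, window, module_offload_mem, module_mem):
        # Reserve must cover this module plus whatever the async streams may still hold.
        potential = module_offload_mem + sum(window)
        offload_buffer = max(offload_buffer, potential)
        # Slide the window: newest module size in, oldest out.
        window.append(module_mem)
        window.pop(0)
        return offload_buffer, window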
comfy/ops.py (217 changed lines)
@@ -22,7 +22,7 @@ import comfy.model_management
 from comfy.cli_args import args, PerformanceFeature
 import comfy.float
 import comfy.rmsnorm
-import contextlib
+import json

 def run_every_op():
     if torch.compiler.is_compiling():
@@ -93,11 +93,6 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, of
     else:
         offload_stream = None

-    if offload_stream is not None:
-        wf_context = offload_stream
-    else:
-        wf_context = contextlib.nullcontext()
-
     non_blocking = comfy.model_management.device_supports_non_blocking(device)

     weight_has_function = len(s.weight_function) > 0
@@ -109,22 +104,24 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, of
     if s.bias is not None:
         bias = comfy.model_management.cast_to(s.bias, bias_dtype, device, non_blocking=non_blocking, copy=bias_has_function, stream=offload_stream)
-        if bias_has_function:
-            with wf_context:
-                for f in s.bias_function:
-                    bias = f(bias)
+
+    comfy.model_management.sync_stream(device, offload_stream)
+
+    bias_a = bias
+    weight_a = weight
+
+    if s.bias is not None:
+        for f in s.bias_function:
+            bias = f(bias)

     if weight_has_function or weight.dtype != dtype:
-        with wf_context:
-            weight = weight.to(dtype=dtype)
-            if isinstance(weight, QuantizedTensor):
-                weight = weight.dequantize()
-            for f in s.weight_function:
-                weight = f(weight)
+        weight = weight.to(dtype=dtype)
+        if isinstance(weight, QuantizedTensor):
+            weight = weight.dequantize()
+        for f in s.weight_function:
+            weight = f(weight)

-    comfy.model_management.sync_stream(device, offload_stream)
-
     if offloadable:
-        return weight, bias, offload_stream
+        return weight, bias, (offload_stream, weight_a, bias_a)
     else:
         #Legacy function signature
         return weight, bias
@@ -133,13 +130,16 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, of
 def uncast_bias_weight(s, weight, bias, offload_stream):
     if offload_stream is None:
         return
-    if weight is not None:
-        device = weight.device
+    os, weight_a, bias_a = offload_stream
+    if os is None:
+        return
+    if weight_a is not None:
+        device = weight_a.device
     else:
-        if bias is None:
+        if bias_a is None:
             return
-        device = bias.device
-    offload_stream.wait_stream(comfy.model_management.current_stream(device))
+        device = bias_a.device
+    os.wait_stream(comfy.model_management.current_stream(device))


 class CastWeightBiasOp:
@@ -415,22 +415,12 @@ def fp8_linear(self, input):

     if input.ndim == 3 or input.ndim == 2:
         w, bias, offload_stream = cast_bias_weight(self, input, dtype=dtype, bias_dtype=input_dtype, offloadable=True)

-        scale_weight = self.scale_weight
-        scale_input = self.scale_input
-        if scale_weight is None:
-            scale_weight = torch.ones((), device=input.device, dtype=torch.float32)
-        else:
-            scale_weight = scale_weight.to(input.device)
-
-        if scale_input is None:
-            scale_input = torch.ones((), device=input.device, dtype=torch.float32)
-            input = torch.clamp(input, min=-448, max=448, out=input)
-            layout_params_weight = {'scale': scale_input, 'orig_dtype': input_dtype}
-            quantized_input = QuantizedTensor(input.to(dtype).contiguous(), "TensorCoreFP8Layout", layout_params_weight)
-        else:
-            scale_input = scale_input.to(input.device)
-            quantized_input = QuantizedTensor.from_float(input, "TensorCoreFP8Layout", scale=scale_input, dtype=dtype)
+        scale_weight = torch.ones((), device=input.device, dtype=torch.float32)
+        scale_input = torch.ones((), device=input.device, dtype=torch.float32)
+        input = torch.clamp(input, min=-448, max=448, out=input)
+        layout_params_weight = {'scale': scale_input, 'orig_dtype': input_dtype}
+        quantized_input = QuantizedTensor(input.to(dtype).contiguous(), "TensorCoreFP8Layout", layout_params_weight)

         # Wrap weight in QuantizedTensor - this enables unified dispatch
         # Call F.linear - __torch_dispatch__ routes to fp8_linear handler in quant_ops.py!
@@ -451,7 +441,7 @@ class fp8_ops(manual_cast):
         return None

     def forward_comfy_cast_weights(self, input):
-        if not self.training:
+        if len(self.weight_function) == 0 and len(self.bias_function) == 0:
             try:
                 out = fp8_linear(self, input)
                 if out is not None:
@@ -464,59 +454,6 @@ class fp8_ops(manual_cast):
             uncast_bias_weight(self, weight, bias, offload_stream)
             return x

-def scaled_fp8_ops(fp8_matrix_mult=False, scale_input=False, override_dtype=None):
-    logging.info("Using scaled fp8: fp8 matrix mult: {}, scale input: {}".format(fp8_matrix_mult, scale_input))
-    class scaled_fp8_op(manual_cast):
-        class Linear(manual_cast.Linear):
-            def __init__(self, *args, **kwargs):
-                if override_dtype is not None:
-                    kwargs['dtype'] = override_dtype
-                super().__init__(*args, **kwargs)
-
-            def reset_parameters(self):
-                if not hasattr(self, 'scale_weight'):
-                    self.scale_weight = torch.nn.parameter.Parameter(data=torch.ones((), device=self.weight.device, dtype=torch.float32), requires_grad=False)
-
-                if not scale_input:
-                    self.scale_input = None
-
-                if not hasattr(self, 'scale_input'):
-                    self.scale_input = torch.nn.parameter.Parameter(data=torch.ones((), device=self.weight.device, dtype=torch.float32), requires_grad=False)
-                return None
-
-            def forward_comfy_cast_weights(self, input):
-                if fp8_matrix_mult:
-                    out = fp8_linear(self, input)
-                    if out is not None:
-                        return out
-
-                weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True)
-
-                if weight.numel() < input.numel(): #TODO: optimize
-                    x = torch.nn.functional.linear(input, weight * self.scale_weight.to(device=weight.device, dtype=weight.dtype), bias)
-                else:
-                    x = torch.nn.functional.linear(input * self.scale_weight.to(device=weight.device, dtype=weight.dtype), weight, bias)
-                uncast_bias_weight(self, weight, bias, offload_stream)
-                return x
-
-            def convert_weight(self, weight, inplace=False, **kwargs):
-                if inplace:
-                    weight *= self.scale_weight.to(device=weight.device, dtype=weight.dtype)
-                    return weight
-                else:
-                    return weight.to(dtype=torch.float32) * self.scale_weight.to(device=weight.device, dtype=torch.float32)
-
-            def set_weight(self, weight, inplace_update=False, seed=None, return_weight=False, **kwargs):
-                weight = comfy.float.stochastic_rounding(weight / self.scale_weight.to(device=weight.device, dtype=weight.dtype), self.weight.dtype, seed=seed)
-                if return_weight:
-                    return weight
-                if inplace_update:
-                    self.weight.data.copy_(weight)
-                else:
-                    self.weight = torch.nn.Parameter(weight, requires_grad=False)
-
-    return scaled_fp8_op
-
 CUBLAS_IS_AVAILABLE = False
 try:
     from cublas_ops import CublasLinear
@@ -543,9 +480,9 @@ if CUBLAS_IS_AVAILABLE:
 from .quant_ops import QuantizedTensor, QUANT_ALGOS


-def mixed_precision_ops(layer_quant_config={}, compute_dtype=torch.bfloat16, full_precision_mm=False):
+def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_precision_mm=False):
     class MixedPrecisionOps(manual_cast):
-        _layer_quant_config = layer_quant_config
+        _quant_config = quant_config
         _compute_dtype = compute_dtype
         _full_precision_mm = full_precision_mm

@@ -560,15 +497,14 @@ def mixed_precision_ops(layer_quant_config={}, compute_dtype=torch.bfloat16, ful
         ) -> None:
             super().__init__()

-            self.factory_kwargs = {"device": device, "dtype": MixedPrecisionOps._compute_dtype}
-            # self.factory_kwargs = {"device": device, "dtype": dtype}
+            if dtype is None:
+                dtype = MixedPrecisionOps._compute_dtype
+
+            self.factory_kwargs = {"device": device, "dtype": dtype}

             self.in_features = in_features
             self.out_features = out_features
-            if bias:
-                self.bias = torch.nn.Parameter(torch.empty(out_features, **self.factory_kwargs))
-            else:
-                self.register_parameter("bias", None)
+            self._has_bias = bias

             self.tensor_class = None
             self._full_precision_mm = MixedPrecisionOps._full_precision_mm
@@ -588,36 +524,59 @@ def mixed_precision_ops(layer_quant_config={}, compute_dtype=torch.bfloat16, ful

            manually_loaded_keys = [weight_key]

-            if layer_name not in MixedPrecisionOps._layer_quant_config:
-                self.weight = torch.nn.Parameter(weight.to(device=device, dtype=MixedPrecisionOps._compute_dtype), requires_grad=False)
+            layer_conf = state_dict.pop(f"{prefix}comfy_quant", None)
+            if layer_conf is not None:
+                layer_conf = json.loads(layer_conf.numpy().tobytes())
+
+            if layer_conf is None:
+                dtype = self.factory_kwargs["dtype"]
+                self.weight = torch.nn.Parameter(weight.to(device=device, dtype=dtype), requires_grad=False)
+                if dtype != MixedPrecisionOps._compute_dtype:
+                    self.comfy_cast_weights = True
+                if self._has_bias:
+                    self.bias = torch.nn.Parameter(torch.empty(self.out_features, device=device, dtype=dtype))
+                else:
+                    self.register_parameter("bias", None)
             else:
-                quant_format = MixedPrecisionOps._layer_quant_config[layer_name].get("format", None)
-                if quant_format is None:
+                self.quant_format = layer_conf.get("format", None)
+                if not self._full_precision_mm:
+                    self._full_precision_mm = layer_conf.get("full_precision_matrix_mult", False)
+
+                if self.quant_format is None:
                     raise ValueError(f"Unknown quantization format for layer {layer_name}")

-                qconfig = QUANT_ALGOS[quant_format]
+                qconfig = QUANT_ALGOS[self.quant_format]
                 self.layout_type = qconfig["comfy_tensor_layout"]

                 weight_scale_key = f"{prefix}weight_scale"
+                scale = state_dict.pop(weight_scale_key, None)
+                if scale is not None:
+                    scale = scale.to(device)
                 layout_params = {
-                    'scale': state_dict.pop(weight_scale_key, None),
+                    'scale': scale,
                     'orig_dtype': MixedPrecisionOps._compute_dtype,
                     'block_size': qconfig.get("group_size", None),
                 }
-                if layout_params['scale'] is not None:
+
+                if scale is not None:
                     manually_loaded_keys.append(weight_scale_key)

                 self.weight = torch.nn.Parameter(
-                    QuantizedTensor(weight.to(device=device), self.layout_type, layout_params),
+                    QuantizedTensor(weight.to(device=device, dtype=qconfig.get("storage_t", None)), self.layout_type, layout_params),
                     requires_grad=False
                 )

+                if self._has_bias:
+                    self.bias = torch.nn.Parameter(torch.empty(self.out_features, device=device, dtype=MixedPrecisionOps._compute_dtype))
+                else:
+                    self.register_parameter("bias", None)
+
                 for param_name in qconfig["parameters"]:
                     param_key = f"{prefix}{param_name}"
                     _v = state_dict.pop(param_key, None)
                     if _v is None:
                         continue
-                    setattr(self, param_name, torch.nn.Parameter(_v.to(device=device), requires_grad=False))
+                    self.register_parameter(param_name, torch.nn.Parameter(_v.to(device=device), requires_grad=False))
                     manually_loaded_keys.append(param_key)

             super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
@@ -626,6 +585,16 @@ def mixed_precision_ops(layer_quant_config={}, compute_dtype=torch.bfloat16, ful
                 if key in missing_keys:
                     missing_keys.remove(key)

+        def state_dict(self, *args, destination=None, prefix="", **kwargs):
+            sd = super().state_dict(*args, destination=destination, prefix=prefix, **kwargs)
+            if isinstance(self.weight, QuantizedTensor):
+                sd["{}weight_scale".format(prefix)] = self.weight._layout_params['scale']
+                quant_conf = {"format": self.quant_format}
+                if self._full_precision_mm:
+                    quant_conf["full_precision_matrix_mult"] = True
+                sd["{}comfy_quant".format(prefix)] = torch.tensor(list(json.dumps(quant_conf).encode('utf-8')), dtype=torch.uint8)
+            return sd
+
         def _forward(self, input, weight, bias):
             return torch.nn.functional.linear(input, weight, bias)

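The `comfy_quant` entry added above stores the per-layer quantization config directly in the checkpoint as a uint8 tensor holding JSON bytes, which is exactly what the `_load_from_state_dict` hunk earlier reads back. A minimal round-trip check of that encoding (the config values are examples, not from the diff):

    import json
    import torch

    conf = {"format": "example_fp8_format", "full_precision_matrix_mult": True}  # illustrative values
    blob = torch.tensor(list(json.dumps(conf).encode('utf-8')), dtype=torch.uint8)  # what state_dict() stores
    restored = json.loads(blob.numpy().tobytes())                                   # what loading does
    assert restored == conf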
@@ -641,9 +610,8 @@ def mixed_precision_ops(layer_quant_config={}, compute_dtype=torch.bfloat16, ful
             if self._full_precision_mm or self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
                 return self.forward_comfy_cast_weights(input, *args, **kwargs)
             if (getattr(self, 'layout_type', None) is not None and
-                getattr(self, 'input_scale', None) is not None and
                 not isinstance(input, QuantizedTensor)):
-                input = QuantizedTensor.from_float(input, self.layout_type, scale=self.input_scale, dtype=self.weight.dtype)
+                input = QuantizedTensor.from_float(input, self.layout_type, scale=getattr(self, 'input_scale', None), dtype=self.weight.dtype)
             return self._forward(input, self.weight, self.bias)

         def convert_weight(self, weight, inplace=False, **kwargs):
@@ -654,7 +622,7 @@ def mixed_precision_ops(layer_quant_config={}, compute_dtype=torch.bfloat16, ful

         def set_weight(self, weight, inplace_update=False, seed=None, return_weight=False, **kwargs):
             if getattr(self, 'layout_type', None) is not None:
-                weight = QuantizedTensor.from_float(weight, self.layout_type, scale=None, dtype=self.weight.dtype, stochastic_rounding=seed, inplace_ops=True)
+                weight = QuantizedTensor.from_float(weight, self.layout_type, scale="recalculate", dtype=self.weight.dtype, stochastic_rounding=seed, inplace_ops=True)
             else:
                 weight = weight.to(self.weight.dtype)
             if return_weight:
@@ -663,17 +631,28 @@ def mixed_precision_ops(layer_quant_config={}, compute_dtype=torch.bfloat16, ful
             assert inplace_update is False # TODO: eventually remove the inplace_update stuff
             self.weight = torch.nn.Parameter(weight, requires_grad=False)

+        def _apply(self, fn, recurse=True): # This is to get torch.compile + moving weights to another device working
+            if recurse:
+                for module in self.children():
+                    module._apply(fn)
+
+            for key, param in self._parameters.items():
+                if param is None:
+                    continue
+                self.register_parameter(key, torch.nn.Parameter(fn(param), requires_grad=False))
+            for key, buf in self._buffers.items():
+                if buf is not None:
+                    self._buffers[key] = fn(buf)
+            return self
+
     return MixedPrecisionOps

-def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_fp8=False, fp8_optimizations=False, scaled_fp8=None, model_config=None):
+def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_fp8=False, fp8_optimizations=False, model_config=None):
     fp8_compute = comfy.model_management.supports_fp8_compute(load_device) # TODO: if we support more ops this needs to be more granular

-    if model_config and hasattr(model_config, 'layer_quant_config') and model_config.layer_quant_config:
-        logging.info(f"Using mixed precision operations: {len(model_config.layer_quant_config)} quantized layers")
-        return mixed_precision_ops(model_config.layer_quant_config, compute_dtype, full_precision_mm=not fp8_compute)
+    if model_config and hasattr(model_config, 'quant_config') and model_config.quant_config:
+        logging.info("Using mixed precision operations")
+        return mixed_precision_ops(model_config.quant_config, compute_dtype, full_precision_mm=not fp8_compute)

-    if scaled_fp8 is not None:
-        return scaled_fp8_ops(fp8_matrix_mult=fp8_compute and fp8_optimizations, scale_input=fp8_optimizations, override_dtype=scaled_fp8)
-
     if (
         fp8_compute and
@@ -235,8 +235,11 @@ class QuantizedTensor(torch.Tensor):
     def is_pinned(self):
         return self._qdata.is_pinned()

-    def is_contiguous(self):
-        return self._qdata.is_contiguous()
+    def is_contiguous(self, *arg, **kwargs):
+        return self._qdata.is_contiguous(*arg, **kwargs)
+
+    def storage(self):
+        return self._qdata.storage()

 # ==============================================================================
 # Generic Utilities (Layout-Agnostic Operations)
@@ -249,12 +252,6 @@ def _create_transformed_qtensor(qt, transform_fn):


 def _handle_device_transfer(qt, target_device, target_dtype=None, target_layout=None, op_name="to"):
-    if target_dtype is not None and target_dtype != qt.dtype:
-        logging.warning(
-            f"QuantizedTensor: dtype conversion requested to {target_dtype}, "
-            f"but not supported for quantized tensors. Ignoring dtype."
-        )
-
     if target_layout is not None and target_layout != torch.strided:
         logging.warning(
             f"QuantizedTensor: layout change requested to {target_layout}, "
@@ -274,6 +271,8 @@ def _handle_device_transfer(qt, target_device, target_dtype=None, target_layout=
     logging.debug(f"QuantizedTensor.{op_name}: Moving from {current_device} to {target_device}")
     new_q_data = qt._qdata.to(device=target_device)
     new_params = _move_layout_params_to_device(qt._layout_params, target_device)
+    if target_dtype is not None:
+        new_params["orig_dtype"] = target_dtype
     new_qt = QuantizedTensor(new_q_data, qt._layout_type, new_params)
     logging.debug(f"QuantizedTensor.{op_name}: Created new tensor on {target_device}")
     return new_qt
@@ -339,7 +338,9 @@ def generic_copy_(func, args, kwargs):
         # Copy from another quantized tensor
         qt_dest._qdata.copy_(src._qdata, non_blocking=non_blocking)
         qt_dest._layout_type = src._layout_type
+        orig_dtype = qt_dest._layout_params["orig_dtype"]
         _copy_layout_params_inplace(src._layout_params, qt_dest._layout_params, non_blocking=non_blocking)
+        qt_dest._layout_params["orig_dtype"] = orig_dtype
     else:
         # Copy from regular tensor - just copy raw data
         qt_dest._qdata.copy_(src)
@@ -397,17 +398,23 @@ class TensorCoreFP8Layout(QuantizedLayout):
     def quantize(cls, tensor, scale=None, dtype=torch.float8_e4m3fn, stochastic_rounding=0, inplace_ops=False):
         orig_dtype = tensor.dtype

-        if scale is None:
-            scale = torch.amax(tensor.abs()) / torch.finfo(dtype).max
+        if isinstance(scale, str) and scale == "recalculate":
+            scale = torch.amax(tensor.abs()).to(dtype=torch.float32) / torch.finfo(dtype).max
+            if tensor.dtype not in [torch.float32, torch.bfloat16]: # Prevent scale from being too small
+                tensor_info = torch.finfo(tensor.dtype)
+                scale = (1.0 / torch.clamp((1.0 / scale), min=tensor_info.min, max=tensor_info.max))

-        if not isinstance(scale, torch.Tensor):
-            scale = torch.tensor(scale)
-        scale = scale.to(device=tensor.device, dtype=torch.float32)
+        if scale is not None:
+            if not isinstance(scale, torch.Tensor):
+                scale = torch.tensor(scale)
+            scale = scale.to(device=tensor.device, dtype=torch.float32)

-        if inplace_ops:
-            tensor *= (1.0 / scale).to(tensor.dtype)
+            if inplace_ops:
+                tensor *= (1.0 / scale).to(tensor.dtype)
+            else:
+                tensor = tensor * (1.0 / scale).to(tensor.dtype)
         else:
-            tensor = tensor * (1.0 / scale).to(tensor.dtype)
+            scale = torch.ones((), device=tensor.device, dtype=torch.float32)

         if stochastic_rounding > 0:
             tensor = comfy.float.stochastic_rounding(tensor, dtype=dtype, seed=stochastic_rounding)
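For the "recalculate" path above: with `dtype=torch.float8_e4m3fn`, `torch.finfo(dtype).max` is 448, so the scale becomes `amax(|tensor|) / 448`, mapping the largest magnitude onto the top of the fp8 range; the added clamp appears intended to keep `1/scale` representable in the tensor's own dtype (fp16) so the in-place multiply cannot blow up to inf. A small worked check with illustrative values, not taken from the diff:

    import torch

    t = torch.tensor([0.5, -2.0, 1.25], dtype=torch.float32)
    scale = torch.amax(t.abs()) / torch.finfo(torch.float8_e4m3fn).max   # 2.0 / 448 ~= 0.00446
    q = (t * (1.0 / scale)).to(torch.float8_e4m3fn)                      # largest value lands near +/-448
    back = q.to(torch.float32) * scale                                   # approximate reconstruction of t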
@@ -425,7 +432,8 @@ class TensorCoreFP8Layout(QuantizedLayout):
     @staticmethod
     def dequantize(qdata, scale, orig_dtype, **kwargs):
         plain_tensor = torch.ops.aten._to_copy.default(qdata, dtype=orig_dtype)
-        return plain_tensor * scale
+        plain_tensor.mul_(scale)
+        return plain_tensor

     @classmethod
     def get_plain_tensors(cls, qtensor):

comfy/sd.py (188 changed lines)
@@ -53,6 +53,8 @@ import comfy.text_encoders.omnigen2
 import comfy.text_encoders.qwen_image
 import comfy.text_encoders.hunyuan_image
 import comfy.text_encoders.z_image
+import comfy.text_encoders.ovis
+import comfy.text_encoders.kandinsky5

 import comfy.model_patcher
 import comfy.lora
@@ -60,6 +62,8 @@ import comfy.lora_convert
 import comfy.hooks
 import comfy.t2i_adapter.adapter
 import comfy.taesd.taesd
+import comfy.taesd.taehv
+import comfy.latent_formats

 import comfy.ldm.flux.redux

@@ -95,7 +99,7 @@ def load_lora_for_models(model, clip, lora, strength_model, strength_clip):


 class CLIP:
-    def __init__(self, target=None, embedding_directory=None, no_init=False, tokenizer_data={}, parameters=0, model_options={}):
+    def __init__(self, target=None, embedding_directory=None, no_init=False, tokenizer_data={}, parameters=0, state_dict=[], model_options={}):
         if no_init:
             return
         params = target.params.copy()
@@ -123,9 +127,32 @@ class CLIP:

         self.tokenizer = tokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
         self.patcher = comfy.model_patcher.ModelPatcher(self.cond_stage_model, load_device=load_device, offload_device=offload_device)
+        #Match torch.float32 hardcode upcast in TE implemention
+        self.patcher.set_model_compute_dtype(torch.float32)
         self.patcher.hook_mode = comfy.hooks.EnumHookMode.MinVram
         self.patcher.is_clip = True
         self.apply_hooks_to_conds = None
+        if len(state_dict) > 0:
+            if isinstance(state_dict, list):
+                for c in state_dict:
+                    m, u = self.load_sd(c)
+                    if len(m) > 0:
+                        logging.warning("clip missing: {}".format(m))
+
+                    if len(u) > 0:
+                        logging.debug("clip unexpected: {}".format(u))
+            else:
+                m, u = self.load_sd(state_dict, full_model=True)
+                if len(m) > 0:
+                    m_filter = list(filter(lambda a: ".logit_scale" not in a and ".transformer.text_projection.weight" not in a, m))
+                    if len(m_filter) > 0:
+                        logging.warning("clip missing: {}".format(m))
+                    else:
+                        logging.debug("clip missing: {}".format(m))
+
+                if len(u) > 0:
+                    logging.debug("clip unexpected {}:".format(u))
+
         if params['device'] == load_device:
             model_management.load_models_gpu([self.patcher], force_full_load=True)
         self.layer_idx = None
@@ -190,6 +217,7 @@ class CLIP:
             self.cond_stage_model.set_clip_options({"projected_pooled": False})

         self.load_model()
+        self.cond_stage_model.set_clip_options({"execution_device": self.patcher.load_device})
         all_hooks.reset()
         self.patcher.patch_hooks(None)
         if show_pbar:
@@ -237,6 +265,7 @@ class CLIP:
             self.cond_stage_model.set_clip_options({"projected_pooled": False})

         self.load_model()
+        self.cond_stage_model.set_clip_options({"execution_device": self.patcher.load_device})
         o = self.cond_stage_model.encode_token_weights(tokens)
         cond, pooled = o[:2]
         if return_dict:
@@ -466,7 +495,7 @@ class VAE:
                                             decoder_config={'target': "comfy.ldm.hunyuan_video.vae_refiner.Decoder", 'params': ddconfig})

             self.memory_used_encode = lambda shape, dtype: (1400 * 9 * shape[-2] * shape[-1]) * model_management.dtype_size(dtype)
-            self.memory_used_decode = lambda shape, dtype: (2800 * 4 * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype)
+            self.memory_used_decode = lambda shape, dtype: (3600 * 4 * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype)
         elif "decoder.conv_in.conv.weight" in sd:
             ddconfig = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}
             ddconfig["conv3d"] = True
@@ -478,8 +507,10 @@ class VAE:
             self.latent_dim = 3
             self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.conv.weight"].shape[1]
             self.first_stage_model = AutoencoderKL(ddconfig=ddconfig, embed_dim=sd['post_quant_conv.weight'].shape[1])
-            self.memory_used_decode = lambda shape, dtype: (1500 * shape[2] * shape[3] * shape[4] * (4 * 8 * 8)) * model_management.dtype_size(dtype)
-            self.memory_used_encode = lambda shape, dtype: (900 * max(shape[2], 2) * shape[3] * shape[4]) * model_management.dtype_size(dtype)
+            #This is likely to significantly over-estimate with single image or low frame counts as the
+            #implementation is able to completely skip caching. Rework if used as an image only VAE
+            self.memory_used_decode = lambda shape, dtype: (2800 * min(8, ((shape[2] - 1) * 4) + 1) * shape[3] * shape[4] * (8 * 8)) * model_management.dtype_size(dtype)
+            self.memory_used_encode = lambda shape, dtype: (1400 * min(9, shape[2]) * shape[3] * shape[4]) * model_management.dtype_size(dtype)
             self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32]
         elif "decoder.unpatcher3d.wavelets" in sd:
             self.upscale_ratio = (lambda a: max(0, a * 8 - 7), 8, 8)
@@ -508,17 +539,20 @@ class VAE:
                 self.memory_used_encode = lambda shape, dtype: 3300 * shape[3] * shape[4] * model_management.dtype_size(dtype)
                 self.memory_used_decode = lambda shape, dtype: 8000 * shape[3] * shape[4] * (16 * 16) * model_management.dtype_size(dtype)
             else: # Wan 2.1 VAE
+                dim = sd["decoder.head.0.gamma"].shape[0]
                 self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 8, 8)
                 self.upscale_index_formula = (4, 8, 8)
                 self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 8, 8)
                 self.downscale_index_formula = (4, 8, 8)
                 self.latent_dim = 3
                 self.latent_channels = 16
-                ddconfig = {"dim": 96, "z_dim": self.latent_channels, "dim_mult": [1, 2, 4, 4], "num_res_blocks": 2, "attn_scales": [], "temperal_downsample": [False, True, True], "dropout": 0.0}
+                ddconfig = {"dim": dim, "z_dim": self.latent_channels, "dim_mult": [1, 2, 4, 4], "num_res_blocks": 2, "attn_scales": [], "temperal_downsample": [False, True, True], "dropout": 0.0}
                 self.first_stage_model = comfy.ldm.wan.vae.WanVAE(**ddconfig)
                 self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32]
-                self.memory_used_encode = lambda shape, dtype: 6000 * shape[3] * shape[4] * model_management.dtype_size(dtype)
-                self.memory_used_decode = lambda shape, dtype: 7000 * shape[3] * shape[4] * (8 * 8) * model_management.dtype_size(dtype)
+                self.memory_used_encode = lambda shape, dtype: (1500 if shape[2]<=4 else 6000) * shape[3] * shape[4] * model_management.dtype_size(dtype)
+                self.memory_used_decode = lambda shape, dtype: (2200 if shape[2]<=4 else 7000) * shape[3] * shape[4] * (8*8) * model_management.dtype_size(dtype)


         # Hunyuan 3d v2 2.0 & 2.1
         elif "geo_decoder.cross_attn_decoder.ln_1.bias" in sd:

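These `memory_used_*` lambdas are rough planning estimates the model manager uses to decide how much VRAM to free before running the VAE; the constants are empirical. As a worked example for the Wan 2.1 decode estimate above, assuming a latent of shape [1, 16, 21, 90, 160] (a 720x1280 video) in fp16: 7000 * 90 * 160 * 64 * 2 bytes is roughly 12.9 GB reserved, while a short clip with shape[2] <= 4 now uses the lower 2200 constant, about 4.1 GB for the same spatial size.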
@@ -584,6 +618,35 @@ class VAE:
                 self.process_input = lambda audio: audio
                 self.working_dtypes = [torch.float32]
                 self.crop_input = False
+            elif "decoder.22.bias" in sd: # taehv, taew and lighttae
+                self.latent_channels = sd["decoder.1.weight"].shape[1]
+                self.latent_dim = 3
+                self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 16, 16)
+                self.upscale_index_formula = (4, 16, 16)
+                self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 16, 16)
+                self.downscale_index_formula = (4, 16, 16)
+                if self.latent_channels == 48: # Wan 2.2
+                    self.first_stage_model = comfy.taesd.taehv.TAEHV(latent_channels=self.latent_channels, latent_format=None) # taehv doesn't need scaling
+                    self.process_input = lambda image: (_ for _ in ()).throw(NotImplementedError("This light tae doesn't support encoding currently"))
+                    self.process_output = lambda image: image
+                    self.memory_used_decode = lambda shape, dtype: (1800 * (max(1, (shape[-3] ** 0.7 * 0.1)) * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype))
+                elif self.latent_channels == 32 and sd["decoder.22.bias"].shape[0] == 12: # lighttae_hv15
+                    self.first_stage_model = comfy.taesd.taehv.TAEHV(latent_channels=self.latent_channels, latent_format=comfy.latent_formats.HunyuanVideo15)
+                    self.process_input = lambda image: (_ for _ in ()).throw(NotImplementedError("This light tae doesn't support encoding currently"))
+                    self.memory_used_decode = lambda shape, dtype: (1200 * (max(1, (shape[-3] ** 0.7 * 0.05)) * shape[-2] * shape[-1] * 32 * 32) * model_management.dtype_size(dtype))
+                else:
+                    if sd["decoder.1.weight"].dtype == torch.float16: # taehv currently only available in float16, so assume it's not lighttaew2_1 as otherwise state dicts are identical
+                        latent_format=comfy.latent_formats.HunyuanVideo
+                    else:
+                        latent_format=None # lighttaew2_1 doesn't need scaling
+                    self.first_stage_model = comfy.taesd.taehv.TAEHV(latent_channels=self.latent_channels, latent_format=latent_format)
+                    self.process_input = self.process_output = lambda image: image
+                    self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 8, 8)
+                    self.upscale_index_formula = (4, 8, 8)
+                    self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 8, 8)
+                    self.downscale_index_formula = (4, 8, 8)
+                    self.memory_used_encode = lambda shape, dtype: (700 * (max(1, (shape[-3] ** 0.66 * 0.11)) * shape[-2] * shape[-1]) * model_management.dtype_size(dtype))
+                    self.memory_used_decode = lambda shape, dtype: (50 * (max(1, (shape[-3] ** 0.65 * 0.26)) * shape[-2] * shape[-1] * 32 * 32) * model_management.dtype_size(dtype))
             else:
                 logging.warning("WARNING: No VAE weights detected, VAE not initalized.")
                 self.first_stage_model = None

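Note (illustrative only): the new elif branch keys off "decoder.22.bias" and the shape/dtype of "decoder.1.weight" to tell the taehv variants apart. A standalone restatement of just that dispatch, exercised with a fabricated state dict:

    # Condensed sketch; the real code lives inside VAE.__init__ and also sets
    # scaling ratios and memory estimators.
    import torch

    def classify_tae(sd):
        if "decoder.22.bias" not in sd:
            return "not a taehv/taew/lighttae checkpoint"
        latent_channels = sd["decoder.1.weight"].shape[1]
        if latent_channels == 48:
            return "taehv for Wan 2.2 (decode only, no latent scaling)"
        if latent_channels == 32 and sd["decoder.22.bias"].shape[0] == 12:
            return "lighttae_hv15 (HunyuanVideo 1.5 latent format, decode only)"
        if sd["decoder.1.weight"].dtype == torch.float16:
            return "taehv (HunyuanVideo latent format)"
        return "lighttaew2_1 (no latent scaling needed)"

    fake_sd = {"decoder.22.bias": torch.zeros(12), "decoder.1.weight": torch.zeros(64, 32, 3, 3)}
    print(classify_tae(fake_sd))  # -> lighttae_hv15 branch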
@@ -708,6 +771,8 @@ class VAE:
        self.throw_exception_if_invalid()
        pixel_samples = None
        do_tile = False
+       if self.latent_dim == 2 and samples_in.ndim == 5:
+           samples_in = samples_in[:, :, 0]
        try:
            memory_used = self.memory_used_decode(samples_in.shape, self.vae_dtype)
            model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload)

@@ -924,16 +989,17 @@ class CLIPType(Enum):
    QWEN_IMAGE = 18
    HUNYUAN_IMAGE = 19
    HUNYUAN_VIDEO_15 = 20
+   OVIS = 21
+   KANDINSKY5 = 22
+   KANDINSKY5_IMAGE = 23


 def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}):
    clip_data = []
    for p in ckpt_paths:
        sd, metadata = comfy.utils.load_torch_file(p, safe_load=True, return_metadata=True)
-       if metadata is not None:
-           quant_metadata = metadata.get("_quantization_metadata", None)
-           if quant_metadata is not None:
-               sd["_quantization_metadata"] = quant_metadata
+       if model_options.get("custom_operations", None) is None:
+           sd, metadata = comfy.utils.convert_old_quants(sd, model_prefix="", metadata=metadata)
        clip_data.append(sd)
    return load_text_encoder_state_dicts(clip_data, embedding_directory=embedding_directory, clip_type=clip_type, model_options=model_options)

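Note (illustrative only; the checkpoint paths are placeholders, and a running ComfyUI environment is assumed): with the new enum members, the same load_clip() entry point can build a Kandinsky 5 text encoder:

    import comfy.sd

    clip = comfy.sd.load_clip(
        ckpt_paths=["models/text_encoders/qwen_2.5_vl_7b.safetensors"],  # placeholder path
        embedding_directory="models/embeddings",
        clip_type=comfy.sd.CLIPType.KANDINSKY5,
    )
    tokens = clip.tokenize("a red square on a white background")
    cond, pooled = clip.encode_from_tokens(tokens, return_pooled=True)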
@@ -955,6 +1021,7 @@ class TEModel(Enum):
    MISTRAL3_24B = 14
    MISTRAL3_24B_PRUNED_FLUX2 = 15
    QWEN3_4B = 16
+   QWEN3_2B = 17


 def detect_te_model(sd):

@@ -988,9 +1055,12 @@ def detect_te_model(sd):
        if weight.shape[0] == 512:
            return TEModel.QWEN25_7B
    if "model.layers.0.post_attention_layernorm.weight" in sd:
-       if 'model.layers.0.self_attn.q_norm.weight' in sd:
-           return TEModel.QWEN3_4B
        weight = sd['model.layers.0.post_attention_layernorm.weight']
+       if 'model.layers.0.self_attn.q_norm.weight' in sd:
+           if weight.shape[0] == 2560:
+               return TEModel.QWEN3_4B
+           elif weight.shape[0] == 2048:
+               return TEModel.QWEN3_2B
        if weight.shape[0] == 5120:
            if "model.layers.39.post_attention_layernorm.weight" in sd:
                return TEModel.MISTRAL3_24B

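Note (illustrative only): the reworked detector distinguishes the Qwen3 variants by the presence of a q_norm key plus the post_attention_layernorm width. A standalone sketch with a fabricated state dict:

    import torch

    def classify_qwen3(sd):
        if "model.layers.0.post_attention_layernorm.weight" not in sd:
            return None
        if "model.layers.0.self_attn.q_norm.weight" not in sd:
            return None
        width = sd["model.layers.0.post_attention_layernorm.weight"].shape[0]
        return {2560: "QWEN3_4B", 2048: "QWEN3_2B"}.get(width)

    sd = {
        "model.layers.0.post_attention_layernorm.weight": torch.zeros(2048),
        "model.layers.0.self_attn.q_norm.weight": torch.zeros(128),
    }
    print(classify_qwen3(sd))  # -> QWEN3_2B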
@@ -1046,7 +1116,7 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
            clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=False, clip_g=True, t5=False)
            clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer
        elif clip_type == CLIPType.HIDREAM:
-           clip_target.clip = comfy.text_encoders.hidream.hidream_clip(clip_l=False, clip_g=True, t5=False, llama=False, dtype_t5=None, dtype_llama=None, t5xxl_scaled_fp8=None, llama_scaled_fp8=None)
+           clip_target.clip = comfy.text_encoders.hidream.hidream_clip(clip_l=False, clip_g=True, t5=False, llama=False, dtype_t5=None, dtype_llama=None)
            clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
        else:
            clip_target.clip = sdxl_clip.SDXLRefinerClipModel

@@ -1070,7 +1140,7 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
            tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None)
        elif clip_type == CLIPType.HIDREAM:
            clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**t5xxl_detect(clip_data),
-               clip_l=False, clip_g=False, t5=True, llama=False, dtype_llama=None, llama_scaled_fp8=None)
+               clip_l=False, clip_g=False, t5=True, llama=False, dtype_llama=None)
            clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
        else: #CLIPType.MOCHI
            clip_target.clip = comfy.text_encoders.genmo.mochi_te(**t5xxl_detect(clip_data))

@@ -1099,7 +1169,7 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
            tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None)
        elif te_model == TEModel.LLAMA3_8:
            clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**llama_detect(clip_data),
-               clip_l=False, clip_g=False, t5=False, llama=True, dtype_t5=None, t5xxl_scaled_fp8=None)
+               clip_l=False, clip_g=False, t5=False, llama=True, dtype_t5=None)
            clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
        elif te_model == TEModel.QWEN25_3B:
            clip_target.clip = comfy.text_encoders.omnigen2.te(**llama_detect(clip_data))

@@ -1118,13 +1188,16 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
        elif te_model == TEModel.QWEN3_4B:
            clip_target.clip = comfy.text_encoders.z_image.te(**llama_detect(clip_data))
            clip_target.tokenizer = comfy.text_encoders.z_image.ZImageTokenizer
+       elif te_model == TEModel.QWEN3_2B:
+           clip_target.clip = comfy.text_encoders.ovis.te(**llama_detect(clip_data))
+           clip_target.tokenizer = comfy.text_encoders.ovis.OvisTokenizer
        else:
            # clip_l
            if clip_type == CLIPType.SD3:
                clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=True, clip_g=False, t5=False)
                clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer
            elif clip_type == CLIPType.HIDREAM:
-               clip_target.clip = comfy.text_encoders.hidream.hidream_clip(clip_l=True, clip_g=False, t5=False, llama=False, dtype_t5=None, dtype_llama=None, t5xxl_scaled_fp8=None, llama_scaled_fp8=None)
+               clip_target.clip = comfy.text_encoders.hidream.hidream_clip(clip_l=True, clip_g=False, t5=False, llama=False, dtype_t5=None, dtype_llama=None)
                clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
            else:
                clip_target.clip = sd1_clip.SD1ClipModel

@@ -1167,6 +1240,12 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
        elif clip_type == CLIPType.HUNYUAN_VIDEO_15:
            clip_target.clip = comfy.text_encoders.hunyuan_image.te(**llama_detect(clip_data))
            clip_target.tokenizer = comfy.text_encoders.hunyuan_video.HunyuanVideo15Tokenizer
+       elif clip_type == CLIPType.KANDINSKY5:
+           clip_target.clip = comfy.text_encoders.kandinsky5.te(**llama_detect(clip_data))
+           clip_target.tokenizer = comfy.text_encoders.kandinsky5.Kandinsky5Tokenizer
+       elif clip_type == CLIPType.KANDINSKY5_IMAGE:
+           clip_target.clip = comfy.text_encoders.kandinsky5.te(**llama_detect(clip_data))
+           clip_target.tokenizer = comfy.text_encoders.kandinsky5.Kandinsky5TokenizerImage
        else:
            clip_target.clip = sdxl_clip.SDXLClipModel
            clip_target.tokenizer = sdxl_clip.SDXLTokenizer

@@ -1179,19 +1258,10 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip

    parameters = 0
    for c in clip_data:
-       if "_quantization_metadata" in c:
-           c.pop("_quantization_metadata")
        parameters += comfy.utils.calculate_parameters(c)
        tokenizer_data, model_options = comfy.text_encoders.long_clipl.model_options_long_clip(c, tokenizer_data, model_options)

-   clip = CLIP(clip_target, embedding_directory=embedding_directory, parameters=parameters, tokenizer_data=tokenizer_data, model_options=model_options)
-   for c in clip_data:
-       m, u = clip.load_sd(c)
-       if len(m) > 0:
-           logging.warning("clip missing: {}".format(m))
-
-       if len(u) > 0:
-           logging.debug("clip unexpected: {}".format(u))
+   clip = CLIP(clip_target, embedding_directory=embedding_directory, parameters=parameters, tokenizer_data=tokenizer_data, state_dict=clip_data, model_options=model_options)
    return clip


 def load_gligen(ckpt_path):

@@ -1250,6 +1320,10 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c
    weight_dtype = comfy.utils.weight_dtype(sd, diffusion_model_prefix)
    load_device = model_management.get_torch_device()

+   custom_operations = model_options.get("custom_operations", None)
+   if custom_operations is None:
+       sd, metadata = comfy.utils.convert_old_quants(sd, diffusion_model_prefix, metadata=metadata)
+
    model_config = model_detection.model_config_from_unet(sd, diffusion_model_prefix, metadata=metadata)
    if model_config is None:
        logging.warning("Warning, This is not a checkpoint file, trying to load it as a diffusion model only.")

@@ -1258,18 +1332,22 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c
            return None
        return (diffusion_model, None, VAE(sd={}), None) # The VAE object is there to throw an exception if it's actually used'

    unet_weight_dtype = list(model_config.supported_inference_dtypes)
-   if model_config.scaled_fp8 is not None:
+   if model_config.quant_config is not None:
        weight_dtype = None

-   model_config.custom_operations = model_options.get("custom_operations", None)
+   if custom_operations is not None:
+       model_config.custom_operations = custom_operations
+
    unet_dtype = model_options.get("dtype", model_options.get("weight_dtype", None))

    if unet_dtype is None:
        unet_dtype = model_management.unet_dtype(model_params=parameters, supported_dtypes=unet_weight_dtype, weight_dtype=weight_dtype)

-   manual_cast_dtype = model_management.unet_manual_cast(unet_dtype, load_device, model_config.supported_inference_dtypes)
+   if model_config.quant_config is not None:
+       manual_cast_dtype = model_management.unet_manual_cast(None, load_device, model_config.supported_inference_dtypes)
+   else:
+       manual_cast_dtype = model_management.unet_manual_cast(unet_dtype, load_device, model_config.supported_inference_dtypes)
    model_config.set_inference_dtype(unet_dtype, manual_cast_dtype)

    if model_config.clip_vision_prefix is not None:

@@ -1287,22 +1365,33 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c
        vae = VAE(sd=vae_sd, metadata=metadata)

    if output_clip:
+       if te_model_options.get("custom_operations", None) is None:
+           scaled_fp8_list = []
+           for k in list(sd.keys()): # Convert scaled fp8 to mixed ops
+               if k.endswith(".scaled_fp8"):
+                   scaled_fp8_list.append(k[:-len("scaled_fp8")])
+
+           if len(scaled_fp8_list) > 0:
+               out_sd = {}
+               for k in sd:
+                   skip = False
+                   for pref in scaled_fp8_list:
+                       skip = skip or k.startswith(pref)
+                   if not skip:
+                       out_sd[k] = sd[k]
+
+               for pref in scaled_fp8_list:
+                   quant_sd, qmetadata = comfy.utils.convert_old_quants(sd, pref, metadata={})
+                   for k in quant_sd:
+                       out_sd[k] = quant_sd[k]
+               sd = out_sd
+
        clip_target = model_config.clip_target(state_dict=sd)
        if clip_target is not None:
            clip_sd = model_config.process_clip_state_dict(sd)
            if len(clip_sd) > 0:
                parameters = comfy.utils.calculate_parameters(clip_sd)
-               clip = CLIP(clip_target, embedding_directory=embedding_directory, tokenizer_data=clip_sd, parameters=parameters, model_options=te_model_options)
-               m, u = clip.load_sd(clip_sd, full_model=True)
-               if len(m) > 0:
-                   m_filter = list(filter(lambda a: ".logit_scale" not in a and ".transformer.text_projection.weight" not in a, m))
-                   if len(m_filter) > 0:
-                       logging.warning("clip missing: {}".format(m))
-                   else:
-                       logging.debug("clip missing: {}".format(m))
-
-               if len(u) > 0:
-                   logging.debug("clip unexpected {}:".format(u))
+               clip = CLIP(clip_target, embedding_directory=embedding_directory, tokenizer_data=clip_sd, parameters=parameters, state_dict=clip_sd, model_options=te_model_options)
            else:
                logging.warning("no CLIP/text encoder weights in checkpoint, the text encoder model will not be loaded.")

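Note (illustrative only): the added block collects every key prefix that owns a ".scaled_fp8" marker and routes those keys through comfy.utils.convert_old_quants while passing everything else through untouched. A simplified standalone sketch of just the key partitioning (the actual conversion is omitted; the sample keys are fabricated):

    def split_scaled_fp8(sd):
        prefixes = [k[:-len("scaled_fp8")] for k in sd if k.endswith(".scaled_fp8")]
        passthrough = {k: v for k, v in sd.items() if not any(k.startswith(p) for p in prefixes)}
        to_convert = {p: {k: v for k, v in sd.items() if k.startswith(p)} for p in prefixes}
        return passthrough, to_convert

    sd = {
        "text_encoders.t5xxl.scaled_fp8": b"marker",
        "text_encoders.t5xxl.transformer.shared.weight": b"fp8 weights",
        "text_encoders.clip_l.transformer.text_model.embeddings.token_embedding.weight": b"fp16 weights",
    }
    passthrough, to_convert = split_scaled_fp8(sd)
    print(sorted(passthrough))  # only the clip_l key passes through unchanged
    print(sorted(to_convert))   # ['text_encoders.t5xxl.'] would be routed through convert_old_quants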
@@ -1349,6 +1438,9 @@ def load_diffusion_model_state_dict(sd, model_options={}, metadata=None):
    if len(temp_sd) > 0:
        sd = temp_sd

+   custom_operations = model_options.get("custom_operations", None)
+   if custom_operations is None:
+       sd, metadata = comfy.utils.convert_old_quants(sd, "", metadata=metadata)
    parameters = comfy.utils.calculate_parameters(sd)
    weight_dtype = comfy.utils.weight_dtype(sd)

@@ -1379,7 +1471,7 @@ def load_diffusion_model_state_dict(sd, model_options={}, metadata=None):

    offload_device = model_management.unet_offload_device()
    unet_weight_dtype = list(model_config.supported_inference_dtypes)
-   if model_config.scaled_fp8 is not None:
+   if model_config.quant_config is not None:
        weight_dtype = None

    if dtype is None:

@@ -1387,12 +1479,15 @@ def load_diffusion_model_state_dict(sd, model_options={}, metadata=None):
    else:
        unet_dtype = dtype

-   if model_config.layer_quant_config is not None:
+   if model_config.quant_config is not None:
        manual_cast_dtype = model_management.unet_manual_cast(None, load_device, model_config.supported_inference_dtypes)
    else:
        manual_cast_dtype = model_management.unet_manual_cast(unet_dtype, load_device, model_config.supported_inference_dtypes)
    model_config.set_inference_dtype(unet_dtype, manual_cast_dtype)
-   model_config.custom_operations = model_options.get("custom_operations", model_config.custom_operations)
+
+   if custom_operations is not None:
+       model_config.custom_operations = custom_operations
+
    if model_options.get("fp8_optimizations", False):
        model_config.optimizations["fp8"] = True

@@ -1431,6 +1526,9 @@ def save_checkpoint(output_path, model, clip=None, vae=None, clip_vision=None, m
    if vae is not None:
        vae_sd = vae.get_sd()

+   if metadata is None:
+       metadata = {}
+
    model_management.load_models_gpu(load_models, force_patch_weights=True)
    clip_vision_sd = clip_vision.get_sd() if clip_vision is not None else None
    sd = model.model.state_dict_for_saving(clip_sd, vae_sd, clip_vision_sd)

@@ -107,29 +107,17 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
                config[k] = v

        operations = model_options.get("custom_operations", None)
-       scaled_fp8 = None
-       quantization_metadata = model_options.get("quantization_metadata", None)
+       quant_config = model_options.get("quantization_metadata", None)

        if operations is None:
-           layer_quant_config = None
-           if quantization_metadata is not None:
-               layer_quant_config = json.loads(quantization_metadata).get("layers", None)
-
-           if layer_quant_config is not None:
-               operations = comfy.ops.mixed_precision_ops(layer_quant_config, dtype, full_precision_mm=True)
-               logging.info(f"Using MixedPrecisionOps for text encoder: {len(layer_quant_config)} quantized layers")
+           if quant_config is not None:
+               operations = comfy.ops.mixed_precision_ops(quant_config, dtype, full_precision_mm=True)
+               logging.info("Using MixedPrecisionOps for text encoder")
            else:
-               # Fallback to scaled_fp8_ops for backward compatibility
-               scaled_fp8 = model_options.get("scaled_fp8", None)
-               if scaled_fp8 is not None:
-                   operations = comfy.ops.scaled_fp8_ops(fp8_matrix_mult=False, override_dtype=scaled_fp8)
-               else:
-                   operations = comfy.ops.manual_cast
+               operations = comfy.ops.manual_cast

        self.operations = operations
        self.transformer = model_class(config, dtype, device, self.operations)
-       if scaled_fp8 is not None:
-           self.transformer.scaled_fp8 = torch.nn.Parameter(torch.tensor([], dtype=scaled_fp8))

        self.num_layers = self.transformer.num_layers

@@ -147,6 +135,7 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
        self.layer_norm_hidden_state = layer_norm_hidden_state
        self.return_projected_pooled = return_projected_pooled
        self.return_attention_masks = return_attention_masks
+       self.execution_device = None

        if layer == "hidden":
            assert layer_idx is not None

@@ -163,6 +152,7 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
    def set_clip_options(self, options):
        layer_idx = options.get("layer", self.layer_idx)
        self.return_projected_pooled = options.get("projected_pooled", self.return_projected_pooled)
+       self.execution_device = options.get("execution_device", self.execution_device)
        if isinstance(self.layer, list) or self.layer == "all":
            pass
        elif layer_idx is None or abs(layer_idx) > self.num_layers:

@@ -175,6 +165,7 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
        self.layer = self.options_default[0]
        self.layer_idx = self.options_default[1]
        self.return_projected_pooled = self.options_default[2]
+       self.execution_device = None

    def process_tokens(self, tokens, device):
        end_token = self.special_tokens.get("end", None)

@@ -258,7 +249,11 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
        return torch.cat(embeds_out), torch.tensor(attention_masks, device=device, dtype=torch.long), num_tokens, embeds_info

    def forward(self, tokens):
-       device = self.transformer.get_input_embeddings().weight.device
+       if self.execution_device is None:
+           device = self.transformer.get_input_embeddings().weight.device
+       else:
+           device = self.execution_device
+
        embeds, attention_mask, num_tokens, embeds_info = self.process_tokens(tokens, device)

        attention_mask_model = None

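Note (illustrative only, assuming a ComfyUI environment): the simplified operations selection above boils down to three cases. A condensed restatement as a standalone helper, not the class itself:

    import torch
    import comfy.ops

    def pick_text_encoder_ops(model_options, dtype):
        # Explicit custom_operations wins; otherwise quantization metadata
        # switches on mixed-precision ops; otherwise plain manual casting.
        operations = model_options.get("custom_operations", None)
        if operations is not None:
            return operations
        quant_config = model_options.get("quantization_metadata", None)
        if quant_config is not None:
            return comfy.ops.mixed_precision_ops(quant_config, dtype, full_precision_mm=True)
        return comfy.ops.manual_cast

    ops = pick_text_encoder_ops({}, torch.float16)  # -> comfy.ops.manual_cast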
@@ -21,12 +21,14 @@ import comfy.text_encoders.ace
 import comfy.text_encoders.omnigen2
 import comfy.text_encoders.qwen_image
 import comfy.text_encoders.hunyuan_image
+import comfy.text_encoders.kandinsky5
 import comfy.text_encoders.z_image

 from . import supported_models_base
 from . import latent_formats

 from . import diffusers_convert
+import comfy.model_management

 class SD15(supported_models_base.BASE):
    unet_config = {

@@ -540,7 +542,7 @@ class SD3(supported_models_base.BASE):
    unet_extra_config = {}
    latent_format = latent_formats.SD3

-   memory_usage_factor = 1.2
+   memory_usage_factor = 1.6

    text_encoder_key_prefix = ["text_encoders."]

@@ -964,7 +966,7 @@ class CosmosT2IPredict2(supported_models_base.BASE):

    def __init__(self, unet_config):
        super().__init__(unet_config)
-       self.memory_usage_factor = (unet_config.get("model_channels", 2048) / 2048) * 0.9
+       self.memory_usage_factor = (unet_config.get("model_channels", 2048) / 2048) * 0.95

    def get_model(self, state_dict, prefix="", device=None):
        out = model_base.CosmosPredict2(self, device=device)

@@ -1025,7 +1027,15 @@ class ZImage(Lumina2):
        "shift": 3.0,
    }

-   memory_usage_factor = 1.7
+   memory_usage_factor = 2.0
+
+   supported_inference_dtypes = [torch.bfloat16, torch.float32]
+
+   def __init__(self, unet_config):
+       super().__init__(unet_config)
+       if comfy.model_management.extended_fp16_support():
+           self.supported_inference_dtypes = self.supported_inference_dtypes.copy()
+           self.supported_inference_dtypes.insert(1, torch.float16)

    def clip_target(self, state_dict={}):
        pref = self.text_encoder_key_prefix[0]

@@ -1286,7 +1296,7 @@ class ChromaRadiance(Chroma):
    latent_format = comfy.latent_formats.ChromaRadiance

    # Pixel-space model, no spatial compression for model input.
-   memory_usage_factor = 0.038
+   memory_usage_factor = 0.044

    def get_model(self, state_dict, prefix="", device=None):
        return model_base.ChromaRadiance(self, device=device)

@@ -1329,7 +1339,7 @@ class Omnigen2(supported_models_base.BASE):
        "shift": 2.6,
    }

-   memory_usage_factor = 1.65 #TODO
+   memory_usage_factor = 1.95 #TODO

    unet_extra_config = {}
    latent_format = latent_formats.Flux

@@ -1394,7 +1404,7 @@ class HunyuanImage21(HunyuanVideo):

    latent_format = latent_formats.HunyuanImage21

-   memory_usage_factor = 7.7
+   memory_usage_factor = 8.7

    supported_inference_dtypes = [torch.bfloat16, torch.float32]

@@ -1472,7 +1482,60 @@ class HunyuanVideo15_SR_Distilled(HunyuanVideo):
        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_video.HunyuanVideo15Tokenizer, comfy.text_encoders.hunyuan_image.te(**hunyuan_detect))

+
+class Kandinsky5(supported_models_base.BASE):
+   unet_config = {
+       "image_model": "kandinsky5",
+   }
+
+   sampling_settings = {
+       "shift": 10.0,
+   }
+
+   unet_extra_config = {}
+   latent_format = latent_formats.HunyuanVideo
+
+   memory_usage_factor = 1.25 #TODO
+
+   supported_inference_dtypes = [torch.bfloat16, torch.float32]
+
+   vae_key_prefix = ["vae."]
+   text_encoder_key_prefix = ["text_encoders."]
+
+   def get_model(self, state_dict, prefix="", device=None):
+       out = model_base.Kandinsky5(self, device=device)
+       return out
+
+   def clip_target(self, state_dict={}):
+       pref = self.text_encoder_key_prefix[0]
+       hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
+       return supported_models_base.ClipTarget(comfy.text_encoders.kandinsky5.Kandinsky5Tokenizer, comfy.text_encoders.kandinsky5.te(**hunyuan_detect))
+
+
+class Kandinsky5Image(Kandinsky5):
+   unet_config = {
+       "image_model": "kandinsky5",
+       "model_dim": 2560,
+       "visual_embed_dim": 64,
+   }
+
+   sampling_settings = {
+       "shift": 3.0,
+   }
+
+   latent_format = latent_formats.Flux
+   memory_usage_factor = 1.25 #TODO
+
+   def get_model(self, state_dict, prefix="", device=None):
+       out = model_base.Kandinsky5Image(self, device=device)
+       return out
+
+   def clip_target(self, state_dict={}):
+       pref = self.text_encoder_key_prefix[0]
+       hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
+       return supported_models_base.ClipTarget(comfy.text_encoders.kandinsky5.Kandinsky5TokenizerImage, comfy.text_encoders.kandinsky5.te(**hunyuan_detect))
+
+
-models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage, Flux2]
+models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5]

 models += [SVD_img2vid]

@@ -17,6 +17,7 @@
 """

 import torch
+import logging
 from . import model_base
 from . import utils
 from . import latent_formats

@@ -49,8 +50,7 @@ class BASE:

    manual_cast_dtype = None
    custom_operations = None
-   scaled_fp8 = None
-   layer_quant_config = None # Per-layer quantization configuration for mixed precision
+   quant_config = None # quantization configuration for mixed precision
    optimizations = {"fp8": False}

    @classmethod

@@ -118,3 +118,7 @@ class BASE:
    def set_inference_dtype(self, dtype, manual_cast_dtype):
        self.unet_config['dtype'] = dtype
        self.manual_cast_dtype = manual_cast_dtype
+
+   def __getattr__(self, name):
+       logging.warning("\nWARNING, you accessed {} from the model config object which doesn't exist. Please fix your code.\n".format(name))
+       return None
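Note (standalone mock, not the real class): the new BASE.__getattr__ means code that still reads removed config fields such as scaled_fp8 gets a logged warning and None instead of an AttributeError. Python only calls __getattr__ when normal lookup fails, so defined attributes are unaffected:

    import logging

    class ConfigLike:
        quant_config = None

        def __getattr__(self, name):
            logging.warning("accessed missing model config attribute: %s", name)
            return None

    cfg = ConfigLike()
    print(cfg.quant_config)  # None, defined normally, no warning
    print(cfg.scaled_fp8)    # None, but logs a warning instead of raising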
comfy/taesd/taehv.py (new file, 171 lines)
@@ -0,0 +1,171 @@
+# Tiny AutoEncoder for HunyuanVideo and WanVideo https://github.com/madebyollin/taehv
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from tqdm.auto import tqdm
+from collections import namedtuple, deque
+
+import comfy.ops
+operations=comfy.ops.disable_weight_init
+
+DecoderResult = namedtuple("DecoderResult", ("frame", "memory"))
+TWorkItem = namedtuple("TWorkItem", ("input_tensor", "block_index"))
+
+def conv(n_in, n_out, **kwargs):
+    return operations.Conv2d(n_in, n_out, 3, padding=1, **kwargs)
+
+class Clamp(nn.Module):
+    def forward(self, x):
+        return torch.tanh(x / 3) * 3
+
+class MemBlock(nn.Module):
+    def __init__(self, n_in, n_out, act_func):
+        super().__init__()
+        self.conv = nn.Sequential(conv(n_in * 2, n_out), act_func, conv(n_out, n_out), act_func, conv(n_out, n_out))
+        self.skip = operations.Conv2d(n_in, n_out, 1, bias=False) if n_in != n_out else nn.Identity()
+        self.act = act_func
+    def forward(self, x, past):
+        return self.act(self.conv(torch.cat([x, past], 1)) + self.skip(x))
+
+class TPool(nn.Module):
+    def __init__(self, n_f, stride):
+        super().__init__()
+        self.stride = stride
+        self.conv = operations.Conv2d(n_f*stride,n_f, 1, bias=False)
+    def forward(self, x):
+        _NT, C, H, W = x.shape
+        return self.conv(x.reshape(-1, self.stride * C, H, W))
+
+class TGrow(nn.Module):
+    def __init__(self, n_f, stride):
+        super().__init__()
+        self.stride = stride
+        self.conv = operations.Conv2d(n_f, n_f*stride, 1, bias=False)
+    def forward(self, x):
+        _NT, C, H, W = x.shape
+        x = self.conv(x)
+        return x.reshape(-1, C, H, W)
+
+def apply_model_with_memblocks(model, x, parallel, show_progress_bar):
+
+    B, T, C, H, W = x.shape
+    if parallel:
+        x = x.reshape(B*T, C, H, W)
+        # parallel over input timesteps, iterate over blocks
+        for b in tqdm(model, disable=not show_progress_bar):
+            if isinstance(b, MemBlock):
+                BT, C, H, W = x.shape
+                T = BT // B
+                _x = x.reshape(B, T, C, H, W)
+                mem = F.pad(_x, (0,0,0,0,0,0,1,0), value=0)[:,:T].reshape(x.shape)
+                x = b(x, mem)
+            else:
+                x = b(x)
+        BT, C, H, W = x.shape
+        T = BT // B
+        x = x.view(B, T, C, H, W)
+    else:
+        out = []
+        work_queue = deque([TWorkItem(xt, 0) for t, xt in enumerate(x.reshape(B, T * C, H, W).chunk(T, dim=1))])
+        progress_bar = tqdm(range(T), disable=not show_progress_bar)
+        mem = [None] * len(model)
+        while work_queue:
+            xt, i = work_queue.popleft()
+            if i == 0:
+                progress_bar.update(1)
+            if i == len(model):
+                out.append(xt)
+                del xt
+            else:
+                b = model[i]
+                if isinstance(b, MemBlock):
+                    if mem[i] is None:
+                        xt_new = b(xt, xt * 0)
+                        mem[i] = xt.detach().clone()
+                    else:
+                        xt_new = b(xt, mem[i])
+                        mem[i] = xt.detach().clone()
+                    del xt
+                    work_queue.appendleft(TWorkItem(xt_new, i+1))
+                elif isinstance(b, TPool):
+                    if mem[i] is None:
+                        mem[i] = []
+                    mem[i].append(xt.detach().clone())
+                    if len(mem[i]) == b.stride:
+                        B, C, H, W = xt.shape
+                        xt = b(torch.cat(mem[i], 1).view(B*b.stride, C, H, W))
+                        mem[i] = []
+                        work_queue.appendleft(TWorkItem(xt, i+1))
+                elif isinstance(b, TGrow):
+                    xt = b(xt)
+                    NT, C, H, W = xt.shape
+                    for xt_next in reversed(xt.view(B, b.stride*C, H, W).chunk(b.stride, 1)):
+                        work_queue.appendleft(TWorkItem(xt_next, i+1))
+                    del xt
+                else:
+                    xt = b(xt)
+                    work_queue.appendleft(TWorkItem(xt, i+1))
+        progress_bar.close()
+        x = torch.stack(out, 1)
+    return x
+
+
+class TAEHV(nn.Module):
+    def __init__(self, latent_channels, parallel=False, decoder_time_upscale=(True, True), decoder_space_upscale=(True, True, True), latent_format=None, show_progress_bar=True):
+        super().__init__()
+        self.image_channels = 3
+        self.patch_size = 1
+        self.latent_channels = latent_channels
+        self.parallel = parallel
+        self.latent_format = latent_format
+        self.show_progress_bar = show_progress_bar
+        self.process_in = latent_format().process_in if latent_format is not None else (lambda x: x)
+        self.process_out = latent_format().process_out if latent_format is not None else (lambda x: x)
+        if self.latent_channels in [48, 32]: # Wan 2.2 and HunyuanVideo1.5
+            self.patch_size = 2
+        if self.latent_channels == 32: # HunyuanVideo1.5
+            act_func = nn.LeakyReLU(0.2, inplace=True)
+        else: # HunyuanVideo, Wan 2.1
+            act_func = nn.ReLU(inplace=True)
+
+        self.encoder = nn.Sequential(
+            conv(self.image_channels*self.patch_size**2, 64), act_func,
+            TPool(64, 2), conv(64, 64, stride=2, bias=False), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func),
+            TPool(64, 2), conv(64, 64, stride=2, bias=False), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func),
+            TPool(64, 1), conv(64, 64, stride=2, bias=False), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func),
+            conv(64, self.latent_channels),
+        )
+        n_f = [256, 128, 64, 64]
+        self.frames_to_trim = 2**sum(decoder_time_upscale) - 1
+        self.decoder = nn.Sequential(
+            Clamp(), conv(self.latent_channels, n_f[0]), act_func,
+            MemBlock(n_f[0], n_f[0], act_func), MemBlock(n_f[0], n_f[0], act_func), MemBlock(n_f[0], n_f[0], act_func), nn.Upsample(scale_factor=2 if decoder_space_upscale[0] else 1), TGrow(n_f[0], 1), conv(n_f[0], n_f[1], bias=False),
+            MemBlock(n_f[1], n_f[1], act_func), MemBlock(n_f[1], n_f[1], act_func), MemBlock(n_f[1], n_f[1], act_func), nn.Upsample(scale_factor=2 if decoder_space_upscale[1] else 1), TGrow(n_f[1], 2 if decoder_time_upscale[0] else 1), conv(n_f[1], n_f[2], bias=False),
+            MemBlock(n_f[2], n_f[2], act_func), MemBlock(n_f[2], n_f[2], act_func), MemBlock(n_f[2], n_f[2], act_func), nn.Upsample(scale_factor=2 if decoder_space_upscale[2] else 1), TGrow(n_f[2], 2 if decoder_time_upscale[1] else 1), conv(n_f[2], n_f[3], bias=False),
+            act_func, conv(n_f[3], self.image_channels*self.patch_size**2),
+        )
+    @property
+    def show_progress_bar(self):
+        return self._show_progress_bar
+
+    @show_progress_bar.setter
+    def show_progress_bar(self, value):
+        self._show_progress_bar = value
+
+    def encode(self, x, **kwargs):
+        if self.patch_size > 1: x = F.pixel_unshuffle(x, self.patch_size)
+        x = x.movedim(2, 1) # [B, C, T, H, W] -> [B, T, C, H, W]
+        if x.shape[1] % 4 != 0:
+            # pad at end to multiple of 4
+            n_pad = 4 - x.shape[1] % 4
+            padding = x[:, -1:].repeat_interleave(n_pad, dim=1)
+            x = torch.cat([x, padding], 1)
+        x = apply_model_with_memblocks(self.encoder, x, self.parallel, self.show_progress_bar).movedim(2, 1)
+        return self.process_out(x)
+
+    def decode(self, x, **kwargs):
+        x = self.process_in(x).movedim(2, 1) # [B, C, T, H, W] -> [B, T, C, H, W]
+        x = apply_model_with_memblocks(self.decoder, x, self.parallel, self.show_progress_bar)
+        if self.patch_size > 1: x = F.pixel_shuffle(x, self.patch_size)
+        return x[:, self.frames_to_trim:].movedim(2, 1)

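Note (illustrative only; the checkpoint path, latent shape, and output-shape comment are placeholders/assumptions): decoding with the new tiny autoencoder outside of VAE.__init__ would look roughly like this, in a ComfyUI environment:

    import torch
    import comfy.utils
    import comfy.latent_formats
    from comfy.taesd.taehv import TAEHV

    sd = comfy.utils.load_torch_file("models/vae_approx/lighttae_hv15.safetensors")  # placeholder file
    tae = TAEHV(latent_channels=32, latent_format=comfy.latent_formats.HunyuanVideo15)
    tae.load_state_dict(sd, strict=False)  # decode-only checkpoints may lack encoder keys

    latent = torch.randn(1, 32, 5, 30, 52)  # toy [B, C, T, H, W] latent
    with torch.no_grad():
        frames = tae.decode(latent)
    print(frames.shape)  # roughly [B, 3, T*4 - 3, H*16, W*16] after temporal/spatial upscaling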
@ -7,10 +7,10 @@ from transformers import T5TokenizerFast
|
|||||||
class T5XXLModel(sd1_clip.SDClipModel):
|
class T5XXLModel(sd1_clip.SDClipModel):
|
||||||
def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=True, model_options={}):
|
def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=True, model_options={}):
|
||||||
textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_old_config_xxl.json")
|
textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_old_config_xxl.json")
|
||||||
t5xxl_scaled_fp8 = model_options.get("t5xxl_scaled_fp8", None)
|
t5xxl_quantization_metadata = model_options.get("t5xxl_quantization_metadata", None)
|
||||||
if t5xxl_scaled_fp8 is not None:
|
if t5xxl_quantization_metadata is not None:
|
||||||
model_options = model_options.copy()
|
model_options = model_options.copy()
|
||||||
model_options["scaled_fp8"] = t5xxl_scaled_fp8
|
model_options["quantization_metadata"] = t5xxl_quantization_metadata
|
||||||
|
|
||||||
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=comfy.text_encoders.t5.T5, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, zero_out_masked=attention_mask, model_options=model_options)
|
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=comfy.text_encoders.t5.T5, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, zero_out_masked=attention_mask, model_options=model_options)
|
||||||
|
|
||||||
@ -30,12 +30,12 @@ class CosmosT5Tokenizer(sd1_clip.SD1Tokenizer):
|
|||||||
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="t5xxl", tokenizer=T5XXLTokenizer)
|
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="t5xxl", tokenizer=T5XXLTokenizer)
|
||||||
|
|
||||||
|
|
||||||
def te(dtype_t5=None, t5xxl_scaled_fp8=None):
|
def te(dtype_t5=None, t5_quantization_metadata=None):
|
||||||
class CosmosTEModel_(CosmosT5XXL):
|
class CosmosTEModel_(CosmosT5XXL):
|
||||||
def __init__(self, device="cpu", dtype=None, model_options={}):
|
def __init__(self, device="cpu", dtype=None, model_options={}):
|
||||||
if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options:
|
if t5_quantization_metadata is not None:
|
||||||
model_options = model_options.copy()
|
model_options = model_options.copy()
|
||||||
model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8
|
model_options["t5xxl_quantization_metadata"] = t5_quantization_metadata
|
||||||
if dtype is None:
|
if dtype is None:
|
||||||
dtype = dtype_t5
|
dtype = dtype_t5
|
||||||
super().__init__(device=device, dtype=dtype, model_options=model_options)
|
super().__init__(device=device, dtype=dtype, model_options=model_options)
|
||||||
|
|||||||
@ -63,12 +63,12 @@ class FluxClipModel(torch.nn.Module):
|
|||||||
else:
|
else:
|
||||||
return self.t5xxl.load_sd(sd)
|
return self.t5xxl.load_sd(sd)
|
||||||
|
|
||||||
def flux_clip(dtype_t5=None, t5xxl_scaled_fp8=None):
|
def flux_clip(dtype_t5=None, t5_quantization_metadata=None):
|
||||||
class FluxClipModel_(FluxClipModel):
|
class FluxClipModel_(FluxClipModel):
|
||||||
def __init__(self, device="cpu", dtype=None, model_options={}):
|
def __init__(self, device="cpu", dtype=None, model_options={}):
|
||||||
if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options:
|
if t5_quantization_metadata is not None:
|
||||||
model_options = model_options.copy()
|
model_options = model_options.copy()
|
||||||
model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8
|
model_options["t5xxl_quantization_metadata"] = t5_quantization_metadata
|
||||||
super().__init__(dtype_t5=dtype_t5, device=device, dtype=dtype, model_options=model_options)
|
super().__init__(dtype_t5=dtype_t5, device=device, dtype=dtype, model_options=model_options)
|
||||||
return FluxClipModel_
|
return FluxClipModel_
|
||||||
|
|
||||||
@ -159,15 +159,13 @@ class Flux2TEModel(sd1_clip.SD1ClipModel):
|
|||||||
out = out.reshape(out.shape[0], out.shape[1], -1)
|
out = out.reshape(out.shape[0], out.shape[1], -1)
|
||||||
return out, pooled, extra
|
return out, pooled, extra
|
||||||
|
|
||||||
def flux2_te(dtype_llama=None, llama_scaled_fp8=None, llama_quantization_metadata=None, pruned=False):
|
def flux2_te(dtype_llama=None, llama_quantization_metadata=None, pruned=False):
|
||||||
class Flux2TEModel_(Flux2TEModel):
|
class Flux2TEModel_(Flux2TEModel):
|
||||||
def __init__(self, device="cpu", dtype=None, model_options={}):
|
def __init__(self, device="cpu", dtype=None, model_options={}):
|
||||||
if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options:
|
|
||||||
model_options = model_options.copy()
|
|
||||||
model_options["scaled_fp8"] = llama_scaled_fp8
|
|
||||||
if dtype_llama is not None:
|
if dtype_llama is not None:
|
||||||
dtype = dtype_llama
|
dtype = dtype_llama
|
||||||
if llama_quantization_metadata is not None:
|
if llama_quantization_metadata is not None:
|
||||||
|
model_options = model_options.copy()
|
||||||
model_options["quantization_metadata"] = llama_quantization_metadata
|
model_options["quantization_metadata"] = llama_quantization_metadata
|
||||||
if pruned:
|
if pruned:
|
||||||
model_options = model_options.copy()
|
model_options = model_options.copy()
|
||||||
|
|||||||
@ -26,12 +26,12 @@ class MochiT5Tokenizer(sd1_clip.SD1Tokenizer):
|
|||||||
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="t5xxl", tokenizer=T5XXLTokenizer)
|
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="t5xxl", tokenizer=T5XXLTokenizer)
|
||||||
|
|
||||||
|
|
||||||
def mochi_te(dtype_t5=None, t5xxl_scaled_fp8=None):
|
def mochi_te(dtype_t5=None, t5_quantization_metadata=None):
|
||||||
class MochiTEModel_(MochiT5XXL):
|
class MochiTEModel_(MochiT5XXL):
|
||||||
def __init__(self, device="cpu", dtype=None, model_options={}):
|
def __init__(self, device="cpu", dtype=None, model_options={}):
|
||||||
if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options:
|
if t5_quantization_metadata is not None:
|
||||||
model_options = model_options.copy()
|
model_options = model_options.copy()
|
||||||
model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8
|
model_options["t5xxl_quantization_metadata"] = t5_quantization_metadata
|
||||||
if dtype is None:
|
if dtype is None:
|
||||||
dtype = dtype_t5
|
dtype = dtype_t5
|
||||||
super().__init__(device=device, dtype=dtype, model_options=model_options)
|
super().__init__(device=device, dtype=dtype, model_options=model_options)
|
||||||
|
|||||||
@ -142,14 +142,14 @@ class HiDreamTEModel(torch.nn.Module):
|
|||||||
return self.llama.load_sd(sd)
|
return self.llama.load_sd(sd)
|
||||||
|
|
||||||
|
|
||||||
def hidream_clip(clip_l=True, clip_g=True, t5=True, llama=True, dtype_t5=None, dtype_llama=None, t5xxl_scaled_fp8=None, llama_scaled_fp8=None):
|
def hidream_clip(clip_l=True, clip_g=True, t5=True, llama=True, dtype_t5=None, dtype_llama=None, t5_quantization_metadata=None, llama_quantization_metadata=None):
|
||||||
class HiDreamTEModel_(HiDreamTEModel):
|
class HiDreamTEModel_(HiDreamTEModel):
|
||||||
def __init__(self, device="cpu", dtype=None, model_options={}):
|
def __init__(self, device="cpu", dtype=None, model_options={}):
|
||||||
if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options:
|
if t5_quantization_metadata is not None:
|
||||||
model_options = model_options.copy()
|
model_options = model_options.copy()
|
||||||
model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8
|
model_options["t5xxl_quantization_metadata"] = t5_quantization_metadata
|
||||||
if llama_scaled_fp8 is not None and "llama_scaled_fp8" not in model_options:
|
if llama_quantization_metadata is not None:
|
||||||
model_options = model_options.copy()
|
model_options = model_options.copy()
|
||||||
model_options["llama_scaled_fp8"] = llama_scaled_fp8
|
model_options["llama_quantization_metadata"] = llama_quantization_metadata
|
||||||
super().__init__(clip_l=clip_l, clip_g=clip_g, t5=t5, llama=llama, dtype_t5=dtype_t5, dtype_llama=dtype_llama, device=device, dtype=dtype, model_options=model_options)
|
super().__init__(clip_l=clip_l, clip_g=clip_g, t5=t5, llama=llama, dtype_t5=dtype_t5, dtype_llama=dtype_llama, device=device, dtype=dtype, model_options=model_options)
|
||||||
return HiDreamTEModel_
|
return HiDreamTEModel_
|
||||||
|
|||||||
@@ -40,10 +40,10 @@ class HunyuanImageTokenizer(QwenImageTokenizer):

 class Qwen25_7BVLIModel(sd1_clip.SDClipModel):
     def __init__(self, device="cpu", layer="hidden", layer_idx=-3, dtype=None, attention_mask=True, model_options={}):
-        llama_scaled_fp8 = model_options.get("qwen_scaled_fp8", None)
+        llama_quantization_metadata = model_options.get("llama_quantization_metadata", None)
-        if llama_scaled_fp8 is not None:
+        if llama_quantization_metadata is not None:
             model_options = model_options.copy()
-            model_options["scaled_fp8"] = llama_scaled_fp8
+            model_options["quantization_metadata"] = llama_quantization_metadata
         super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Qwen25_7BVLI, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)


@@ -91,12 +91,12 @@ class HunyuanImageTEModel(QwenImageTEModel):
         else:
             return super().load_sd(sd)

-def te(byt5=True, dtype_llama=None, llama_scaled_fp8=None):
+def te(byt5=True, dtype_llama=None, llama_quantization_metadata=None):
     class QwenImageTEModel_(HunyuanImageTEModel):
         def __init__(self, device="cpu", dtype=None, model_options={}):
-            if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options:
+            if llama_quantization_metadata is not None:
                 model_options = model_options.copy()
-                model_options["qwen_scaled_fp8"] = llama_scaled_fp8
+                model_options["llama_quantization_metadata"] = llama_quantization_metadata
             if dtype_llama is not None:
                 dtype = dtype_llama
             super().__init__(byt5=byt5, device=device, dtype=dtype, model_options=model_options)
@@ -6,7 +6,7 @@ from transformers import LlamaTokenizerFast
 import torch
 import os
 import numbers
+import comfy.utils

 def llama_detect(state_dict, prefix=""):
     out = {}
@@ -14,12 +14,9 @@ def llama_detect(state_dict, prefix=""):
     if t5_key in state_dict:
         out["dtype_llama"] = state_dict[t5_key].dtype

-    scaled_fp8_key = "{}scaled_fp8".format(prefix)
+    quant = comfy.utils.detect_layer_quantization(state_dict, prefix)
-    if scaled_fp8_key in state_dict:
+    if quant is not None:
-        out["llama_scaled_fp8"] = state_dict[scaled_fp8_key].dtype
+        out["llama_quantization_metadata"] = quant

-    if "_quantization_metadata" in state_dict:
-        out["llama_quantization_metadata"] = state_dict["_quantization_metadata"]
-
     return out

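For orientation (not part of the diff): llama_detect() is the helper whose return value gets splatted into the clip factory further below, and after this change it reports "llama_quantization_metadata" instead of "llama_scaled_fp8". A minimal sketch of that flow, with the checkpoint path and prefix chosen purely for illustration:

import comfy.utils
from comfy.text_encoders import hunyuan_video

# hypothetical text-encoder checkpoint; "model." is just an example key prefix
state_dict = comfy.utils.load_torch_file("llama_te.safetensors", safe_load=True)
detected = hunyuan_video.llama_detect(state_dict, prefix="model.")
# detected may contain dtype_llama and llama_quantization_metadata
clip_class = hunyuan_video.hunyuan_video_clip(**detected)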
@@ -31,10 +28,10 @@ class LLAMA3Tokenizer(sd1_clip.SDTokenizer):

 class LLAMAModel(sd1_clip.SDClipModel):
     def __init__(self, device="cpu", layer="hidden", layer_idx=-3, dtype=None, attention_mask=True, model_options={}, special_tokens={"start": 128000, "pad": 128258}):
-        llama_scaled_fp8 = model_options.get("llama_scaled_fp8", None)
+        llama_quantization_metadata = model_options.get("llama_quantization_metadata", None)
-        if llama_scaled_fp8 is not None:
+        if llama_quantization_metadata is not None:
             model_options = model_options.copy()
-            model_options["scaled_fp8"] = llama_scaled_fp8
+            model_options["quantization_metadata"] = llama_quantization_metadata

         textmodel_json_config = {}
         vocab_size = model_options.get("vocab_size", None)
@@ -161,11 +158,11 @@ class HunyuanVideoClipModel(torch.nn.Module):
         return self.llama.load_sd(sd)


-def hunyuan_video_clip(dtype_llama=None, llama_scaled_fp8=None):
+def hunyuan_video_clip(dtype_llama=None, llama_quantization_metadata=None):
     class HunyuanVideoClipModel_(HunyuanVideoClipModel):
         def __init__(self, device="cpu", dtype=None, model_options={}):
-            if llama_scaled_fp8 is not None and "llama_scaled_fp8" not in model_options:
+            if llama_quantization_metadata is not None:
                 model_options = model_options.copy()
-                model_options["llama_scaled_fp8"] = llama_scaled_fp8
+                model_options["llama_quantization_metadata"] = llama_quantization_metadata
             super().__init__(dtype_llama=dtype_llama, device=device, dtype=dtype, model_options=model_options)
     return HunyuanVideoClipModel_
comfy/text_encoders/kandinsky5.py (new file, 68 lines)
@@ -0,0 +1,68 @@
+from comfy import sd1_clip
+from .qwen_image import QwenImageTokenizer, QwenImageTEModel
+from .llama import Qwen25_7BVLI
+
+
+class Kandinsky5Tokenizer(QwenImageTokenizer):
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
+        self.llama_template = "<|im_start|>system\nYou are a prompt engineer. Describe the video in detail.\nDescribe how the camera moves or shakes, describe the zoom and view angle, whether it follows the objects.\nDescribe the location of the video, main characters or objects and their action.\nDescribe the dynamism of the video and presented actions.\nName the visual style of the video: whether it is a professional footage, user generated content, some kind of animation, video game or screen content.\nDescribe the visual effects, postprocessing and transitions if they are presented in the video.\nPay attention to the order of key actions shown in the scene.<|im_end|>\n<|im_start|>user\n{}<|im_end|>"
+        self.clip_l = sd1_clip.SDTokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
+
+    def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs):
+        out = super().tokenize_with_weights(text, return_word_ids, **kwargs)
+        out["l"] = self.clip_l.tokenize_with_weights(text, return_word_ids, **kwargs)
+
+        return out
+
+
+class Kandinsky5TokenizerImage(Kandinsky5Tokenizer):
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
+        self.llama_template = "<|im_start|>system\nYou are a promt engineer. Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>"
+
+
+class Qwen25_7BVLIModel(sd1_clip.SDClipModel):
+    def __init__(self, device="cpu", layer="hidden", layer_idx=-1, dtype=None, attention_mask=True, model_options={}):
+        llama_quantization_metadata = model_options.get("llama_quantization_metadata", None)
+        if llama_quantization_metadata is not None:
+            model_options = model_options.copy()
+            model_options["quantization_metadata"] = llama_quantization_metadata
+        super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False, model_class=Qwen25_7BVLI, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
+
+
+class Kandinsky5TEModel(QwenImageTEModel):
+    def __init__(self, device="cpu", dtype=None, model_options={}):
+        super(QwenImageTEModel, self).__init__(device=device, dtype=dtype, name="qwen25_7b", clip_model=Qwen25_7BVLIModel, model_options=model_options)
+        self.clip_l = sd1_clip.SDClipModel(device=device, dtype=dtype, return_projected_pooled=False, model_options=model_options)
+
+    def encode_token_weights(self, token_weight_pairs):
+        cond, p, extra = super().encode_token_weights(token_weight_pairs, template_end=-1)
+        l_out, l_pooled = self.clip_l.encode_token_weights(token_weight_pairs["l"])
+
+        return cond, l_pooled, extra
+
+    def set_clip_options(self, options):
+        super().set_clip_options(options)
+        self.clip_l.set_clip_options(options)
+
+    def reset_clip_options(self):
+        super().reset_clip_options()
+        self.clip_l.reset_clip_options()
+
+    def load_sd(self, sd):
+        if "text_model.encoder.layers.1.mlp.fc1.weight" in sd:
+            return self.clip_l.load_sd(sd)
+        else:
+            return super().load_sd(sd)
+
+def te(dtype_llama=None, llama_quantization_metadata=None):
+    class Kandinsky5TEModel_(Kandinsky5TEModel):
+        def __init__(self, device="cpu", dtype=None, model_options={}):
+            if llama_quantization_metadata is not None:
+                model_options = model_options.copy()
+                model_options["llama_quantization_metadata"] = llama_quantization_metadata
+            if dtype_llama is not None:
+                dtype = dtype_llama
+            super().__init__(device=device, dtype=dtype, model_options=model_options)
+    return Kandinsky5TEModel_
@@ -100,6 +100,28 @@ class Qwen3_4BConfig:
     rope_scale = None
     final_norm: bool = True

+@dataclass
+class Ovis25_2BConfig:
+    vocab_size: int = 151936
+    hidden_size: int = 2048
+    intermediate_size: int = 6144
+    num_hidden_layers: int = 28
+    num_attention_heads: int = 16
+    num_key_value_heads: int = 8
+    max_position_embeddings: int = 40960
+    rms_norm_eps: float = 1e-6
+    rope_theta: float = 1000000.0
+    transformer_type: str = "llama"
+    head_dim = 128
+    rms_norm_add = False
+    mlp_activation = "silu"
+    qkv_bias = False
+    rope_dims = None
+    q_norm = "gemma3"
+    k_norm = "gemma3"
+    rope_scale = None
+    final_norm: bool = True
+
 @dataclass
 class Qwen25_7BVLI_Config:
     vocab_size: int = 152064
@@ -542,6 +564,15 @@ class Qwen3_4B(BaseLlama, torch.nn.Module):
         self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
         self.dtype = dtype

+class Ovis25_2B(BaseLlama, torch.nn.Module):
+    def __init__(self, config_dict, dtype, device, operations):
+        super().__init__()
+        config = Ovis25_2BConfig(**config_dict)
+        self.num_layers = config.num_hidden_layers
+
+        self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
+        self.dtype = dtype
+
 class Qwen25_7BVLI(BaseLlama, torch.nn.Module):
     def __init__(self, config_dict, dtype, device, operations):
         super().__init__()
@@ -40,7 +40,7 @@ class LuminaModel(sd1_clip.SD1ClipModel):
         super().__init__(device=device, dtype=dtype, name=name, clip_model=clip_model, model_options=model_options)


-def te(dtype_llama=None, llama_scaled_fp8=None, model_type="gemma2_2b"):
+def te(dtype_llama=None, llama_quantization_metadata=None, model_type="gemma2_2b"):
     if model_type == "gemma2_2b":
         model = Gemma2_2BModel
     elif model_type == "gemma3_4b":
@@ -48,9 +48,9 @@ def te(dtype_llama=None, llama_scaled_fp8=None, model_type="gemma2_2b"):

     class LuminaTEModel_(LuminaModel):
         def __init__(self, device="cpu", dtype=None, model_options={}):
-            if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options:
+            if llama_quantization_metadata is not None:
                 model_options = model_options.copy()
-                model_options["scaled_fp8"] = llama_scaled_fp8
+                model_options["quantization_metadata"] = llama_quantization_metadata
             if dtype_llama is not None:
                 dtype = dtype_llama
             super().__init__(device=device, dtype=dtype, name=model_type, model_options=model_options, clip_model=model)
@@ -32,12 +32,12 @@ class Omnigen2Model(sd1_clip.SD1ClipModel):
         super().__init__(device=device, dtype=dtype, name="qwen25_3b", clip_model=Qwen25_3BModel, model_options=model_options)


-def te(dtype_llama=None, llama_scaled_fp8=None):
+def te(dtype_llama=None, llama_quantization_metadata=None):
     class Omnigen2TEModel_(Omnigen2Model):
         def __init__(self, device="cpu", dtype=None, model_options={}):
-            if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options:
+            if llama_quantization_metadata is not None:
                 model_options = model_options.copy()
-                model_options["scaled_fp8"] = llama_scaled_fp8
+                model_options["quantization_metadata"] = llama_quantization_metadata
             if dtype_llama is not None:
                 dtype = dtype_llama
             super().__init__(device=device, dtype=dtype, model_options=model_options)
comfy/text_encoders/ovis.py (new file, 66 lines)
@@ -0,0 +1,66 @@
+from transformers import Qwen2Tokenizer
+import comfy.text_encoders.llama
+from comfy import sd1_clip
+import os
+import torch
+import numbers
+
+class Qwen3Tokenizer(sd1_clip.SDTokenizer):
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "qwen25_tokenizer")
+        super().__init__(tokenizer_path, pad_with_end=False, embedding_size=2048, embedding_key='qwen3_2b', tokenizer_class=Qwen2Tokenizer, has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=284, pad_token=151643, tokenizer_data=tokenizer_data)
+
+
+class OvisTokenizer(sd1_clip.SD1Tokenizer):
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="qwen3_2b", tokenizer=Qwen3Tokenizer)
+        self.llama_template = "<|im_start|>user\nDescribe the image by detailing the color, quantity, text, shape, size, texture, spatial relationships of the objects and background: {}<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
+
+    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, **kwargs):
+        if llama_template is None:
+            llama_text = self.llama_template.format(text)
+        else:
+            llama_text = llama_template.format(text)
+
+        tokens = super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, disable_weights=True, **kwargs)
+        return tokens
+
+class Ovis25_2BModel(sd1_clip.SDClipModel):
+    def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=True, model_options={}):
+        super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Ovis25_2B, enable_attention_masks=attention_mask, return_attention_masks=False, zero_out_masked=True, model_options=model_options)
+
+
+class OvisTEModel(sd1_clip.SD1ClipModel):
+    def __init__(self, device="cpu", dtype=None, model_options={}):
+        super().__init__(device=device, dtype=dtype, name="qwen3_2b", clip_model=Ovis25_2BModel, model_options=model_options)
+
+    def encode_token_weights(self, token_weight_pairs, template_end=-1):
+        out, pooled = super().encode_token_weights(token_weight_pairs)
+        tok_pairs = token_weight_pairs["qwen3_2b"][0]
+        count_im_start = 0
+        if template_end == -1:
+            for i, v in enumerate(tok_pairs):
+                elem = v[0]
+                if not torch.is_tensor(elem):
+                    if isinstance(elem, numbers.Integral):
+                        if elem == 4004 and count_im_start < 1:
+                            template_end = i
+                            count_im_start += 1
+
+        if out.shape[1] > (template_end + 1):
+            if tok_pairs[template_end + 1][0] == 25:
+                template_end += 1
+
+        out = out[:, template_end:]
+        return out, pooled, {}
+
+
+def te(dtype_llama=None, llama_quantization_metadata=None):
+    class OvisTEModel_(OvisTEModel):
+        def __init__(self, device="cpu", dtype=None, model_options={}):
+            if dtype_llama is not None:
+                dtype = dtype_llama
+            if llama_quantization_metadata is not None:
+                model_options["quantization_metadata"] = llama_quantization_metadata
+            super().__init__(device=device, dtype=dtype, model_options=model_options)
+    return OvisTEModel_
@@ -30,12 +30,12 @@ class PixArtTokenizer(sd1_clip.SD1Tokenizer):
     def __init__(self, embedding_directory=None, tokenizer_data={}):
         super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="t5xxl", tokenizer=T5XXLTokenizer)

-def pixart_te(dtype_t5=None, t5xxl_scaled_fp8=None):
+def pixart_te(dtype_t5=None, t5_quantization_metadata=None):
     class PixArtTEModel_(PixArtT5XXL):
         def __init__(self, device="cpu", dtype=None, model_options={}):
-            if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options:
+            if t5_quantization_metadata is not None:
                 model_options = model_options.copy()
-                model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8
+                model_options["t5xxl_quantization_metadata"] = t5_quantization_metadata
             if dtype is None:
                 dtype = dtype_t5
             super().__init__(device=device, dtype=dtype, model_options=model_options)
@@ -179,36 +179,36 @@
      "special": false
    },
    "151665": {
-     "content": "<|img|>",
+     "content": "<tool_response>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
-     "special": true
+     "special": false
    },
    "151666": {
-     "content": "<|endofimg|>",
+     "content": "</tool_response>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
-     "special": true
+     "special": false
    },
    "151667": {
-     "content": "<|meta|>",
+     "content": "<think>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
-     "special": true
+     "special": false
    },
    "151668": {
-     "content": "<|endofmeta|>",
+     "content": "</think>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
-     "special": true
+     "special": false
    }
  },
  "additional_special_tokens": [
@@ -85,12 +85,12 @@ class QwenImageTEModel(sd1_clip.SD1ClipModel):
         return out, pooled, extra


-def te(dtype_llama=None, llama_scaled_fp8=None):
+def te(dtype_llama=None, llama_quantization_metadata=None):
     class QwenImageTEModel_(QwenImageTEModel):
         def __init__(self, device="cpu", dtype=None, model_options={}):
-            if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options:
+            if llama_quantization_metadata is not None:
                 model_options = model_options.copy()
-                model_options["scaled_fp8"] = llama_scaled_fp8
+                model_options["quantization_metadata"] = llama_quantization_metadata
             if dtype_llama is not None:
                 dtype = dtype_llama
             super().__init__(device=device, dtype=dtype, model_options=model_options)
@@ -6,14 +6,15 @@ import torch
 import os
 import comfy.model_management
 import logging
+import comfy.utils

 class T5XXLModel(sd1_clip.SDClipModel):
     def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=False, model_options={}):
         textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_config_xxl.json")
-        t5xxl_scaled_fp8 = model_options.get("t5xxl_scaled_fp8", None)
+        t5xxl_quantization_metadata = model_options.get("t5xxl_quantization_metadata", None)
-        if t5xxl_scaled_fp8 is not None:
+        if t5xxl_quantization_metadata is not None:
             model_options = model_options.copy()
-            model_options["scaled_fp8"] = t5xxl_scaled_fp8
+            model_options["quantization_metadata"] = t5xxl_quantization_metadata

         model_options = {**model_options, "model_name": "t5xxl"}
         super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=comfy.text_encoders.t5.T5, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
@@ -25,9 +26,9 @@ def t5_xxl_detect(state_dict, prefix=""):
     if t5_key in state_dict:
         out["dtype_t5"] = state_dict[t5_key].dtype

-    scaled_fp8_key = "{}scaled_fp8".format(prefix)
+    quant = comfy.utils.detect_layer_quantization(state_dict, prefix)
-    if scaled_fp8_key in state_dict:
+    if quant is not None:
-        out["t5xxl_scaled_fp8"] = state_dict[scaled_fp8_key].dtype
+        out["t5_quantization_metadata"] = quant

     return out

@@ -156,11 +157,11 @@ class SD3ClipModel(torch.nn.Module):
         else:
             return self.t5xxl.load_sd(sd)

-def sd3_clip(clip_l=True, clip_g=True, t5=True, dtype_t5=None, t5xxl_scaled_fp8=None, t5_attention_mask=False):
+def sd3_clip(clip_l=True, clip_g=True, t5=True, dtype_t5=None, t5_quantization_metadata=None, t5_attention_mask=False):
     class SD3ClipModel_(SD3ClipModel):
         def __init__(self, device="cpu", dtype=None, model_options={}):
-            if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options:
+            if t5_quantization_metadata is not None:
                 model_options = model_options.copy()
-                model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8
+                model_options["t5xxl_quantization_metadata"] = t5_quantization_metadata
             super().__init__(clip_l=clip_l, clip_g=clip_g, t5=t5, dtype_t5=dtype_t5, t5_attention_mask=t5_attention_mask, device=device, dtype=dtype, model_options=model_options)
     return SD3ClipModel_
@@ -25,12 +25,12 @@ class WanT5Model(sd1_clip.SD1ClipModel):
     def __init__(self, device="cpu", dtype=None, model_options={}, **kwargs):
         super().__init__(device=device, dtype=dtype, model_options=model_options, name="umt5xxl", clip_model=UMT5XXlModel, **kwargs)

-def te(dtype_t5=None, t5xxl_scaled_fp8=None):
+def te(dtype_t5=None, t5_quantization_metadata=None):
     class WanTEModel(WanT5Model):
         def __init__(self, device="cpu", dtype=None, model_options={}):
-            if t5xxl_scaled_fp8 is not None and "scaled_fp8" not in model_options:
+            if t5_quantization_metadata is not None:
                 model_options = model_options.copy()
-                model_options["scaled_fp8"] = t5xxl_scaled_fp8
+                model_options["quantization_metadata"] = t5_quantization_metadata
             if dtype_t5 is not None:
                 dtype = dtype_t5
             super().__init__(device=device, dtype=dtype, model_options=model_options)
@@ -34,12 +34,9 @@ class ZImageTEModel(sd1_clip.SD1ClipModel):
         super().__init__(device=device, dtype=dtype, name="qwen3_4b", clip_model=Qwen3_4BModel, model_options=model_options)


-def te(dtype_llama=None, llama_scaled_fp8=None, llama_quantization_metadata=None):
+def te(dtype_llama=None, llama_quantization_metadata=None):
     class ZImageTEModel_(ZImageTEModel):
         def __init__(self, device="cpu", dtype=None, model_options={}):
-            if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options:
-                model_options = model_options.copy()
-                model_options["scaled_fp8"] = llama_scaled_fp8
             if dtype_llama is not None:
                 dtype = dtype_llama
             if llama_quantization_metadata is not None:
comfy/utils.py (143 changed lines)
@@ -29,6 +29,7 @@ import itertools
 from torch.nn.functional import interpolate
 from einops import rearrange
 from comfy.cli_args import args
+import json

 MMAP_TORCH_FILES = args.mmap_torch_files
 DISABLE_MMAP = args.disable_mmap
@@ -52,7 +53,7 @@ if hasattr(torch.serialization, "add_safe_globals"): # TODO: this was added in
     ALWAYS_SAFE_LOAD = True
     logging.info("Checkpoint files will always be loaded safely.")
 else:
-    logging.info("Warning, you are using an old pytorch version and some ckpt/pt files might be loaded unsafely. Upgrading to 2.4 or above is recommended.")
+    logging.warning("Warning, you are using an old pytorch version and some ckpt/pt files might be loaded unsafely. Upgrading to 2.4 or above is recommended as older versions of pytorch are no longer supported.")

 def load_torch_file(ckpt, safe_load=False, device=None, return_metadata=False):
     if device is None:
@@ -675,6 +676,72 @@ def flux_to_diffusers(mmdit_config, output_prefix=""):

     return key_map

+def z_image_to_diffusers(mmdit_config, output_prefix=""):
+    n_layers = mmdit_config.get("n_layers", 0)
+    hidden_size = mmdit_config.get("dim", 0)
+    n_context_refiner = mmdit_config.get("n_refiner_layers", 2)
+    n_noise_refiner = mmdit_config.get("n_refiner_layers", 2)
+    key_map = {}
+
+    def add_block_keys(prefix_from, prefix_to, has_adaln=True):
+        for end in ("weight", "bias"):
+            k = "{}.attention.".format(prefix_from)
+            qkv = "{}.attention.qkv.{}".format(prefix_to, end)
+            key_map["{}to_q.{}".format(k, end)] = (qkv, (0, 0, hidden_size))
+            key_map["{}to_k.{}".format(k, end)] = (qkv, (0, hidden_size, hidden_size))
+            key_map["{}to_v.{}".format(k, end)] = (qkv, (0, hidden_size * 2, hidden_size))
+
+        block_map = {
+            "attention.norm_q.weight": "attention.q_norm.weight",
+            "attention.norm_k.weight": "attention.k_norm.weight",
+            "attention.to_out.0.weight": "attention.out.weight",
+            "attention.to_out.0.bias": "attention.out.bias",
+            "attention_norm1.weight": "attention_norm1.weight",
+            "attention_norm2.weight": "attention_norm2.weight",
+            "feed_forward.w1.weight": "feed_forward.w1.weight",
+            "feed_forward.w2.weight": "feed_forward.w2.weight",
+            "feed_forward.w3.weight": "feed_forward.w3.weight",
+            "ffn_norm1.weight": "ffn_norm1.weight",
+            "ffn_norm2.weight": "ffn_norm2.weight",
+        }
+        if has_adaln:
+            block_map["adaLN_modulation.0.weight"] = "adaLN_modulation.0.weight"
+            block_map["adaLN_modulation.0.bias"] = "adaLN_modulation.0.bias"
+        for k, v in block_map.items():
+            key_map["{}.{}".format(prefix_from, k)] = "{}.{}".format(prefix_to, v)
+
+    for i in range(n_layers):
+        add_block_keys("layers.{}".format(i), "{}layers.{}".format(output_prefix, i))
+
+    for i in range(n_context_refiner):
+        add_block_keys("context_refiner.{}".format(i), "{}context_refiner.{}".format(output_prefix, i))
+
+    for i in range(n_noise_refiner):
+        add_block_keys("noise_refiner.{}".format(i), "{}noise_refiner.{}".format(output_prefix, i))
+
+    MAP_BASIC = [
+        ("final_layer.linear.weight", "all_final_layer.2-1.linear.weight"),
+        ("final_layer.linear.bias", "all_final_layer.2-1.linear.bias"),
+        ("final_layer.adaLN_modulation.1.weight", "all_final_layer.2-1.adaLN_modulation.1.weight"),
+        ("final_layer.adaLN_modulation.1.bias", "all_final_layer.2-1.adaLN_modulation.1.bias"),
+        ("x_embedder.weight", "all_x_embedder.2-1.weight"),
+        ("x_embedder.bias", "all_x_embedder.2-1.bias"),
+        ("x_pad_token", "x_pad_token"),
+        ("cap_embedder.0.weight", "cap_embedder.0.weight"),
+        ("cap_embedder.1.weight", "cap_embedder.1.weight"),
+        ("cap_embedder.1.bias", "cap_embedder.1.bias"),
+        ("cap_pad_token", "cap_pad_token"),
+        ("t_embedder.mlp.0.weight", "t_embedder.mlp.0.weight"),
+        ("t_embedder.mlp.0.bias", "t_embedder.mlp.0.bias"),
+        ("t_embedder.mlp.2.weight", "t_embedder.mlp.2.weight"),
+        ("t_embedder.mlp.2.bias", "t_embedder.mlp.2.bias"),
+    ]
+
+    for c, diffusers in MAP_BASIC:
+        key_map[diffusers] = "{}{}".format(output_prefix, c)
+
+    return key_map
+
 def repeat_to_batch_size(tensor, batch_size, dim=0):
     if tensor.shape[dim] > batch_size:
         return tensor.narrow(dim, 0, batch_size)
@@ -736,12 +803,17 @@ def safetensors_header(safetensors_path, max_size=100*1024*1024):
             return None
         return f.read(length_of_header)

+ATTR_UNSET={}
+
 def set_attr(obj, attr, value):
     attrs = attr.split(".")
     for name in attrs[:-1]:
         obj = getattr(obj, name)
-    prev = getattr(obj, attrs[-1])
+    prev = getattr(obj, attrs[-1], ATTR_UNSET)
-    setattr(obj, attrs[-1], value)
+    if value is ATTR_UNSET:
+        delattr(obj, attrs[-1])
+    else:
+        setattr(obj, attrs[-1], value)
     return prev

 def set_attr_param(obj, attr, value):
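For orientation (not part of the diff): the ATTR_UNSET sentinel lets set_attr() also delete an attribute while still returning the previous value. A small sketch using only the helpers added above:

import comfy.utils

class Holder:
    pass

obj = Holder()
comfy.utils.set_attr(obj, "note", "hello")                        # sets obj.note; returns ATTR_UNSET because nothing was set before
prev = comfy.utils.set_attr(obj, "note", comfy.utils.ATTR_UNSET)  # deletes obj.note; returns "hello"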
@@ -1128,3 +1200,68 @@ def unpack_latents(combined_latent, latent_shapes):
     else:
         output_tensors = combined_latent
     return output_tensors
+
+def detect_layer_quantization(state_dict, prefix):
+    for k in state_dict:
+        if k.startswith(prefix) and k.endswith(".comfy_quant"):
+            logging.info("Found quantization metadata version 1")
+            return {"mixed_ops": True}
+    return None
+
+def convert_old_quants(state_dict, model_prefix="", metadata={}):
+    if metadata is None:
+        metadata = {}
+
+    quant_metadata = None
+    if "_quantization_metadata" not in metadata:
+        scaled_fp8_key = "{}scaled_fp8".format(model_prefix)
+
+        if scaled_fp8_key in state_dict:
+            scaled_fp8_weight = state_dict[scaled_fp8_key]
+            scaled_fp8_dtype = scaled_fp8_weight.dtype
+            if scaled_fp8_dtype == torch.float32:
+                scaled_fp8_dtype = torch.float8_e4m3fn
+
+            if scaled_fp8_weight.nelement() == 2:
+                full_precision_matrix_mult = True
+            else:
+                full_precision_matrix_mult = False
+
+            out_sd = {}
+            layers = {}
+            for k in list(state_dict.keys()):
+                if not k.startswith(model_prefix):
+                    out_sd[k] = state_dict[k]
+                    continue
+                k_out = k
+                w = state_dict.pop(k)
+                layer = None
+                if k_out.endswith(".scale_weight"):
+                    layer = k_out[:-len(".scale_weight")]
+                    k_out = "{}.weight_scale".format(layer)
+
+                if layer is not None:
+                    layer_conf = {"format": "float8_e4m3fn"} # TODO: check if anyone did some non e4m3fn scaled checkpoints
+                    if full_precision_matrix_mult:
+                        layer_conf["full_precision_matrix_mult"] = full_precision_matrix_mult
+                    layers[layer] = layer_conf
+
+                if k_out.endswith(".scale_input"):
+                    layer = k_out[:-len(".scale_input")]
+                    k_out = "{}.input_scale".format(layer)
+                    if w.item() == 1.0:
+                        continue
+
+                out_sd[k_out] = w
+
+            state_dict = out_sd
+            quant_metadata = {"layers": layers}
+    else:
+        quant_metadata = json.loads(metadata["_quantization_metadata"])
+
+    if quant_metadata is not None:
+        layers = quant_metadata["layers"]
+        for k, v in layers.items():
+            state_dict["{}.comfy_quant".format(k)] = torch.tensor(list(json.dumps(v).encode('utf-8')), dtype=torch.uint8)
+
+    return state_dict, metadata
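For orientation (not part of the diff): convert_old_quants() rewrites legacy scaled-fp8 checkpoints (a "scaled_fp8" marker plus ".scale_weight"/".scale_input" tensors) into per-layer ".comfy_quant" entries, which detect_layer_quantization() then reports as {"mixed_ops": True}. A minimal sketch, with the file name as a placeholder:

import comfy.utils

sd, metadata = comfy.utils.load_torch_file("legacy_scaled_fp8_te.safetensors", safe_load=True, return_metadata=True)
sd, metadata = comfy.utils.convert_old_quants(sd, model_prefix="", metadata=metadata)
print(comfy.utils.detect_layer_quantization(sd, ""))  # {'mixed_ops': True} if any layer metadata was produced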
@@ -5,19 +5,20 @@ This module handles capability negotiation between frontend and backend,
 allowing graceful protocol evolution while maintaining backward compatibility.
 """

-from typing import Any, Dict
+from typing import Any

 from comfy.cli_args import args

 # Default server capabilities
-SERVER_FEATURE_FLAGS: Dict[str, Any] = {
+SERVER_FEATURE_FLAGS: dict[str, Any] = {
     "supports_preview_metadata": True,
     "max_upload_size": args.max_upload_size * 1024 * 1024,  # Convert MB to bytes
+    "extension": {"manager": {"supports_v4": True}},
 }


 def get_connection_feature(
-    sockets_metadata: Dict[str, Dict[str, Any]],
+    sockets_metadata: dict[str, dict[str, Any]],
     sid: str,
     feature_name: str,
     default: Any = False
@@ -41,7 +42,7 @@ def get_connection_feature(


 def supports_feature(
-    sockets_metadata: Dict[str, Dict[str, Any]],
+    sockets_metadata: dict[str, dict[str, Any]],
     sid: str,
     feature_name: str
 ) -> bool:
@@ -59,7 +60,7 @@ def supports_feature(
     return get_connection_feature(sockets_metadata, sid, feature_name, False) is True


-def get_server_features() -> Dict[str, Any]:
+def get_server_features() -> dict[str, Any]:
     """
     Get the server's feature flags.

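For orientation (not part of the diff): the new nested "extension" flag is advertised alongside the existing server capabilities, so a frontend can probe it from the negotiated feature set. A small sketch, assuming the module path comfy_api/feature_flags.py used by the server:

from comfy_api import feature_flags

flags = feature_flags.get_server_features()
manager_v4 = flags.get("extension", {}).get("manager", {}).get("supports_v4", False)
print(manager_v4)  # True on servers that include this change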
@@ -1,4 +1,4 @@
-from typing import Type, List, NamedTuple
+from typing import NamedTuple
 from comfy_api.internal.singleton import ProxiedSingleton
 from packaging import version as packaging_version

@@ -10,7 +10,7 @@ class ComfyAPIBase(ProxiedSingleton):

 class ComfyAPIWithVersion(NamedTuple):
     version: str
-    api_class: Type[ComfyAPIBase]
+    api_class: type[ComfyAPIBase]


 def parse_version(version_str: str) -> packaging_version.Version:
@@ -23,16 +23,16 @@ def parse_version(version_str: str) -> packaging_version.Version:
     return packaging_version.parse(version_str)


-registered_versions: List[ComfyAPIWithVersion] = []
+registered_versions: list[ComfyAPIWithVersion] = []


-def register_versions(versions: List[ComfyAPIWithVersion]):
+def register_versions(versions: list[ComfyAPIWithVersion]):
     versions.sort(key=lambda x: parse_version(x.version))
     global registered_versions
     registered_versions = versions


-def get_all_versions() -> List[ComfyAPIWithVersion]:
+def get_all_versions() -> list[ComfyAPIWithVersion]:
     """
     Returns a list of all registered ComfyAPI versions.
     """
@@ -8,7 +8,7 @@ import os
 import textwrap
 import threading
 from enum import Enum
-from typing import Optional, Type, get_origin, get_args, get_type_hints
+from typing import Optional, get_origin, get_args, get_type_hints


 class TypeTracker:
@@ -193,7 +193,7 @@ class AsyncToSyncConverter:
         return result_container["result"]

     @classmethod
-    def create_sync_class(cls, async_class: Type, thread_pool_size=10) -> Type:
+    def create_sync_class(cls, async_class: type, thread_pool_size=10) -> type:
         """
         Creates a new class with synchronous versions of all async methods.

@@ -563,7 +563,7 @@ class AsyncToSyncConverter:

     @classmethod
     def _generate_imports(
-        cls, async_class: Type, type_tracker: TypeTracker
+        cls, async_class: type, type_tracker: TypeTracker
     ) -> list[str]:
         """Generate import statements for the stub file."""
         imports = []
@@ -628,7 +628,7 @@ class AsyncToSyncConverter:
         return imports

     @classmethod
-    def _get_class_attributes(cls, async_class: Type) -> list[tuple[str, Type]]:
+    def _get_class_attributes(cls, async_class: type) -> list[tuple[str, type]]:
         """Extract class attributes that are classes themselves."""
         class_attributes = []

@@ -654,7 +654,7 @@ class AsyncToSyncConverter:
     def _generate_inner_class_stub(
         cls,
         name: str,
-        attr: Type,
+        attr: type,
         indent: str = " ",
         type_tracker: Optional[TypeTracker] = None,
     ) -> list[str]:
@@ -782,7 +782,7 @@ class AsyncToSyncConverter:
         return processed

     @classmethod
-    def generate_stub_file(cls, async_class: Type, sync_class: Type) -> None:
+    def generate_stub_file(cls, async_class: type, sync_class: type) -> None:
         """
         Generate a .pyi stub file for the sync class to help IDEs with type checking.
         """
@@ -988,7 +988,7 @@ class AsyncToSyncConverter:
             logging.error(traceback.format_exc())


-def create_sync_class(async_class: Type, thread_pool_size=10) -> Type:
+def create_sync_class(async_class: type, thread_pool_size=10) -> type:
    """
    Creates a sync version of an async class

@@ -1,4 +1,4 @@
-from typing import Type, TypeVar
+from typing import TypeVar

 class SingletonMetaclass(type):
     T = TypeVar("T", bound="SingletonMetaclass")
@@ -11,13 +11,13 @@ class SingletonMetaclass(type):
             )
         return cls._instances[cls]

-    def inject_instance(cls: Type[T], instance: T) -> None:
+    def inject_instance(cls: type[T], instance: T) -> None:
         assert cls not in SingletonMetaclass._instances, (
             "Cannot inject instance after first instantiation"
         )
         SingletonMetaclass._instances[cls] = instance

-    def get_instance(cls: Type[T], *args, **kwargs) -> T:
+    def get_instance(cls: type[T], *args, **kwargs) -> T:
         """
         Gets the singleton instance of the class, creating it if it doesn't exist.
         """
@@ -1,15 +1,15 @@
 from __future__ import annotations

 from abc import ABC, abstractmethod
-from typing import Type, TYPE_CHECKING
+from typing import TYPE_CHECKING
 from comfy_api.internal import ComfyAPIBase
 from comfy_api.internal.singleton import ProxiedSingleton
 from comfy_api.internal.async_to_sync import create_sync_class
-from comfy_api.latest._input import ImageInput, AudioInput, MaskInput, LatentInput, VideoInput
+from ._input import ImageInput, AudioInput, MaskInput, LatentInput, VideoInput
-from comfy_api.latest._input_impl import VideoFromFile, VideoFromComponents
+from ._input_impl import VideoFromFile, VideoFromComponents
-from comfy_api.latest._util import VideoCodec, VideoContainer, VideoComponents, MESH, VOXEL
+from ._util import VideoCodec, VideoContainer, VideoComponents, MESH, VOXEL
-from . import _io as io
+from . import _io_public as io
-from . import _ui as ui
+from . import _ui_public as ui
 # from comfy_api.latest._resources import _RESOURCES as resources #noqa: F401
 from comfy_execution.utils import get_executing_context
 from comfy_execution.progress import get_progress_state, PreviewImageTuple
@@ -80,7 +80,7 @@ class ComfyExtension(ABC):
     async def on_load(self) -> None:
         """
         Called when an extension is loaded.
-        This should be used to initialize any global resources neeeded by the extension.
+        This should be used to initialize any global resources needed by the extension.
         """

     @abstractmethod
@@ -113,7 +113,7 @@ ComfyAPI = ComfyAPI_latest
 if TYPE_CHECKING:
     import comfy_api.latest.generated.ComfyAPISyncStub  # type: ignore

-    ComfyAPISync: Type[comfy_api.latest.generated.ComfyAPISyncStub.ComfyAPISyncStub]
+    ComfyAPISync: type[comfy_api.latest.generated.ComfyAPISyncStub.ComfyAPISyncStub]
 ComfyAPISync = create_sync_class(ComfyAPI_latest)

 # create new aliases for io and ui
@@ -1,5 +1,5 @@
 import torch
-from typing import TypedDict, List, Optional
+from typing import TypedDict, Optional

 ImageInput = torch.Tensor
 """
@@ -39,4 +39,4 @@ class LatentInput(TypedDict):
     Optional noise mask tensor in the same format as samples.
     """

-    batch_index: Optional[List[int]]
+    batch_index: Optional[list[int]]
@@ -4,7 +4,7 @@ from fractions import Fraction
 from typing import Optional, Union, IO
 import io
 import av
-from comfy_api.util import VideoContainer, VideoCodec, VideoComponents
+from .._util import VideoContainer, VideoCodec, VideoComponents

 class VideoInput(ABC):
     """
@@ -3,14 +3,14 @@ from av.container import InputContainer
 from av.subtitles.stream import SubtitleStream
 from fractions import Fraction
 from typing import Optional
-from comfy_api.latest._input import AudioInput, VideoInput
+from .._input import AudioInput, VideoInput
 import av
 import io
 import json
 import numpy as np
 import math
 import torch
-from comfy_api.latest._util import VideoContainer, VideoCodec, VideoComponents
+from .._util import VideoContainer, VideoCodec, VideoComponents


 def container_to_output_format(container_format: str | None) -> str | None:
@@ -336,7 +336,10 @@ class VideoFromComponents(VideoInput):
             raise ValueError("Only MP4 format is supported for now")
         if codec != VideoCodec.AUTO and codec != VideoCodec.H264:
             raise ValueError("Only H264 codec is supported for now")
-        with av.open(path, mode='w', options={'movflags': 'use_metadata_tags'}) as output:
+        extra_kwargs = {}
+        if isinstance(format, VideoContainer) and format != VideoContainer.AUTO:
+            extra_kwargs["format"] = format.value
+        with av.open(path, mode='w', options={'movflags': 'use_metadata_tags'}, **extra_kwargs) as output:
             # Add metadata before writing any streams
             if metadata is not None:
                 for key, value in metadata.items():
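For orientation (not part of the diff): PyAV normally infers the muxer from the output file extension, so passing format= pins the container when the path alone is ambiguous, which is what the **extra_kwargs forwarding above enables. A tiny standalone sketch with a hypothetical path:

import av

# Without format="mp4", PyAV would try to guess the container from ".bin".
with av.open("/tmp/clip.bin", mode="w", format="mp4", options={"movflags": "use_metadata_tags"}) as output:
    output.metadata["comment"] = "container format pinned explicitly"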
@@ -4,7 +4,8 @@ import copy
 import inspect
 from abc import ABC, abstractmethod
 from collections import Counter
-from dataclasses import asdict, dataclass
+from collections.abc import Iterable
+from dataclasses import asdict, dataclass, field
 from enum import Enum
 from typing import Any, Callable, Literal, TypedDict, TypeVar, TYPE_CHECKING
 from typing_extensions import NotRequired, final
@@ -25,7 +26,7 @@ if TYPE_CHECKING:
     from comfy_api.input import VideoInput
 from comfy_api.internal import (_ComfyNodeInternal, _NodeOutputInternal, classproperty, copy_class, first_real_override, is_class,
                                 prune_dict, shallow_clone_class)
-from comfy_api.latest._resources import Resources, ResourcesLocal
+from ._resources import Resources, ResourcesLocal
 from comfy_execution.graph_utils import ExecutionBlocker
 from ._util import MESH, VOXEL

@@ -150,6 +151,9 @@ class _IO_V3:
     def __init__(self):
         pass

+    def validate(self):
+        pass
+
     @property
     def io_type(self):
         return self.Parent.io_type
@@ -182,6 +186,9 @@ class Input(_IO_V3):
     def get_io_type(self):
         return _StringIOType(self.io_type)

+    def get_all(self) -> list[Input]:
+        return [self]
+
 class WidgetInput(Input):
     '''
     Base class for a V3 Input with widget.
@@ -561,6 +568,8 @@ class Conditioning(ComfyTypeIO):
         '''Used by WAN Camera.'''
         time_dim_concat: NotRequired[torch.Tensor]
         '''Used by WAN Phantom Subject.'''
+        time_dim_replace: NotRequired[torch.Tensor]
+        '''Used by Kandinsky5 I2V.'''

     CondList = list[tuple[torch.Tensor, PooledDict]]
     Type = CondList
@@ -765,6 +774,13 @@ class AudioEncoder(ComfyTypeIO):
 class AudioEncoderOutput(ComfyTypeIO):
     Type = Any

+@comfytype(io_type="TRACKS")
+class Tracks(ComfyTypeIO):
+    class TrackDict(TypedDict):
+        track_path: torch.Tensor
+        track_visibility: torch.Tensor
+    Type = TrackDict
+
 @comfytype(io_type="COMFY_MULTITYPED_V3")
 class MultiType:
     Type = Any
@@ -814,13 +830,61 @@ class MultiType:
         else:
             return super().as_dict()
 
+@comfytype(io_type="COMFY_MATCHTYPE_V3")
+class MatchType(ComfyTypeIO):
+    class Template:
+        def __init__(self, template_id: str, allowed_types: _ComfyType | list[_ComfyType] = AnyType):
+            self.template_id = template_id
+            # account for syntactic sugar
+            if not isinstance(allowed_types, Iterable):
+                allowed_types = [allowed_types]
+            for t in allowed_types:
+                if not isinstance(t, type):
+                    if not isinstance(t, _ComfyType):
+                        raise ValueError(f"Allowed types must be a ComfyType or a list of ComfyTypes, got {t.__class__.__name__}")
+                else:
+                    if not issubclass(t, _ComfyType):
+                        raise ValueError(f"Allowed types must be a ComfyType or a list of ComfyTypes, got {t.__name__}")
+            self.allowed_types = allowed_types
+
+        def as_dict(self):
+            return {
+                "template_id": self.template_id,
+                "allowed_types": ",".join([t.io_type for t in self.allowed_types]),
+            }
+
+    class Input(Input):
+        def __init__(self, id: str, template: MatchType.Template,
+                     display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, extra_dict=None):
+            super().__init__(id, display_name, optional, tooltip, lazy, extra_dict)
+            self.template = template
+
+        def as_dict(self):
+            return super().as_dict() | prune_dict({
+                "template": self.template.as_dict(),
+            })
+
+    class Output(Output):
+        def __init__(self, template: MatchType.Template, id: str=None, display_name: str=None, tooltip: str=None,
+                     is_output_list=False):
+            super().__init__(id, display_name, tooltip, is_output_list)
+            self.template = template
+
+        def as_dict(self):
+            return super().as_dict() | prune_dict({
+                "template": self.template.as_dict(),
+            })
+
 class DynamicInput(Input, ABC):
     '''
     Abstract class for dynamic input registration.
     '''
-    @abstractmethod
     def get_dynamic(self) -> list[Input]:
-        ...
+        return []
+
+    def expand_schema_for_dynamic(self, d: dict[str, Any], live_inputs: dict[str, Any], curr_prefix=''):
+        pass
+
 
 class DynamicOutput(Output, ABC):
     '''
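(Illustrative aside, not part of the diff.) With the MatchType added above, an input and an output that share a Template are type-linked in the frontend. A minimal sketch, assuming a node schema built from the types in this module (Image and Mask here stand in for whatever ComfyTypes you want to allow):

    template = MatchType.Template("T", allowed_types=[Image, Mask])
    inputs = [MatchType.Input("value", template=template)]
    outputs = [MatchType.Output(template=template, display_name="value")]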
@@ -830,99 +894,223 @@ class DynamicOutput(Output, ABC):
                  is_output_list=False):
         super().__init__(id, display_name, tooltip, is_output_list)
 
-    @abstractmethod
     def get_dynamic(self) -> list[Output]:
-        ...
+        return []
 
 
 @comfytype(io_type="COMFY_AUTOGROW_V3")
-class AutogrowDynamic(ComfyTypeI):
-    Type = list[Any]
-    class Input(DynamicInput):
-        def __init__(self, id: str, template_input: Input, min: int=1, max: int=None,
-                     display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, extra_dict=None):
-            super().__init__(id, display_name, optional, tooltip, lazy, extra_dict)
-            self.template_input = template_input
-            if min is not None:
-                assert(min >= 1)
-            if max is not None:
-                assert(max >= 1)
+class Autogrow(ComfyTypeI):
+    Type = dict[str, Any]
+    _MaxNames = 100  # NOTE: max 100 names for sanity
+
+    class _AutogrowTemplate:
+        def __init__(self, input: Input):
+            # dynamic inputs are not allowed as the template input
+            assert(not isinstance(input, DynamicInput))
+            self.input = copy.copy(input)
+            if isinstance(self.input, WidgetInput):
+                self.input.force_input = True
+            self.names: list[str] = []
+            self.cached_inputs = {}
+
+        def _create_input(self, input: Input, name: str):
+            new_input = copy.copy(self.input)
+            new_input.id = name
+            return new_input
+
+        def _create_cached_inputs(self):
+            for name in self.names:
+                self.cached_inputs[name] = self._create_input(self.input, name)
+
+        def get_all(self) -> list[Input]:
+            return list(self.cached_inputs.values())
+
+        def as_dict(self):
+            return prune_dict({
+                "input": create_input_dict_v1([self.input]),
+            })
+
+        def validate(self):
+            self.input.validate()
+
+        def expand_schema_for_dynamic(self, d: dict[str, Any], live_inputs: dict[str, Any], curr_prefix=''):
+            real_inputs = []
+            for name, input in self.cached_inputs.items():
+                if name in live_inputs:
+                    real_inputs.append(input)
+            add_to_input_dict_v1(d, real_inputs, live_inputs, curr_prefix)
+            add_dynamic_id_mapping(d, real_inputs, curr_prefix)
+
+    class TemplatePrefix(_AutogrowTemplate):
+        def __init__(self, input: Input, prefix: str, min: int=1, max: int=10):
+            super().__init__(input)
+            self.prefix = prefix
+            assert(min >= 0)
+            assert(max >= 1)
+            assert(max <= Autogrow._MaxNames)
             self.min = min
             self.max = max
+            self.names = [f"{self.prefix}{i}" for i in range(self.max)]
+            self._create_cached_inputs()
+
+        def as_dict(self):
+            return super().as_dict() | prune_dict({
+                "prefix": self.prefix,
+                "min": self.min,
+                "max": self.max,
+            })
+
+    class TemplateNames(_AutogrowTemplate):
+        def __init__(self, input: Input, names: list[str], min: int=1):
+            super().__init__(input)
+            self.names = names[:Autogrow._MaxNames]
+            assert(min >= 0)
+            self.min = min
+            self._create_cached_inputs()
+
+        def as_dict(self):
+            return super().as_dict() | prune_dict({
+                "names": self.names,
+                "min": self.min,
+            })
+
+    class Input(DynamicInput):
+        def __init__(self, id: str, template: Autogrow.TemplatePrefix | Autogrow.TemplateNames,
+                     display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, extra_dict=None):
+            super().__init__(id, display_name, optional, tooltip, lazy, extra_dict)
+            self.template = template
+
+        def as_dict(self):
+            return super().as_dict() | prune_dict({
+                "template": self.template.as_dict(),
+            })
 
         def get_dynamic(self) -> list[Input]:
-            curr_count = 1
-            new_inputs = []
-            for i in range(self.min):
-                new_input = copy.copy(self.template_input)
-                new_input.id = f"{new_input.id}{curr_count}_${self.id}_ag$"
-                if new_input.display_name is not None:
-                    new_input.display_name = f"{new_input.display_name}{curr_count}"
-                new_input.optional = self.optional or new_input.optional
-                if isinstance(self.template_input, WidgetInput):
-                    new_input.force_input = True
-                new_inputs.append(new_input)
-                curr_count += 1
-            # pretend to expand up to max
-            for i in range(curr_count-1, self.max):
-                new_input = copy.copy(self.template_input)
-                new_input.id = f"{new_input.id}{curr_count}_${self.id}_ag$"
-                if new_input.display_name is not None:
-                    new_input.display_name = f"{new_input.display_name}{curr_count}"
-                new_input.optional = True
-                if isinstance(self.template_input, WidgetInput):
-                    new_input.force_input = True
-                new_inputs.append(new_input)
-                curr_count += 1
-            return new_inputs
+            return self.template.get_all()
 
-@comfytype(io_type="COMFY_COMBODYNAMIC_V3")
-class ComboDynamic(ComfyTypeI):
-    class Input(DynamicInput):
-        def __init__(self, id: str):
-            pass
+        def get_all(self) -> list[Input]:
+            return [self] + self.template.get_all()
 
-@comfytype(io_type="COMFY_MATCHTYPE_V3")
-class MatchType(ComfyTypeIO):
-    class Template:
-        def __init__(self, template_id: str, allowed_types: _ComfyType | list[_ComfyType]):
-            self.template_id = template_id
-            self.allowed_types = [allowed_types] if isinstance(allowed_types, _ComfyType) else allowed_types
+        def validate(self):
+            self.template.validate()
+
+        def expand_schema_for_dynamic(self, d: dict[str, Any], live_inputs: dict[str, Any], curr_prefix=''):
+            curr_prefix = f"{curr_prefix}{self.id}."
+            # need to remove self from expected inputs dictionary; replaced by template inputs in frontend
+            for inner_dict in d.values():
+                if self.id in inner_dict:
+                    del inner_dict[self.id]
+            self.template.expand_schema_for_dynamic(d, live_inputs, curr_prefix)
 
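(Illustrative aside, not part of the diff.) Autogrow.Input wraps a single template input that the frontend replicates on demand. A minimal sketch, assuming Image.Input from this module; the ids below are hypothetical:

    images = Autogrow.Input(
        "images",
        template=Autogrow.TemplatePrefix(Image.Input("image"), prefix="image_", min=1, max=10),
    )
    # the replicated slots ("image_0", "image_1", ...) are presumably regrouped under "images" at execution time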
+@comfytype(io_type="COMFY_DYNAMICCOMBO_V3")
+class DynamicCombo(ComfyTypeI):
+    Type = dict[str, Any]
+
+    class Option:
+        def __init__(self, key: str, inputs: list[Input]):
+            self.key = key
+            self.inputs = inputs
 
         def as_dict(self):
             return {
-                "template_id": self.template_id,
-                "allowed_types": "".join(t.io_type for t in self.allowed_types),
+                "key": self.key,
+                "inputs": create_input_dict_v1(self.inputs),
             }
 
     class Input(DynamicInput):
-        def __init__(self, id: str, template: MatchType.Template,
+        def __init__(self, id: str, options: list[DynamicCombo.Option],
                      display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, extra_dict=None):
             super().__init__(id, display_name, optional, tooltip, lazy, extra_dict)
-            self.template = template
+            self.options = options
+
+        def expand_schema_for_dynamic(self, d: dict[str, Any], live_inputs: dict[str, Any], curr_prefix=''):
+            # check if dynamic input's id is in live_inputs
+            if self.id in live_inputs:
+                curr_prefix = f"{curr_prefix}{self.id}."
+                key = live_inputs[self.id]
+                selected_option = None
+                for option in self.options:
+                    if option.key == key:
+                        selected_option = option
+                        break
+                if selected_option is not None:
+                    add_to_input_dict_v1(d, selected_option.inputs, live_inputs, curr_prefix)
+                    add_dynamic_id_mapping(d, selected_option.inputs, curr_prefix, self)
 
         def get_dynamic(self) -> list[Input]:
-            return [self]
+            return [input for option in self.options for input in option.inputs]
+
+        def get_all(self) -> list[Input]:
+            return [self] + [input for option in self.options for input in option.inputs]
 
         def as_dict(self):
             return super().as_dict() | prune_dict({
-                "template": self.template.as_dict(),
+                "options": [o.as_dict() for o in self.options],
             })
 
-    class Output(DynamicOutput):
-        def __init__(self, id: str, template: MatchType.Template, display_name: str=None, tooltip: str=None,
-                     is_output_list=False):
-            super().__init__(id, display_name, tooltip, is_output_list)
-            self.template = template
+        def validate(self):
+            # make sure all nested inputs are validated
+            for option in self.options:
+                for input in option.inputs:
+                    input.validate()
 
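(Illustrative aside, not part of the diff.) DynamicCombo.Input exposes one combo whose selected key decides which nested inputs get added to the schema. A minimal sketch with hypothetical option keys, assuming Int.Input and Float.Input from this module:

    mode = DynamicCombo.Input("mode", options=[
        DynamicCombo.Option("scale", [Float.Input("factor")]),
        DynamicCombo.Option("resize", [Int.Input("width"), Int.Input("height")]),
    ])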
-        def get_dynamic(self) -> list[Output]:
-            return [self]
+@comfytype(io_type="COMFY_DYNAMICSLOT_V3")
+class DynamicSlot(ComfyTypeI):
+    Type = dict[str, Any]
+
+    class Input(DynamicInput):
+        def __init__(self, slot: Input, inputs: list[Input],
+                     display_name: str=None, tooltip: str=None, lazy: bool=None, extra_dict=None):
+            assert(not isinstance(slot, DynamicInput))
+            self.slot = copy.copy(slot)
+            self.slot.display_name = slot.display_name if slot.display_name is not None else display_name
+            optional = True
+            self.slot.tooltip = slot.tooltip if slot.tooltip is not None else tooltip
+            self.slot.lazy = slot.lazy if slot.lazy is not None else lazy
+            self.slot.extra_dict = slot.extra_dict if slot.extra_dict is not None else extra_dict
+            super().__init__(slot.id, self.slot.display_name, optional, self.slot.tooltip, self.slot.lazy, self.slot.extra_dict)
+            self.inputs = inputs
+            self.force_input = None
+            # force widget inputs to have no widgets, otherwise this would be awkward
+            if isinstance(self.slot, WidgetInput):
+                self.force_input = True
+                self.slot.force_input = True
+
+        def expand_schema_for_dynamic(self, d: dict[str, Any], live_inputs: dict[str, Any], curr_prefix=''):
+            if self.id in live_inputs:
+                curr_prefix = f"{curr_prefix}{self.id}."
+                add_to_input_dict_v1(d, self.inputs, live_inputs, curr_prefix)
+                add_dynamic_id_mapping(d, [self.slot] + self.inputs, curr_prefix)
+
+        def get_dynamic(self) -> list[Input]:
+            return [self.slot] + self.inputs
+
+        def get_all(self) -> list[Input]:
+            return [self] + [self.slot] + self.inputs
 
         def as_dict(self):
             return super().as_dict() | prune_dict({
-                "template": self.template.as_dict(),
+                "slotType": str(self.slot.get_io_type()),
+                "inputs": create_input_dict_v1(self.inputs),
+                "forceInput": self.force_input,
             })
 
+        def validate(self):
+            self.slot.validate()
+            for input in self.inputs:
+                input.validate()
+
+def add_dynamic_id_mapping(d: dict[str, Any], inputs: list[Input], curr_prefix: str, self: DynamicInput=None):
+    dynamic = d.setdefault("dynamic_paths", {})
+    if self is not None:
+        dynamic[self.id] = f"{curr_prefix}{self.id}"
+    for i in inputs:
+        if not isinstance(i, DynamicInput):
+            dynamic[f"{i.id}"] = f"{curr_prefix}{i.id}"
+
+class V3Data(TypedDict):
+    hidden_inputs: dict[str, Any]
+    dynamic_paths: dict[str, Any]
+
 class HiddenHolder:
     def __init__(self, unique_id: str, prompt: Any,
@@ -984,6 +1172,7 @@ class NodeInfoV1:
     output_is_list: list[bool]=None
     output_name: list[str]=None
    output_tooltips: list[str]=None
+    output_matchtypes: list[str]=None
     name: str=None
     display_name: str=None
     description: str=None
@@ -1019,9 +1208,9 @@ class Schema:
     """Display name of node."""
     category: str = "sd"
     """The category of the node, as per the "Add Node" menu."""
-    inputs: list[Input]=None
-    outputs: list[Output]=None
-    hidden: list[Hidden]=None
+    inputs: list[Input] = field(default_factory=list)
+    outputs: list[Output] = field(default_factory=list)
+    hidden: list[Hidden] = field(default_factory=list)
     description: str=""
     """Node description, shown as a tooltip when hovering over the node."""
     is_input_list: bool = False
@@ -1061,7 +1250,11 @@ class Schema:
         '''Validate the schema:
         - verify ids on inputs and outputs are unique - both internally and in relation to each other
         '''
-        input_ids = [i.id for i in self.inputs] if self.inputs is not None else []
+        nested_inputs: list[Input] = []
+        if self.inputs is not None:
+            for input in self.inputs:
+                nested_inputs.extend(input.get_all())
+        input_ids = [i.id for i in nested_inputs] if nested_inputs is not None else []
         output_ids = [o.id for o in self.outputs] if self.outputs is not None else []
         input_set = set(input_ids)
         output_set = set(output_ids)
@@ -1077,6 +1270,13 @@ class Schema:
             issues.append(f"Ids must be unique between inputs and outputs, but {intersection} are not.")
         if len(issues) > 0:
             raise ValueError("\n".join(issues))
+        # validate inputs and outputs
+        if self.inputs is not None:
+            for input in self.inputs:
+                input.validate()
+        if self.outputs is not None:
+            for output in self.outputs:
+                output.validate()
 
     def finalize(self):
         """Add hidden based on selected schema options, and give outputs without ids default ids."""
@@ -1102,19 +1302,10 @@ class Schema:
             if output.id is None:
                 output.id = f"_{i}_{output.io_type}_"
 
-    def get_v1_info(self, cls) -> NodeInfoV1:
+    def get_v1_info(self, cls, live_inputs: dict[str, Any]=None) -> NodeInfoV1:
+        # NOTE: live_inputs will not be used anymore very soon and this will be done another way
         # get V1 inputs
-        input = {
-            "required": {}
-        }
-        if self.inputs:
-            for i in self.inputs:
-                if isinstance(i, DynamicInput):
-                    dynamic_inputs = i.get_dynamic()
-                    for d in dynamic_inputs:
-                        add_to_dict_v1(d, input)
-                else:
-                    add_to_dict_v1(i, input)
+        input = create_input_dict_v1(self.inputs, live_inputs)
         if self.hidden:
             for hidden in self.hidden:
                 input.setdefault("hidden", {})[hidden.name] = (hidden.value,)
@@ -1123,12 +1314,24 @@ class Schema:
         output_is_list = []
         output_name = []
         output_tooltips = []
+        output_matchtypes = []
+        any_matchtypes = False
         if self.outputs:
             for o in self.outputs:
                 output.append(o.io_type)
                 output_is_list.append(o.is_output_list)
                 output_name.append(o.display_name if o.display_name else o.io_type)
                 output_tooltips.append(o.tooltip if o.tooltip else None)
+                # special handling for MatchType
+                if isinstance(o, MatchType.Output):
+                    output_matchtypes.append(o.template.template_id)
+                    any_matchtypes = True
+                else:
+                    output_matchtypes.append(None)
+
+        # clear out lists that are all None
+        if not any_matchtypes:
+            output_matchtypes = None
 
         info = NodeInfoV1(
             input=input,
@@ -1137,6 +1340,7 @@ class Schema:
             output_is_list=output_is_list,
             output_name=output_name,
             output_tooltips=output_tooltips,
+            output_matchtypes=output_matchtypes,
             name=self.node_id,
             display_name=self.display_name,
             category=self.category,
@@ -1182,16 +1386,57 @@ class Schema:
         return info
 
 
-def add_to_dict_v1(i: Input, input: dict):
+def create_input_dict_v1(inputs: list[Input], live_inputs: dict[str, Any]=None) -> dict:
+    input = {
+        "required": {}
+    }
+    add_to_input_dict_v1(input, inputs, live_inputs)
+    return input
+
+def add_to_input_dict_v1(d: dict[str, Any], inputs: list[Input], live_inputs: dict[str, Any]=None, curr_prefix=''):
+    for i in inputs:
+        if isinstance(i, DynamicInput):
+            add_to_dict_v1(i, d)
+            if live_inputs is not None:
+                i.expand_schema_for_dynamic(d, live_inputs, curr_prefix)
+        else:
+            add_to_dict_v1(i, d)
+
+def add_to_dict_v1(i: Input, d: dict, dynamic_dict: dict=None):
     key = "optional" if i.optional else "required"
     as_dict = i.as_dict()
     # for v1, we don't want to include the optional key
     as_dict.pop("optional", None)
-    input.setdefault(key, {})[i.id] = (i.get_io_type(), as_dict)
+    if dynamic_dict is None:
+        value = (i.get_io_type(), as_dict)
+    else:
+        value = (i.get_io_type(), as_dict, dynamic_dict)
+    d.setdefault(key, {})[i.id] = value
 
 def add_to_dict_v3(io: Input | Output, d: dict):
     d[io.id] = (io.get_io_type(), io.as_dict())
 
+def build_nested_inputs(values: dict[str, Any], v3_data: V3Data):
+    paths = v3_data.get("dynamic_paths", None)
+    if paths is None:
+        return values
+    values = values.copy()
+    result = {}
+
+    for key, path in paths.items():
+        parts = path.split(".")
+        current = result
+
+        for i, p in enumerate(parts):
+            is_last = (i == len(parts) - 1)
+
+            if is_last:
+                current[p] = values.pop(key, None)
+            else:
+                current = current.setdefault(p, {})
+
+    values.update(result)
+    return values
+
 
 class _ComfyNodeBaseInternal(_ComfyNodeInternal):
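(Illustrative aside, not part of the diff.) build_nested_inputs regroups the flat V1-style kwargs using the dynamic_paths recorded by add_dynamic_id_mapping. A small worked example with made-up ids, traced against the code above:

    values = {"prompt": "hi", "mode": "scale", "factor": 2.0}
    v3_data = {"dynamic_paths": {"mode": "mode.mode", "factor": "mode.factor"}}
    build_nested_inputs(values, v3_data)
    # -> {"prompt": "hi", "mode": {"mode": "scale", "factor": 2.0}}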
@@ -1311,12 +1556,12 @@ class _ComfyNodeBaseInternal(_ComfyNodeInternal):
 
     @final
     @classmethod
-    def PREPARE_CLASS_CLONE(cls, hidden_inputs: dict) -> type[ComfyNode]:
+    def PREPARE_CLASS_CLONE(cls, v3_data: V3Data) -> type[ComfyNode]:
         """Creates clone of real node class to prevent monkey-patching."""
         c_type: type[ComfyNode] = cls if is_class(cls) else type(cls)
         type_clone: type[ComfyNode] = shallow_clone_class(c_type)
         # set hidden
-        type_clone.hidden = HiddenHolder.from_dict(hidden_inputs)
+        type_clone.hidden = HiddenHolder.from_dict(v3_data["hidden_inputs"])
         return type_clone
 
     @final
@@ -1433,14 +1678,18 @@ class _ComfyNodeBaseInternal(_ComfyNodeInternal):
 
     @final
     @classmethod
-    def INPUT_TYPES(cls, include_hidden=True, return_schema=False) -> dict[str, dict] | tuple[dict[str, dict], Schema]:
+    def INPUT_TYPES(cls, include_hidden=True, return_schema=False, live_inputs=None) -> dict[str, dict] | tuple[dict[str, dict], Schema, V3Data]:
         schema = cls.FINALIZE_SCHEMA()
-        info = schema.get_v1_info(cls)
+        info = schema.get_v1_info(cls, live_inputs)
         input = info.input
         if not include_hidden:
             input.pop("hidden", None)
         if return_schema:
-            return input, schema
+            v3_data: V3Data = {}
+            dynamic = input.pop("dynamic_paths", None)
+            if dynamic is not None:
+                v3_data["dynamic_paths"] = dynamic
+            return input, schema, v3_data
         return input
 
     @final
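(Illustrative aside, not part of the diff.) Callers that ask for the schema now unpack a third element carrying the dynamic-path data; MyNode and live_inputs below are hypothetical:

    input_dict, schema, v3_data = MyNode.INPUT_TYPES(return_schema=True, live_inputs=live_inputs)
    values = build_nested_inputs(values, v3_data)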
@@ -1513,7 +1762,7 @@ class ComfyNode(_ComfyNodeBaseInternal):
         raise NotImplementedError
 
     @classmethod
-    def validate_inputs(cls, **kwargs) -> bool:
+    def validate_inputs(cls, **kwargs) -> bool | str:
         """Optionally, define this function to validate inputs; equivalent to V1's VALIDATE_INPUTS."""
         raise NotImplementedError
 
@@ -1573,7 +1822,7 @@ class NodeOutput(_NodeOutputInternal):
             ui = data["ui"]
         if "expand" in data:
             expand = data["expand"]
-        return cls(args=args, ui=ui, expand=expand)
+        return cls(*args, ui=ui, expand=expand)
 
     def __getitem__(self, index) -> Any:
         return self.args[index]
@@ -1628,6 +1877,7 @@ __all__ = [
     "StyleModel",
     "Gligen",
     "UpscaleModel",
+    "LatentUpscaleModel",
     "Audio",
     "Video",
     "SVG",
@@ -1651,6 +1901,11 @@ __all__ = [
     "SEGS",
     "AnyType",
     "MultiType",
+    "Tracks",
+    # Dynamic Types
+    "MatchType",
+    # "DynamicCombo",
+    # "Autogrow",
     # Other classes
     "HiddenHolder",
     "Hidden",
@@ -1661,4 +1916,5 @@ __all__ = [
     "NodeOutput",
     "add_to_dict_v1",
     "add_to_dict_v3",
+    "V3Data",
 ]

comfy_api/latest/_io_public.py  (new file, 1 addition)
@@ -0,0 +1 @@
+from ._io import * # noqa: F403
@@ -3,8 +3,8 @@ from __future__ import annotations
 import json
 import os
 import random
+import uuid
 from io import BytesIO
-from typing import Type
 
 import av
 import numpy as np
@@ -21,7 +21,7 @@ import folder_paths
 
 # used for image preview
 from comfy.cli_args import args
-from comfy_api.latest._io import ComfyNode, FolderType, Image, _UIOutput
+from ._io import ComfyNode, FolderType, Image, _UIOutput
 
 
 class SavedResult(dict):
@@ -82,7 +82,7 @@ class ImageSaveHelper:
         return PILImage.fromarray(np.clip(255.0 * image_tensor.cpu().numpy(), 0, 255).astype(np.uint8))
 
     @staticmethod
-    def _create_png_metadata(cls: Type[ComfyNode] | None) -> PngInfo | None:
+    def _create_png_metadata(cls: type[ComfyNode] | None) -> PngInfo | None:
         """Creates a PngInfo object with prompt and extra_pnginfo."""
         if args.disable_metadata or cls is None or not cls.hidden:
             return None
@@ -95,7 +95,7 @@ class ImageSaveHelper:
         return metadata
 
     @staticmethod
-    def _create_animated_png_metadata(cls: Type[ComfyNode] | None) -> PngInfo | None:
+    def _create_animated_png_metadata(cls: type[ComfyNode] | None) -> PngInfo | None:
         """Creates a PngInfo object with prompt and extra_pnginfo for animated PNGs (APNG)."""
         if args.disable_metadata or cls is None or not cls.hidden:
             return None
@@ -120,7 +120,7 @@ class ImageSaveHelper:
         return metadata
 
     @staticmethod
-    def _create_webp_metadata(pil_image: PILImage.Image, cls: Type[ComfyNode] | None) -> PILImage.Exif:
+    def _create_webp_metadata(pil_image: PILImage.Image, cls: type[ComfyNode] | None) -> PILImage.Exif:
         """Creates EXIF metadata bytes for WebP images."""
         exif_data = pil_image.getexif()
         if args.disable_metadata or cls is None or cls.hidden is None:
@@ -136,7 +136,7 @@ class ImageSaveHelper:
 
     @staticmethod
     def save_images(
-        images, filename_prefix: str, folder_type: FolderType, cls: Type[ComfyNode] | None, compress_level = 4,
+        images, filename_prefix: str, folder_type: FolderType, cls: type[ComfyNode] | None, compress_level = 4,
     ) -> list[SavedResult]:
         """Saves a batch of images as individual PNG files."""
         full_output_folder, filename, counter, subfolder, _ = folder_paths.get_save_image_path(
@@ -154,7 +154,7 @@ class ImageSaveHelper:
         return results
 
     @staticmethod
-    def get_save_images_ui(images, filename_prefix: str, cls: Type[ComfyNode] | None, compress_level=4) -> SavedImages:
+    def get_save_images_ui(images, filename_prefix: str, cls: type[ComfyNode] | None, compress_level=4) -> SavedImages:
         """Saves a batch of images and returns a UI object for the node output."""
         return SavedImages(
             ImageSaveHelper.save_images(
@@ -168,7 +168,7 @@ class ImageSaveHelper:
 
     @staticmethod
     def save_animated_png(
-        images, filename_prefix: str, folder_type: FolderType, cls: Type[ComfyNode] | None, fps: float, compress_level: int
+        images, filename_prefix: str, folder_type: FolderType, cls: type[ComfyNode] | None, fps: float, compress_level: int
     ) -> SavedResult:
         """Saves a batch of images as a single animated PNG."""
         full_output_folder, filename, counter, subfolder, _ = folder_paths.get_save_image_path(
@@ -190,7 +190,7 @@ class ImageSaveHelper:
 
     @staticmethod
     def get_save_animated_png_ui(
-        images, filename_prefix: str, cls: Type[ComfyNode] | None, fps: float, compress_level: int
+        images, filename_prefix: str, cls: type[ComfyNode] | None, fps: float, compress_level: int
     ) -> SavedImages:
         """Saves an animated PNG and returns a UI object for the node output."""
         result = ImageSaveHelper.save_animated_png(
@@ -208,7 +208,7 @@ class ImageSaveHelper:
         images,
         filename_prefix: str,
         folder_type: FolderType,
-        cls: Type[ComfyNode] | None,
+        cls: type[ComfyNode] | None,
         fps: float,
         lossless: bool,
         quality: int,
@@ -237,7 +237,7 @@ class ImageSaveHelper:
     def get_save_animated_webp_ui(
         images,
         filename_prefix: str,
-        cls: Type[ComfyNode] | None,
+        cls: type[ComfyNode] | None,
         fps: float,
         lossless: bool,
         quality: int,
@@ -266,7 +266,7 @@ class AudioSaveHelper:
         audio: dict,
         filename_prefix: str,
         folder_type: FolderType,
-        cls: Type[ComfyNode] | None,
+        cls: type[ComfyNode] | None,
         format: str = "flac",
         quality: str = "128k",
     ) -> list[SavedResult]:
@@ -318,9 +318,10 @@ class AudioSaveHelper:
         for key, value in metadata.items():
             output_container.metadata[key] = value
 
+        layout = "mono" if waveform.shape[0] == 1 else "stereo"
         # Set up the output stream with appropriate properties
         if format == "opus":
-            out_stream = output_container.add_stream("libopus", rate=sample_rate)
+            out_stream = output_container.add_stream("libopus", rate=sample_rate, layout=layout)
             if quality == "64k":
                 out_stream.bit_rate = 64000
             elif quality == "96k":
@@ -332,7 +333,7 @@ class AudioSaveHelper:
             elif quality == "320k":
                 out_stream.bit_rate = 320000
         elif format == "mp3":
-            out_stream = output_container.add_stream("libmp3lame", rate=sample_rate)
+            out_stream = output_container.add_stream("libmp3lame", rate=sample_rate, layout=layout)
             if quality == "V0":
                 # TODO i would really love to support V3 and V5 but there doesn't seem to be a way to set the qscale level, the property below is a bool
                 out_stream.codec_context.qscale = 1
@@ -341,12 +342,12 @@ class AudioSaveHelper:
             elif quality == "320k":
                 out_stream.bit_rate = 320000
         else: # format == "flac":
-            out_stream = output_container.add_stream("flac", rate=sample_rate)
+            out_stream = output_container.add_stream("flac", rate=sample_rate, layout=layout)
 
         frame = av.AudioFrame.from_ndarray(
             waveform.movedim(0, 1).reshape(1, -1).float().numpy(),
             format="flt",
-            layout="mono" if waveform.shape[0] == 1 else "stereo",
+            layout=layout,
         )
         frame.sample_rate = sample_rate
         frame.pts = 0
@@ -370,7 +371,7 @@ class AudioSaveHelper:
 
     @staticmethod
     def get_save_audio_ui(
-        audio, filename_prefix: str, cls: Type[ComfyNode] | None, format: str = "flac", quality: str = "128k",
+        audio, filename_prefix: str, cls: type[ComfyNode] | None, format: str = "flac", quality: str = "128k",
     ) -> SavedAudios:
         """Save and instantly wrap for UI."""
         return SavedAudios(
@@ -386,7 +387,7 @@ class AudioSaveHelper:
 
 
 class PreviewImage(_UIOutput):
-    def __init__(self, image: Image.Type, animated: bool = False, cls: Type[ComfyNode] = None, **kwargs):
+    def __init__(self, image: Image.Type, animated: bool = False, cls: type[ComfyNode] = None, **kwargs):
         self.values = ImageSaveHelper.save_images(
             image,
             filename_prefix="ComfyUI_temp_" + ''.join(random.choice("abcdefghijklmnopqrstupvxyz") for _ in range(5)),
@@ -410,7 +411,7 @@ class PreviewMask(PreviewImage):
 
 
 class PreviewAudio(_UIOutput):
-    def __init__(self, audio: dict, cls: Type[ComfyNode] = None, **kwargs):
+    def __init__(self, audio: dict, cls: type[ComfyNode] = None, **kwargs):
         self.values = AudioSaveHelper.save_audio(
             audio,
             filename_prefix="ComfyUI_temp_" + "".join(random.choice("abcdefghijklmnopqrstuvwxyz") for _ in range(5)),
@@ -436,9 +437,19 @@ class PreviewUI3D(_UIOutput):
     def __init__(self, model_file, camera_info, **kwargs):
         self.model_file = model_file
         self.camera_info = camera_info
+        self.bg_image_path = None
+        bg_image = kwargs.get("bg_image", None)
+        if bg_image is not None:
+            img_array = (bg_image[0].cpu().numpy() * 255).astype(np.uint8)
+            img = PILImage.fromarray(img_array)
+            temp_dir = folder_paths.get_temp_directory()
+            filename = f"bg_{uuid.uuid4().hex}.png"
+            bg_image_path = os.path.join(temp_dir, filename)
+            img.save(bg_image_path, compress_level=1)
+            self.bg_image_path = f"temp/{filename}"
 
     def as_dict(self):
-        return {"result": [self.model_file, self.camera_info]}
+        return {"result": [self.model_file, self.camera_info, self.bg_image_path]}
 
 
 class PreviewText(_UIOutput):
comfy_api/latest/_ui_public.py  (new file, 1 addition)
@@ -0,0 +1 @@
+from ._ui import * # noqa: F403
@@ -3,7 +3,7 @@ from dataclasses import dataclass
 from enum import Enum
 from fractions import Fraction
 from typing import Optional
-from comfy_api.latest._input import ImageInput, AudioInput
+from .._input import ImageInput, AudioInput
 
 class VideoCodec(str, Enum):
     AUTO = "auto"
@@ -6,7 +6,7 @@ from comfy_api.latest import (
 )
 from typing import Type, TYPE_CHECKING
 from comfy_api.internal.async_to_sync import create_sync_class
-from comfy_api.latest import io, ui, ComfyExtension #noqa: F401
+from comfy_api.latest import io, ui, IO, UI, ComfyExtension #noqa: F401
 
 
 class ComfyAPIAdapter_v0_0_2(ComfyAPI_latest):
@@ -42,4 +42,8 @@ __all__ = [
     "InputImpl",
     "Types",
     "ComfyExtension",
+    "io",
+    "IO",
+    "ui",
+    "UI",
 ]
@@ -2,9 +2,8 @@ from comfy_api.latest import ComfyAPI_latest
 from comfy_api.v0_0_2 import ComfyAPIAdapter_v0_0_2
 from comfy_api.v0_0_1 import ComfyAPIAdapter_v0_0_1
 from comfy_api.internal import ComfyAPIBase
-from typing import List, Type
 
-supported_versions: List[Type[ComfyAPIBase]] = [
+supported_versions: list[type[ComfyAPIBase]] = [
     ComfyAPI_latest,
     ComfyAPIAdapter_v0_0_2,
     ComfyAPIAdapter_v0_0_1,
comfy_api_nodes/apis/bytedance_api.py  (new file, 144 additions)
@@ -0,0 +1,144 @@
+from typing import Literal
+
+from pydantic import BaseModel, Field
+
+
+class Text2ImageTaskCreationRequest(BaseModel):
+    model: str = Field(...)
+    prompt: str = Field(...)
+    response_format: str | None = Field("url")
+    size: str | None = Field(None)
+    seed: int | None = Field(0, ge=0, le=2147483647)
+    guidance_scale: float | None = Field(..., ge=1.0, le=10.0)
+    watermark: bool | None = Field(True)
+
+
+class Image2ImageTaskCreationRequest(BaseModel):
+    model: str = Field(...)
+    prompt: str = Field(...)
+    response_format: str | None = Field("url")
+    image: str = Field(..., description="Base64 encoded string or image URL")
+    size: str | None = Field("adaptive")
+    seed: int | None = Field(..., ge=0, le=2147483647)
+    guidance_scale: float | None = Field(..., ge=1.0, le=10.0)
+    watermark: bool | None = Field(True)
+
+
+class Seedream4Options(BaseModel):
+    max_images: int = Field(15)
+
+
+class Seedream4TaskCreationRequest(BaseModel):
+    model: str = Field(...)
+    prompt: str = Field(...)
+    response_format: str = Field("url")
+    image: list[str] | None = Field(None, description="Image URLs")
+    size: str = Field(...)
+    seed: int = Field(..., ge=0, le=2147483647)
+    sequential_image_generation: str = Field("disabled")
+    sequential_image_generation_options: Seedream4Options = Field(Seedream4Options(max_images=15))
+    watermark: bool = Field(True)
+
+
+class ImageTaskCreationResponse(BaseModel):
+    model: str = Field(...)
+    created: int = Field(..., description="Unix timestamp (in seconds) indicating time when the request was created.")
+    data: list = Field([], description="Contains information about the generated image(s).")
+    error: dict = Field({}, description="Contains `code` and `message` fields in case of error.")
+
+
+class TaskTextContent(BaseModel):
+    type: str = Field("text")
+    text: str = Field(...)
+
+
+class TaskImageContentUrl(BaseModel):
+    url: str = Field(...)
+
+
+class TaskImageContent(BaseModel):
+    type: str = Field("image_url")
+    image_url: TaskImageContentUrl = Field(...)
+    role: Literal["first_frame", "last_frame", "reference_image"] | None = Field(None)
+
+
+class Text2VideoTaskCreationRequest(BaseModel):
+    model: str = Field(...)
+    content: list[TaskTextContent] = Field(..., min_length=1)
+
+
+class Image2VideoTaskCreationRequest(BaseModel):
+    model: str = Field(...)
+    content: list[TaskTextContent | TaskImageContent] = Field(..., min_length=2)
+
+
+class TaskCreationResponse(BaseModel):
+    id: str = Field(...)
+
+
+class TaskStatusError(BaseModel):
+    code: str = Field(...)
+    message: str = Field(...)
+
+
+class TaskStatusResult(BaseModel):
+    video_url: str = Field(...)
+
+
+class TaskStatusResponse(BaseModel):
+    id: str = Field(...)
+    model: str = Field(...)
+    status: Literal["queued", "running", "cancelled", "succeeded", "failed"] = Field(...)
+    error: TaskStatusError | None = Field(None)
+    content: TaskStatusResult | None = Field(None)
+
+
+RECOMMENDED_PRESETS = [
+    ("1024x1024 (1:1)", 1024, 1024),
+    ("864x1152 (3:4)", 864, 1152),
+    ("1152x864 (4:3)", 1152, 864),
+    ("1280x720 (16:9)", 1280, 720),
+    ("720x1280 (9:16)", 720, 1280),
+    ("832x1248 (2:3)", 832, 1248),
+    ("1248x832 (3:2)", 1248, 832),
+    ("1512x648 (21:9)", 1512, 648),
+    ("2048x2048 (1:1)", 2048, 2048),
+    ("Custom", None, None),
+]
+
+RECOMMENDED_PRESETS_SEEDREAM_4 = [
+    ("2048x2048 (1:1)", 2048, 2048),
+    ("2304x1728 (4:3)", 2304, 1728),
+    ("1728x2304 (3:4)", 1728, 2304),
+    ("2560x1440 (16:9)", 2560, 1440),
+    ("1440x2560 (9:16)", 1440, 2560),
+    ("2496x1664 (3:2)", 2496, 1664),
+    ("1664x2496 (2:3)", 1664, 2496),
+    ("3024x1296 (21:9)", 3024, 1296),
+    ("4096x4096 (1:1)", 4096, 4096),
+    ("Custom", None, None),
+]
+
+# The time in this dictionary are given for 10 seconds duration.
+VIDEO_TASKS_EXECUTION_TIME = {
+    "seedance-1-0-lite-t2v-250428": {
+        "480p": 40,
+        "720p": 60,
+        "1080p": 90,
+    },
+    "seedance-1-0-lite-i2v-250428": {
+        "480p": 40,
+        "720p": 60,
+        "1080p": 90,
+    },
+    "seedance-1-0-pro-250528": {
+        "480p": 70,
+        "720p": 85,
+        "1080p": 115,
+    },
+    "seedance-1-0-pro-fast-251015": {
+        "480p": 50,
+        "720p": 65,
+        "1080p": 100,
+    },
+}
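(Illustrative aside, not part of the diff.) These are plain pydantic models, so a request body can be built and serialized directly; the model id below is hypothetical, and model_dump assumes pydantic v2 (use .dict() on v1):

    req = Text2ImageTaskCreationRequest(
        model="seedream-3-0-t2i",  # hypothetical model id
        prompt="a lighthouse at dusk",
        size="1024x1024",
        seed=0,
        guidance_scale=2.5,
    )
    payload = req.model_dump(exclude_none=True)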
@@ -58,8 +58,14 @@ class GeminiInlineData(BaseModel):
     mimeType: GeminiMimeType | None = Field(None)
 
 
+class GeminiFileData(BaseModel):
+    fileUri: str | None = Field(None)
+    mimeType: GeminiMimeType | None = Field(None)
+
+
 class GeminiPart(BaseModel):
     inlineData: GeminiInlineData | None = Field(None)
+    fileData: GeminiFileData | None = Field(None)
     text: str | None = Field(None)
 
 
@@ -78,15 +84,7 @@ class GeminiSystemInstructionContent(BaseModel):
         description="A list of ordered parts that make up a single message. "
         "Different parts may have different IANA MIME types.",
     )
-    role: GeminiRole = Field(
-        ...,
-        description="The identity of the entity that creates the message. "
-        "The following values are supported: "
-        "user: This indicates that the message is sent by a real person, typically a user-generated message. "
-        "model: This indicates that the message is generated by the model. "
-        "The model value is used to insert messages from model into the conversation during multi-turn conversations. "
-        "For non-multi-turn conversations, this field can be left blank or unset.",
-    )
+    role: GeminiRole | None = Field(..., description="The role field of systemInstruction may be ignored.")
 
 
 class GeminiFunctionDeclaration(BaseModel):
comfy_api_nodes/apis/kling_api.py
Normal file
104
comfy_api_nodes/apis/kling_api.py
Normal file
@ -0,0 +1,104 @@
|
|||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
|
||||||
|
class OmniProText2VideoRequest(BaseModel):
|
||||||
|
model_name: str = Field(..., description="kling-video-o1")
|
||||||
|
aspect_ratio: str = Field(..., description="'16:9', '9:16' or '1:1'")
|
||||||
|
duration: str = Field(..., description="'5' or '10'")
|
||||||
|
prompt: str = Field(...)
|
||||||
|
mode: str = Field("pro")
|
||||||
|
|
||||||
|
|
||||||
|
class OmniParamImage(BaseModel):
|
||||||
|
image_url: str = Field(...)
|
||||||
|
type: str | None = Field(None, description="Can be 'first_frame' or 'end_frame'")
|
||||||
|
|
||||||
|
|
||||||
|
class OmniParamVideo(BaseModel):
|
||||||
|
video_url: str = Field(...)
|
||||||
|
refer_type: str | None = Field(..., description="Can be 'base' or 'feature'")
|
||||||
|
keep_original_sound: str = Field(..., description="'yes' or 'no'")
|
||||||
|
|
||||||
|
|
||||||
|
class OmniProFirstLastFrameRequest(BaseModel):
|
||||||
|
model_name: str = Field(..., description="kling-video-o1")
|
||||||
|
image_list: list[OmniParamImage] = Field(..., min_length=1, max_length=7)
|
||||||
|
duration: str = Field(..., description="'5' or '10'")
|
||||||
|
prompt: str = Field(...)
|
||||||
|
mode: str = Field("pro")
|
||||||
|
|
||||||
|
|
||||||
|
class OmniProReferences2VideoRequest(BaseModel):
|
||||||
|
model_name: str = Field(..., description="kling-video-o1")
|
||||||
|
aspect_ratio: str | None = Field(..., description="'16:9', '9:16' or '1:1'")
|
||||||
|
image_list: list[OmniParamImage] | None = Field(
|
||||||
|
None, max_length=7, description="Max length 4 when video is present."
|
||||||
|
)
|
||||||
|
video_list: list[OmniParamVideo] | None = Field(None, max_length=1)
|
||||||
|
duration: str | None = Field(..., description="From 3 to 10.")
|
||||||
|
prompt: str = Field(...)
|
||||||
|
mode: str = Field("pro")
|
||||||
|
|
||||||
|
|
||||||
|
class TaskStatusVideoResult(BaseModel):
|
||||||
|
duration: str | None = Field(None, description="Total video duration")
|
||||||
|
id: str | None = Field(None, description="Generated video ID")
|
||||||
|
url: str | None = Field(None, description="URL for generated video")
|
||||||
|
|
||||||
|
|
||||||
|
class TaskStatusImageResult(BaseModel):
|
||||||
|
index: int = Field(..., description="Image Number,0-9")
|
||||||
|
url: str = Field(..., description="URL for generated image")
|
||||||
|
|
||||||
|
|
||||||
|
class TaskStatusResults(BaseModel):
|
||||||
|
videos: list[TaskStatusVideoResult] | None = Field(None)
|
||||||
|
images: list[TaskStatusImageResult] | None = Field(None)
|
||||||
|
|
||||||
|
|
||||||
|
class TaskStatusResponseData(BaseModel):
|
||||||
|
created_at: int | None = Field(None, description="Task creation time")
|
||||||
|
updated_at: int | None = Field(None, description="Task update time")
|
||||||
|
task_status: str | None = None
|
||||||
|
task_status_msg: str | None = Field(None, description="Additional failure reason. Only for polling endpoint.")
|
||||||
|
task_id: str | None = Field(None, description="Task ID")
|
||||||
|
task_result: TaskStatusResults | None = Field(None)
|
||||||
|
|
||||||
|
|
||||||
|
class TaskStatusResponse(BaseModel):
|
||||||
|
code: int | None = Field(None, description="Error code")
|
||||||
|
message: str | None = Field(None, description="Error message")
|
||||||
|
request_id: str | None = Field(None, description="Request ID")
|
||||||
|
data: TaskStatusResponseData | None = Field(None)
|
||||||
|
|
||||||
|
|
||||||
|
class OmniImageParamImage(BaseModel):
|
||||||
|
image: str = Field(...)
|
||||||
|
|
||||||
|
|
||||||
|
class OmniProImageRequest(BaseModel):
|
||||||
|
model_name: str = Field(..., description="kling-image-o1")
|
||||||
|
resolution: str = Field(..., description="'1k' or '2k'")
|
||||||
|
aspect_ratio: str | None = Field(...)
|
||||||
|
prompt: str = Field(...)
|
||||||
|
mode: str = Field("pro")
|
||||||
|
n: int | None = Field(1, le=9)
|
||||||
|
image_list: list[OmniImageParamImage] | None = Field(..., max_length=10)
|
||||||
|
|
||||||
|
|
||||||
|
class TextToVideoWithAudioRequest(BaseModel):
|
||||||
|
model_name: str = Field(..., description="kling-v2-6")
|
||||||
|
aspect_ratio: str = Field(..., description="'16:9', '9:16' or '1:1'")
|
||||||
|
duration: str = Field(..., description="'5' or '10'")
|
||||||
|
prompt: str = Field(...)
|
||||||
|
mode: str = Field("pro")
|
||||||
|
sound: str = Field(..., description="'on' or 'off'")
|
||||||
|
|
||||||
|
|
||||||
|
class ImageToVideoWithAudioRequest(BaseModel):
|
||||||
|
model_name: str = Field(..., description="kling-v2-6")
|
||||||
|
image: str = Field(...)
|
||||||
|
duration: str = Field(..., description="'5' or '10'")
|
||||||
|
prompt: str = Field(...)
|
||||||
|
mode: str = Field("pro")
|
||||||
|
sound: str = Field(..., description="'on' or 'off'")
|
||||||
@@ -1,100 +0,0 @@
-from typing import Optional
-from enum import Enum
-from pydantic import BaseModel, Field
-
-
-class Pikaffect(str, Enum):
-    Cake_ify = "Cake-ify"
-    Crumble = "Crumble"
-    Crush = "Crush"
-    Decapitate = "Decapitate"
-    Deflate = "Deflate"
-    Dissolve = "Dissolve"
-    Explode = "Explode"
-    Eye_pop = "Eye-pop"
-    Inflate = "Inflate"
-    Levitate = "Levitate"
-    Melt = "Melt"
-    Peel = "Peel"
-    Poke = "Poke"
-    Squish = "Squish"
-    Ta_da = "Ta-da"
-    Tear = "Tear"
-
-
-class PikaBodyGenerate22C2vGenerate22PikascenesPost(BaseModel):
-    aspectRatio: Optional[float] = Field(None, description='Aspect ratio (width / height)')
-    duration: Optional[int] = Field(5)
-    ingredientsMode: str = Field(...)
-    negativePrompt: Optional[str] = Field(None)
-    promptText: Optional[str] = Field(None)
-    resolution: Optional[str] = Field('1080p')
-    seed: Optional[int] = Field(None)
-
-
-class PikaGenerateResponse(BaseModel):
-    video_id: str = Field(...)
-
-
-class PikaBodyGenerate22I2vGenerate22I2vPost(BaseModel):
-    duration: Optional[int] = 5
-    negativePrompt: Optional[str] = Field(None)
-    promptText: Optional[str] = Field(None)
-    resolution: Optional[str] = '1080p'
-    seed: Optional[int] = Field(None)
-
-
-class PikaBodyGenerate22KeyframeGenerate22PikaframesPost(BaseModel):
-    duration: Optional[int] = Field(None, ge=5, le=10)
-    negativePrompt: Optional[str] = Field(None)
-    promptText: str = Field(...)
-    resolution: Optional[str] = '1080p'
-    seed: Optional[int] = Field(None)
-
-
-class PikaBodyGenerate22T2vGenerate22T2vPost(BaseModel):
-    aspectRatio: Optional[float] = Field(
-        1.7777777777777777,
-        description='Aspect ratio (width / height)',
-        ge=0.4,
-        le=2.5,
-    )
-    duration: Optional[int] = 5
-    negativePrompt: Optional[str] = Field(None)
-    promptText: str = Field(...)
-    resolution: Optional[str] = '1080p'
-    seed: Optional[int] = Field(None)
-
-
-class PikaBodyGeneratePikadditionsGeneratePikadditionsPost(BaseModel):
-    negativePrompt: Optional[str] = Field(None)
-    promptText: Optional[str] = Field(None)
-    seed: Optional[int] = Field(None)
-
-
-class PikaBodyGeneratePikaffectsGeneratePikaffectsPost(BaseModel):
-    negativePrompt: Optional[str] = Field(None)
-    pikaffect: Optional[str] = None
-    promptText: Optional[str] = Field(None)
-    seed: Optional[int] = Field(None)
-
-
-class PikaBodyGeneratePikaswapsGeneratePikaswapsPost(BaseModel):
-    negativePrompt: Optional[str] = Field(None)
-    promptText: Optional[str] = Field(None)
-    seed: Optional[int] = Field(None)
-    modifyRegionRoi: Optional[str] = Field(None)
-
-
-class PikaStatusEnum(str, Enum):
-    queued = "queued"
-    started = "started"
-    finished = "finished"
-    failed = "failed"
-
-
-class PikaVideoResponse(BaseModel):
-    id: str = Field(...)
-    progress: Optional[int] = Field(None)
-    status: PikaStatusEnum
-    url: Optional[str] = Field(None)
@@ -5,11 +5,17 @@ from typing import Optional, List, Dict, Any, Union
 from pydantic import BaseModel, Field, RootModel


 class TripoModelVersion(str, Enum):
+    v3_0_20250812 = 'v3.0-20250812'
     v2_5_20250123 = 'v2.5-20250123'
     v2_0_20240919 = 'v2.0-20240919'
     v1_4_20240625 = 'v1.4-20240625'


+class TripoGeometryQuality(str, Enum):
+    standard = 'standard'
+    detailed = 'detailed'
+
+
 class TripoTextureQuality(str, Enum):
     standard = 'standard'
     detailed = 'detailed'
@@ -61,14 +67,20 @@ class TripoSpec(str, Enum):
 class TripoAnimation(str, Enum):
     IDLE = "preset:idle"
     WALK = "preset:walk"
+    RUN = "preset:run"
+    DIVE = "preset:dive"
     CLIMB = "preset:climb"
     JUMP = "preset:jump"
-    RUN = "preset:run"
     SLASH = "preset:slash"
     SHOOT = "preset:shoot"
     HURT = "preset:hurt"
     FALL = "preset:fall"
     TURN = "preset:turn"
+    QUADRUPED_WALK = "preset:quadruped:walk"
+    HEXAPOD_WALK = "preset:hexapod:walk"
+    OCTOPOD_WALK = "preset:octopod:walk"
+    SERPENTINE_MARCH = "preset:serpentine:march"
+    AQUATIC_MARCH = "preset:aquatic:march"

 class TripoStylizeStyle(str, Enum):
     LEGO = "lego"
@@ -105,6 +117,11 @@ class TripoTaskStatus(str, Enum):
     BANNED = "banned"
     EXPIRED = "expired"

+class TripoFbxPreset(str, Enum):
+    BLENDER = "blender"
+    MIXAMO = "mixamo"
+    _3DSMAX = "3dsmax"
+
 class TripoFileTokenReference(BaseModel):
     type: Optional[str] = Field(None, description='The type of the reference')
     file_token: str
@@ -142,6 +159,7 @@ class TripoTextToModelRequest(BaseModel):
     model_seed: Optional[int] = Field(None, description='The seed for the model')
     texture_seed: Optional[int] = Field(None, description='The seed for the texture')
     texture_quality: Optional[TripoTextureQuality] = TripoTextureQuality.standard
+    geometry_quality: Optional[TripoGeometryQuality] = TripoGeometryQuality.standard
     style: Optional[TripoStyle] = None
     auto_size: Optional[bool] = Field(False, description='Whether to auto-size the model')
     quad: Optional[bool] = Field(False, description='Whether to apply quad to the generated model')
@@ -156,6 +174,7 @@ class TripoImageToModelRequest(BaseModel):
     model_seed: Optional[int] = Field(None, description='The seed for the model')
     texture_seed: Optional[int] = Field(None, description='The seed for the texture')
     texture_quality: Optional[TripoTextureQuality] = TripoTextureQuality.standard
+    geometry_quality: Optional[TripoGeometryQuality] = TripoGeometryQuality.standard
     texture_alignment: Optional[TripoTextureAlignment] = Field(TripoTextureAlignment.ORIGINAL_IMAGE, description='The texture alignment method')
     style: Optional[TripoStyle] = Field(None, description='The style to apply to the generated model')
     auto_size: Optional[bool] = Field(False, description='Whether to auto-size the model')
@@ -173,6 +192,7 @@ class TripoMultiviewToModelRequest(BaseModel):
     model_seed: Optional[int] = Field(None, description='The seed for the model')
     texture_seed: Optional[int] = Field(None, description='The seed for the texture')
     texture_quality: Optional[TripoTextureQuality] = TripoTextureQuality.standard
+    geometry_quality: Optional[TripoGeometryQuality] = TripoGeometryQuality.standard
     texture_alignment: Optional[TripoTextureAlignment] = TripoTextureAlignment.ORIGINAL_IMAGE
     auto_size: Optional[bool] = Field(False, description='Whether to auto-size the model')
     orientation: Optional[TripoOrientation] = Field(TripoOrientation.DEFAULT, description='The orientation for the model')
@@ -219,14 +239,24 @@ class TripoConvertModelRequest(BaseModel):
     type: TripoTaskType = Field(TripoTaskType.CONVERT_MODEL, description='Type of task')
     format: TripoConvertFormat = Field(..., description='The format to convert to')
     original_model_task_id: str = Field(..., description='The task ID of the original model')
-    quad: Optional[bool] = Field(False, description='Whether to apply quad to the model')
-    force_symmetry: Optional[bool] = Field(False, description='Whether to force symmetry')
-    face_limit: Optional[int] = Field(10000, description='The number of faces to limit the conversion to')
-    flatten_bottom: Optional[bool] = Field(False, description='Whether to flatten the bottom of the model')
-    flatten_bottom_threshold: Optional[float] = Field(0.01, description='The threshold for flattening the bottom')
-    texture_size: Optional[int] = Field(4096, description='The size of the texture')
+    quad: Optional[bool] = Field(None, description='Whether to apply quad to the model')
+    force_symmetry: Optional[bool] = Field(None, description='Whether to force symmetry')
+    face_limit: Optional[int] = Field(None, description='The number of faces to limit the conversion to')
+    flatten_bottom: Optional[bool] = Field(None, description='Whether to flatten the bottom of the model')
+    flatten_bottom_threshold: Optional[float] = Field(None, description='The threshold for flattening the bottom')
+    texture_size: Optional[int] = Field(None, description='The size of the texture')
     texture_format: Optional[TripoTextureFormat] = Field(TripoTextureFormat.JPEG, description='The format of the texture')
-    pivot_to_center_bottom: Optional[bool] = Field(False, description='Whether to pivot to the center bottom')
+    pivot_to_center_bottom: Optional[bool] = Field(None, description='Whether to pivot to the center bottom')
+    scale_factor: Optional[float] = Field(None, description='The scale factor for the model')
+    with_animation: Optional[bool] = Field(None, description='Whether to include animations')
+    pack_uv: Optional[bool] = Field(None, description='Whether to pack the UVs')
+    bake: Optional[bool] = Field(None, description='Whether to bake the model')
+    part_names: Optional[List[str]] = Field(None, description='The names of the parts to include')
+    fbx_preset: Optional[TripoFbxPreset] = Field(None, description='The preset for the FBX export')
+    export_vertex_colors: Optional[bool] = Field(None, description='Whether to export the vertex colors')
+    export_orientation: Optional[TripoOrientation] = Field(None, description='The orientation for the export')
+    animate_in_place: Optional[bool] = Field(None, description='Whether to animate in place')


 class TripoTaskRequest(RootModel):
     root: Union[
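
With the convert-model fields now defaulting to None, only the parameters a caller actually sets are sent. A hedged sketch of a conversion request that uses the new FBX preset (the TripoConvertFormat member name below is assumed for illustration and not taken from this diff):

# Hypothetical sketch: FBX export tuned for Blender via the new TripoFbxPreset enum.
convert_req = TripoConvertModelRequest(
    format=TripoConvertFormat.FBX,       # assumed member name, for illustration only
    original_model_task_id="task-123",   # placeholder task id
    fbx_preset=TripoFbxPreset.BLENDER,
    export_vertex_colors=True,
)
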
@@ -1,34 +1,21 @@
-from typing import Optional, Union
-from enum import Enum
+from typing import Optional

 from pydantic import BaseModel, Field


-class Image2(BaseModel):
-    bytesBase64Encoded: str
-    gcsUri: Optional[str] = None
-    mimeType: Optional[str] = None
+class VeoRequestInstanceImage(BaseModel):
+    bytesBase64Encoded: str | None = Field(None)
+    gcsUri: str | None = Field(None)
+    mimeType: str | None = Field(None)


-class Image3(BaseModel):
-    bytesBase64Encoded: Optional[str] = None
-    gcsUri: str
-    mimeType: Optional[str] = None
-
-
-class Instance1(BaseModel):
-    image: Optional[Union[Image2, Image3]] = Field(
-        None, description='Optional image to guide video generation'
-    )
+class VeoRequestInstance(BaseModel):
+    image: VeoRequestInstanceImage | None = Field(None)
+    lastFrame: VeoRequestInstanceImage | None = Field(None)
     prompt: str = Field(..., description='Text description of the video')


-class PersonGeneration1(str, Enum):
-    ALLOW = 'ALLOW'
-    BLOCK = 'BLOCK'
-
-
-class Parameters1(BaseModel):
+class VeoRequestParameters(BaseModel):
     aspectRatio: Optional[str] = Field(None, examples=['16:9'])
     durationSeconds: Optional[int] = None
     enhancePrompt: Optional[bool] = None
@@ -37,17 +24,18 @@ class Parameters1(BaseModel):
         description='Generate audio for the video. Only supported by veo 3 models.',
     )
     negativePrompt: Optional[str] = None
-    personGeneration: Optional[PersonGeneration1] = None
+    personGeneration: str | None = Field(None, description="ALLOW or BLOCK")
     sampleCount: Optional[int] = None
     seed: Optional[int] = None
     storageUri: Optional[str] = Field(
         None, description='Optional Cloud Storage URI to upload the video'
     )
+    resolution: str | None = Field(None)


 class VeoGenVidRequest(BaseModel):
-    instances: Optional[list[Instance1]] = None
-    parameters: Optional[Parameters1] = None
+    instances: list[VeoRequestInstance] | None = Field(None)
+    parameters: VeoRequestParameters | None = Field(None)


 class VeoGenVidResponse(BaseModel):
@@ -97,7 +85,7 @@ class Response1(BaseModel):
     raiMediaFilteredReasons: Optional[list[str]] = Field(
         None, description='Reasons why media was filtered by responsible AI policies'
     )
-    videos: Optional[list[Video]] = None
+    videos: Optional[list[Video]] = Field(None)


 class VeoGenVidPollResponse(BaseModel):
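
Under the renamed Veo models, a generation request is just nested pydantic objects. A minimal sketch with made-up values (the field names follow the models above; the concrete parameter values are illustrative only):

# Hypothetical sketch: one instance with a guiding image, plus generation parameters.
veo_request = VeoGenVidRequest(
    instances=[
        VeoRequestInstance(
            prompt="timelapse of clouds over a mountain ridge",
            image=VeoRequestInstanceImage(bytesBase64Encoded="<base64 PNG>", mimeType="image/png"),
        )
    ],
    parameters=VeoRequestParameters(aspectRatio="16:9", sampleCount=1, resolution="1080p"),
)
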
@@ -1,13 +1,27 @@
 import logging
 import math
-from enum import Enum
-from typing import Literal, Optional, Union

 import torch
-from pydantic import BaseModel, Field
 from typing_extensions import override

-from comfy_api.latest import IO, ComfyExtension
+from comfy_api.latest import IO, ComfyExtension, Input
+from comfy_api_nodes.apis.bytedance_api import (
+    RECOMMENDED_PRESETS,
+    RECOMMENDED_PRESETS_SEEDREAM_4,
+    VIDEO_TASKS_EXECUTION_TIME,
+    Image2ImageTaskCreationRequest,
+    Image2VideoTaskCreationRequest,
+    ImageTaskCreationResponse,
+    Seedream4Options,
+    Seedream4TaskCreationRequest,
+    TaskCreationResponse,
+    TaskImageContent,
+    TaskImageContentUrl,
+    TaskStatusResponse,
+    TaskTextContent,
+    Text2ImageTaskCreationRequest,
+    Text2VideoTaskCreationRequest,
+)
 from comfy_api_nodes.util import (
     ApiEndpoint,
     download_url_to_image_tensor,
@@ -29,162 +43,6 @@ BYTEPLUS_TASK_ENDPOINT = "/proxy/byteplus/api/v3/contents/generations/tasks"
 BYTEPLUS_TASK_STATUS_ENDPOINT = "/proxy/byteplus/api/v3/contents/generations/tasks"  # + /{task_id}


-class Text2ImageModelName(str, Enum):
-    seedream_3 = "seedream-3-0-t2i-250415"
-
-
-class Image2ImageModelName(str, Enum):
-    seededit_3 = "seededit-3-0-i2i-250628"
-
-
-class Text2VideoModelName(str, Enum):
-    seedance_1_pro = "seedance-1-0-pro-250528"
-    seedance_1_lite = "seedance-1-0-lite-t2v-250428"
-
-
-class Image2VideoModelName(str, Enum):
-    """note(August 31): Pro model only supports FirstFrame: https://docs.byteplus.com/en/docs/ModelArk/1520757"""
-
-    seedance_1_pro = "seedance-1-0-pro-250528"
-    seedance_1_lite = "seedance-1-0-lite-i2v-250428"
-
-
-class Text2ImageTaskCreationRequest(BaseModel):
-    model: Text2ImageModelName = Text2ImageModelName.seedream_3
-    prompt: str = Field(...)
-    response_format: Optional[str] = Field("url")
-    size: Optional[str] = Field(None)
-    seed: Optional[int] = Field(0, ge=0, le=2147483647)
-    guidance_scale: Optional[float] = Field(..., ge=1.0, le=10.0)
-    watermark: Optional[bool] = Field(True)
-
-
-class Image2ImageTaskCreationRequest(BaseModel):
-    model: Image2ImageModelName = Image2ImageModelName.seededit_3
-    prompt: str = Field(...)
-    response_format: Optional[str] = Field("url")
-    image: str = Field(..., description="Base64 encoded string or image URL")
-    size: Optional[str] = Field("adaptive")
-    seed: Optional[int] = Field(..., ge=0, le=2147483647)
-    guidance_scale: Optional[float] = Field(..., ge=1.0, le=10.0)
-    watermark: Optional[bool] = Field(True)
-
-
-class Seedream4Options(BaseModel):
-    max_images: int = Field(15)
-
-
-class Seedream4TaskCreationRequest(BaseModel):
-    model: str = Field("seedream-4-0-250828")
-    prompt: str = Field(...)
-    response_format: str = Field("url")
-    image: Optional[list[str]] = Field(None, description="Image URLs")
-    size: str = Field(...)
-    seed: int = Field(..., ge=0, le=2147483647)
-    sequential_image_generation: str = Field("disabled")
-    sequential_image_generation_options: Seedream4Options = Field(Seedream4Options(max_images=15))
-    watermark: bool = Field(True)
-
-
-class ImageTaskCreationResponse(BaseModel):
-    model: str = Field(...)
-    created: int = Field(..., description="Unix timestamp (in seconds) indicating time when the request was created.")
-    data: list = Field([], description="Contains information about the generated image(s).")
-    error: dict = Field({}, description="Contains `code` and `message` fields in case of error.")
-
-
-class TaskTextContent(BaseModel):
-    type: str = Field("text")
-    text: str = Field(...)
-
-
-class TaskImageContentUrl(BaseModel):
-    url: str = Field(...)
-
-
-class TaskImageContent(BaseModel):
-    type: str = Field("image_url")
-    image_url: TaskImageContentUrl = Field(...)
-    role: Optional[Literal["first_frame", "last_frame", "reference_image"]] = Field(None)
-
-
-class Text2VideoTaskCreationRequest(BaseModel):
-    model: Text2VideoModelName = Text2VideoModelName.seedance_1_pro
-    content: list[TaskTextContent] = Field(..., min_length=1)
-
-
-class Image2VideoTaskCreationRequest(BaseModel):
-    model: Image2VideoModelName = Image2VideoModelName.seedance_1_pro
-    content: list[Union[TaskTextContent, TaskImageContent]] = Field(..., min_length=2)
-
-
-class TaskCreationResponse(BaseModel):
-    id: str = Field(...)
-
-
-class TaskStatusError(BaseModel):
-    code: str = Field(...)
-    message: str = Field(...)
-
-
-class TaskStatusResult(BaseModel):
-    video_url: str = Field(...)
-
-
-class TaskStatusResponse(BaseModel):
-    id: str = Field(...)
-    model: str = Field(...)
-    status: Literal["queued", "running", "cancelled", "succeeded", "failed"] = Field(...)
-    error: Optional[TaskStatusError] = Field(None)
-    content: Optional[TaskStatusResult] = Field(None)
-
-
-RECOMMENDED_PRESETS = [
-    ("1024x1024 (1:1)", 1024, 1024),
-    ("864x1152 (3:4)", 864, 1152),
-    ("1152x864 (4:3)", 1152, 864),
-    ("1280x720 (16:9)", 1280, 720),
-    ("720x1280 (9:16)", 720, 1280),
-    ("832x1248 (2:3)", 832, 1248),
-    ("1248x832 (3:2)", 1248, 832),
-    ("1512x648 (21:9)", 1512, 648),
-    ("2048x2048 (1:1)", 2048, 2048),
-    ("Custom", None, None),
-]
-
-RECOMMENDED_PRESETS_SEEDREAM_4 = [
-    ("2048x2048 (1:1)", 2048, 2048),
-    ("2304x1728 (4:3)", 2304, 1728),
-    ("1728x2304 (3:4)", 1728, 2304),
-    ("2560x1440 (16:9)", 2560, 1440),
-    ("1440x2560 (9:16)", 1440, 2560),
-    ("2496x1664 (3:2)", 2496, 1664),
-    ("1664x2496 (2:3)", 1664, 2496),
-    ("3024x1296 (21:9)", 3024, 1296),
-    ("4096x4096 (1:1)", 4096, 4096),
-    ("Custom", None, None),
-]
-
-# The time in this dictionary are given for 10 seconds duration.
-VIDEO_TASKS_EXECUTION_TIME = {
-    "seedance-1-0-lite-t2v-250428": {
-        "480p": 40,
-        "720p": 60,
-        "1080p": 90,
-    },
-    "seedance-1-0-lite-i2v-250428": {
-        "480p": 40,
-        "720p": 60,
-        "1080p": 90,
-    },
-    "seedance-1-0-pro-250528": {
-        "480p": 70,
-        "720p": 85,
-        "1080p": 115,
-    },
-}
-
-
 def get_image_url_from_response(response: ImageTaskCreationResponse) -> str:
     if response.error:
         error_msg = f"ByteDance request failed. Code: {response.error['code']}, message: {response.error['message']}"
@@ -194,13 +52,6 @@ def get_image_url_from_response(response: ImageTaskCreationResponse) -> str:
     return response.data[0]["url"]


-def get_video_url_from_task_status(response: TaskStatusResponse) -> Union[str, None]:
-    """Returns the video URL from the task status response if it exists."""
-    if hasattr(response, "content") and response.content:
-        return response.content.video_url
-    return None
-
-
 class ByteDanceImageNode(IO.ComfyNode):

     @classmethod
@@ -211,12 +62,7 @@ class ByteDanceImageNode(IO.ComfyNode):
             category="api node/image/ByteDance",
             description="Generate images using ByteDance models via api based on prompt",
             inputs=[
-                IO.Combo.Input(
-                    "model",
-                    options=Text2ImageModelName,
-                    default=Text2ImageModelName.seedream_3,
-                    tooltip="Model name",
-                ),
+                IO.Combo.Input("model", options=["seedream-3-0-t2i-250415"]),
                 IO.String.Input(
                     "prompt",
                     multiline=True,
@@ -335,12 +181,7 @@ class ByteDanceImageEditNode(IO.ComfyNode):
             category="api node/image/ByteDance",
             description="Edit images using ByteDance models via api based on prompt",
             inputs=[
-                IO.Combo.Input(
-                    "model",
-                    options=Image2ImageModelName,
-                    default=Image2ImageModelName.seededit_3,
-                    tooltip="Model name",
-                ),
+                IO.Combo.Input("model", options=["seededit-3-0-i2i-250628"]),
                 IO.Image.Input(
                     "image",
                     tooltip="The base image to edit",
@@ -394,7 +235,7 @@ class ByteDanceImageEditNode(IO.ComfyNode):
     async def execute(
         cls,
         model: str,
-        image: torch.Tensor,
+        image: Input.Image,
         prompt: str,
         seed: int,
         guidance_scale: float,
@@ -434,7 +275,7 @@ class ByteDanceSeedreamNode(IO.ComfyNode):
             inputs=[
                 IO.Combo.Input(
                     "model",
-                    options=["seedream-4-0-250828"],
+                    options=["seedream-4-5-251128", "seedream-4-0-250828"],
                     tooltip="Model name",
                 ),
                 IO.String.Input(
@@ -459,7 +300,7 @@ class ByteDanceSeedreamNode(IO.ComfyNode):
                     default=2048,
                     min=1024,
                     max=4096,
-                    step=64,
+                    step=8,
                     tooltip="Custom width for image. Value is working only if `size_preset` is set to `Custom`",
                     optional=True,
                 ),
@@ -468,7 +309,7 @@ class ByteDanceSeedreamNode(IO.ComfyNode):
                     default=2048,
                     min=1024,
                     max=4096,
-                    step=64,
+                    step=8,
                     tooltip="Custom height for image. Value is working only if `size_preset` is set to `Custom`",
                     optional=True,
                 ),
@@ -532,7 +373,7 @@ class ByteDanceSeedreamNode(IO.ComfyNode):
         cls,
         model: str,
         prompt: str,
-        image: torch.Tensor = None,
+        image: Input.Image | None = None,
         size_preset: str = RECOMMENDED_PRESETS_SEEDREAM_4[0][0],
         width: int = 2048,
         height: int = 2048,
@@ -555,6 +396,18 @@ class ByteDanceSeedreamNode(IO.ComfyNode):
             raise ValueError(
                 f"Custom size out of range: {w}x{h}. " "Both width and height must be between 1024 and 4096 pixels."
             )
+        out_num_pixels = w * h
+        mp_provided = out_num_pixels / 1_000_000.0
+        if "seedream-4-5" in model and out_num_pixels < 3686400:
+            raise ValueError(
+                f"Minimum image resolution that Seedream 4.5 can generate is 3.68MP, "
+                f"but {mp_provided:.2f}MP provided."
+            )
+        if "seedream-4-0" in model and out_num_pixels < 921600:
+            raise ValueError(
+                f"Minimum image resolution that the selected model can generate is 0.92MP, "
+                f"but {mp_provided:.2f}MP provided."
+            )
         n_input_images = get_number_of_images(image) if image is not None else 0
         if n_input_images > 10:
             raise ValueError(f"Maximum of 10 reference images are supported, but {n_input_images} received.")
@@ -607,9 +460,8 @@ class ByteDanceTextToVideoNode(IO.ComfyNode):
             inputs=[
                 IO.Combo.Input(
                     "model",
-                    options=Text2VideoModelName,
-                    default=Text2VideoModelName.seedance_1_pro,
-                    tooltip="Model name",
+                    options=["seedance-1-0-pro-250528", "seedance-1-0-lite-t2v-250428", "seedance-1-0-pro-fast-251015"],
+                    default="seedance-1-0-pro-fast-251015",
                 ),
                 IO.String.Input(
                     "prompt",
@@ -714,9 +566,8 @@ class ByteDanceImageToVideoNode(IO.ComfyNode):
             inputs=[
                 IO.Combo.Input(
                     "model",
-                    options=Image2VideoModelName,
-                    default=Image2VideoModelName.seedance_1_pro,
-                    tooltip="Model name",
+                    options=["seedance-1-0-pro-250528", "seedance-1-0-lite-t2v-250428", "seedance-1-0-pro-fast-251015"],
+                    default="seedance-1-0-pro-fast-251015",
                 ),
                 IO.String.Input(
                     "prompt",
@@ -787,7 +638,7 @@ class ByteDanceImageToVideoNode(IO.ComfyNode):
         cls,
         model: str,
         prompt: str,
-        image: torch.Tensor,
+        image: Input.Image,
         resolution: str,
         aspect_ratio: str,
         duration: int,
@@ -833,9 +684,8 @@ class ByteDanceFirstLastFrameNode(IO.ComfyNode):
             inputs=[
                 IO.Combo.Input(
                     "model",
-                    options=[model.value for model in Image2VideoModelName],
-                    default=Image2VideoModelName.seedance_1_lite.value,
-                    tooltip="Model name",
+                    options=["seedance-1-0-pro-250528", "seedance-1-0-lite-i2v-250428"],
+                    default="seedance-1-0-lite-i2v-250428",
                 ),
                 IO.String.Input(
                     "prompt",
@@ -910,8 +760,8 @@ class ByteDanceFirstLastFrameNode(IO.ComfyNode):
         cls,
         model: str,
         prompt: str,
-        first_frame: torch.Tensor,
-        last_frame: torch.Tensor,
+        first_frame: Input.Image,
+        last_frame: Input.Image,
         resolution: str,
         aspect_ratio: str,
         duration: int,
@@ -968,9 +818,8 @@ class ByteDanceImageReferenceNode(IO.ComfyNode):
             inputs=[
                 IO.Combo.Input(
                     "model",
-                    options=[Image2VideoModelName.seedance_1_lite.value],
-                    default=Image2VideoModelName.seedance_1_lite.value,
-                    tooltip="Model name",
+                    options=["seedance-1-0-pro-250528", "seedance-1-0-lite-i2v-250428"],
+                    default="seedance-1-0-lite-i2v-250428",
                 ),
                 IO.String.Input(
                     "prompt",
@@ -1034,7 +883,7 @@ class ByteDanceImageReferenceNode(IO.ComfyNode):
         cls,
         model: str,
         prompt: str,
-        images: torch.Tensor,
+        images: Input.Image,
         resolution: str,
         aspect_ratio: str,
         duration: int,
@@ -1069,8 +918,8 @@ class ByteDanceImageReferenceNode(IO.ComfyNode):

 async def process_video_task(
     cls: type[IO.ComfyNode],
-    payload: Union[Text2VideoTaskCreationRequest, Image2VideoTaskCreationRequest],
-    estimated_duration: Optional[int],
+    payload: Text2VideoTaskCreationRequest | Image2VideoTaskCreationRequest,
+    estimated_duration: int | None,
 ) -> IO.NodeOutput:
     initial_response = await sync_op(
         cls,
@@ -1085,7 +934,7 @@ async def process_video_task(
         estimated_duration=estimated_duration,
         response_model=TaskStatusResponse,
     )
-    return IO.NodeOutput(await download_url_to_video_output(get_video_url_from_task_status(response)))
+    return IO.NodeOutput(await download_url_to_video_output(response.content.video_url))


 def raise_if_text_params(prompt: str, text_params: list[str]) -> None:
|||||||
@ -4,10 +4,7 @@ See: https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/infer
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import base64
|
import base64
|
||||||
import json
|
|
||||||
import os
|
import os
|
||||||
import time
|
|
||||||
import uuid
|
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from typing import Literal
|
from typing import Literal
|
||||||
@ -16,10 +13,10 @@ import torch
|
|||||||
from typing_extensions import override
|
from typing_extensions import override
|
||||||
|
|
||||||
import folder_paths
|
import folder_paths
|
||||||
from comfy_api.latest import IO, ComfyExtension, Input
|
from comfy_api.latest import IO, ComfyExtension, Input, Types
|
||||||
from comfy_api.util import VideoCodec, VideoContainer
|
|
||||||
from comfy_api_nodes.apis.gemini_api import (
|
from comfy_api_nodes.apis.gemini_api import (
|
||||||
GeminiContent,
|
GeminiContent,
|
||||||
|
GeminiFileData,
|
||||||
GeminiGenerateContentRequest,
|
GeminiGenerateContentRequest,
|
||||||
GeminiGenerateContentResponse,
|
GeminiGenerateContentResponse,
|
||||||
GeminiImageConfig,
|
GeminiImageConfig,
|
||||||
@ -29,6 +26,8 @@ from comfy_api_nodes.apis.gemini_api import (
|
|||||||
GeminiMimeType,
|
GeminiMimeType,
|
||||||
GeminiPart,
|
GeminiPart,
|
||||||
GeminiRole,
|
GeminiRole,
|
||||||
|
GeminiSystemInstructionContent,
|
||||||
|
GeminiTextPart,
|
||||||
Modality,
|
Modality,
|
||||||
)
|
)
|
||||||
from comfy_api_nodes.util import (
|
from comfy_api_nodes.util import (
|
||||||
@ -38,13 +37,21 @@ from comfy_api_nodes.util import (
|
|||||||
get_number_of_images,
|
get_number_of_images,
|
||||||
sync_op,
|
sync_op,
|
||||||
tensor_to_base64_string,
|
tensor_to_base64_string,
|
||||||
|
upload_images_to_comfyapi,
|
||||||
validate_string,
|
validate_string,
|
||||||
video_to_base64_string,
|
video_to_base64_string,
|
||||||
)
|
)
|
||||||
from server import PromptServer
|
|
||||||
|
|
||||||
GEMINI_BASE_ENDPOINT = "/proxy/vertexai/gemini"
|
GEMINI_BASE_ENDPOINT = "/proxy/vertexai/gemini"
|
||||||
GEMINI_MAX_INPUT_FILE_SIZE = 20 * 1024 * 1024 # 20 MB
|
GEMINI_MAX_INPUT_FILE_SIZE = 20 * 1024 * 1024 # 20 MB
|
||||||
|
GEMINI_IMAGE_SYS_PROMPT = (
|
||||||
|
"You are an expert image-generation engine. You must ALWAYS produce an image.\n"
|
||||||
|
"Interpret all user input—regardless of "
|
||||||
|
"format, intent, or abstraction—as literal visual directives for image composition.\n"
|
||||||
|
"If a prompt is conversational or lacks specific visual details, "
|
||||||
|
"you must creatively invent a concrete visual scenario that depicts the concept.\n"
|
||||||
|
"Prioritize generating the visual representation above any text, formatting, or conversational requests."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class GeminiModel(str, Enum):
|
class GeminiModel(str, Enum):
|
||||||
@ -68,24 +75,43 @@ class GeminiImageModel(str, Enum):
|
|||||||
gemini_2_5_flash_image = "gemini-2.5-flash-image"
|
gemini_2_5_flash_image = "gemini-2.5-flash-image"
|
||||||
|
|
||||||
|
|
||||||
def create_image_parts(image_input: torch.Tensor) -> list[GeminiPart]:
|
async def create_image_parts(
|
||||||
"""
|
cls: type[IO.ComfyNode],
|
||||||
Convert image tensor input to Gemini API compatible parts.
|
images: Input.Image,
|
||||||
|
image_limit: int = 0,
|
||||||
Args:
|
) -> list[GeminiPart]:
|
||||||
image_input: Batch of image tensors from ComfyUI.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of GeminiPart objects containing the encoded images.
|
|
||||||
"""
|
|
||||||
image_parts: list[GeminiPart] = []
|
image_parts: list[GeminiPart] = []
|
||||||
for image_index in range(image_input.shape[0]):
|
if image_limit < 0:
|
||||||
image_as_b64 = tensor_to_base64_string(image_input[image_index].unsqueeze(0))
|
raise ValueError("image_limit must be greater than or equal to 0 when creating Gemini image parts.")
|
||||||
|
total_images = get_number_of_images(images)
|
||||||
|
if total_images <= 0:
|
||||||
|
raise ValueError("No images provided to create_image_parts; at least one image is required.")
|
||||||
|
|
||||||
|
# If image_limit == 0 --> use all images; otherwise clamp to image_limit.
|
||||||
|
effective_max = total_images if image_limit == 0 else min(total_images, image_limit)
|
||||||
|
|
||||||
|
# Number of images we'll send as URLs (fileData)
|
||||||
|
num_url_images = min(effective_max, 10) # Vertex API max number of image links
|
||||||
|
reference_images_urls = await upload_images_to_comfyapi(
|
||||||
|
cls,
|
||||||
|
images,
|
||||||
|
max_images=num_url_images,
|
||||||
|
)
|
||||||
|
for reference_image_url in reference_images_urls:
|
||||||
|
image_parts.append(
|
||||||
|
GeminiPart(
|
||||||
|
fileData=GeminiFileData(
|
||||||
|
mimeType=GeminiMimeType.image_png,
|
||||||
|
fileUri=reference_image_url,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
for idx in range(num_url_images, effective_max):
|
||||||
image_parts.append(
|
image_parts.append(
|
||||||
GeminiPart(
|
GeminiPart(
|
||||||
inlineData=GeminiInlineData(
|
inlineData=GeminiInlineData(
|
||||||
mimeType=GeminiMimeType.image_png,
|
mimeType=GeminiMimeType.image_png,
|
||||||
data=image_as_b64,
|
data=tensor_to_base64_string(images[idx]),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
@ -137,8 +163,8 @@ def get_text_from_response(response: GeminiGenerateContentResponse) -> str:
|
|||||||
return "\n".join([part.text for part in parts])
|
return "\n".join([part.text for part in parts])
|
||||||
|
|
||||||
|
|
||||||
def get_image_from_response(response: GeminiGenerateContentResponse) -> torch.Tensor:
|
def get_image_from_response(response: GeminiGenerateContentResponse) -> Input.Image:
|
||||||
image_tensors: list[torch.Tensor] = []
|
image_tensors: list[Input.Image] = []
|
||||||
parts = get_parts_by_type(response, "image/png")
|
parts = get_parts_by_type(response, "image/png")
|
||||||
for part in parts:
|
for part in parts:
|
||||||
image_data = base64.b64decode(part.inlineData.data)
|
image_data = base64.b64decode(part.inlineData.data)
|
||||||
@ -260,6 +286,13 @@ class GeminiNode(IO.ComfyNode):
|
|||||||
tooltip="Optional file(s) to use as context for the model. "
|
tooltip="Optional file(s) to use as context for the model. "
|
||||||
"Accepts inputs from the Gemini Generate Content Input Files node.",
|
"Accepts inputs from the Gemini Generate Content Input Files node.",
|
||||||
),
|
),
|
||||||
|
IO.String.Input(
|
||||||
|
"system_prompt",
|
||||||
|
multiline=True,
|
||||||
|
default="",
|
||||||
|
optional=True,
|
||||||
|
tooltip="Foundational instructions that dictate an AI's behavior.",
|
||||||
|
),
|
||||||
],
|
],
|
||||||
outputs=[
|
outputs=[
|
||||||
IO.String.Output(),
|
IO.String.Output(),
|
||||||
@ -276,7 +309,9 @@ class GeminiNode(IO.ComfyNode):
|
|||||||
def create_video_parts(cls, video_input: Input.Video) -> list[GeminiPart]:
|
def create_video_parts(cls, video_input: Input.Video) -> list[GeminiPart]:
|
||||||
"""Convert video input to Gemini API compatible parts."""
|
"""Convert video input to Gemini API compatible parts."""
|
||||||
|
|
||||||
base_64_string = video_to_base64_string(video_input, container_format=VideoContainer.MP4, codec=VideoCodec.H264)
|
base_64_string = video_to_base64_string(
|
||||||
|
video_input, container_format=Types.VideoContainer.MP4, codec=Types.VideoCodec.H264
|
||||||
|
)
|
||||||
return [
|
return [
|
||||||
GeminiPart(
|
GeminiPart(
|
||||||
inlineData=GeminiInlineData(
|
inlineData=GeminiInlineData(
|
||||||
@ -326,10 +361,11 @@ class GeminiNode(IO.ComfyNode):
|
|||||||
prompt: str,
|
prompt: str,
|
||||||
model: str,
|
model: str,
|
||||||
seed: int,
|
seed: int,
|
||||||
images: torch.Tensor | None = None,
|
images: Input.Image | None = None,
|
||||||
audio: Input.Audio | None = None,
|
audio: Input.Audio | None = None,
|
||||||
video: Input.Video | None = None,
|
video: Input.Video | None = None,
|
||||||
files: list[GeminiPart] | None = None,
|
files: list[GeminiPart] | None = None,
|
||||||
|
system_prompt: str = "",
|
||||||
) -> IO.NodeOutput:
|
) -> IO.NodeOutput:
|
||||||
validate_string(prompt, strip_whitespace=False)
|
validate_string(prompt, strip_whitespace=False)
|
||||||
|
|
||||||
@ -338,8 +374,7 @@ class GeminiNode(IO.ComfyNode):
|
|||||||
|
|
||||||
# Add other modal parts
|
# Add other modal parts
|
||||||
if images is not None:
|
if images is not None:
|
||||||
image_parts = create_image_parts(images)
|
parts.extend(await create_image_parts(cls, images))
|
||||||
parts.extend(image_parts)
|
|
||||||
if audio is not None:
|
if audio is not None:
|
||||||
parts.extend(cls.create_audio_parts(audio))
|
parts.extend(cls.create_audio_parts(audio))
|
||||||
if video is not None:
|
if video is not None:
|
||||||
@ -347,7 +382,10 @@ class GeminiNode(IO.ComfyNode):
|
|||||||
if files is not None:
|
if files is not None:
|
||||||
parts.extend(files)
|
parts.extend(files)
|
||||||
|
|
||||||
# Create response
|
gemini_system_prompt = None
|
||||||
|
if system_prompt:
|
||||||
|
gemini_system_prompt = GeminiSystemInstructionContent(parts=[GeminiTextPart(text=system_prompt)], role=None)
|
||||||
|
|
||||||
response = await sync_op(
|
response = await sync_op(
|
||||||
cls,
|
cls,
|
||||||
endpoint=ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model}", method="POST"),
|
endpoint=ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model}", method="POST"),
|
||||||
@ -357,36 +395,14 @@ class GeminiNode(IO.ComfyNode):
|
|||||||
role=GeminiRole.user,
|
role=GeminiRole.user,
|
||||||
parts=parts,
|
parts=parts,
|
||||||
)
|
)
|
||||||
]
|
],
|
||||||
|
systemInstruction=gemini_system_prompt,
|
||||||
),
|
),
|
||||||
response_model=GeminiGenerateContentResponse,
|
response_model=GeminiGenerateContentResponse,
|
||||||
price_extractor=calculate_tokens_price,
|
price_extractor=calculate_tokens_price,
|
||||||
)
|
)
|
||||||
|
|
||||||
output_text = get_text_from_response(response)
|
output_text = get_text_from_response(response)
|
||||||
if output_text:
|
|
||||||
# Not a true chat history like the OpenAI Chat node. It is emulated so the frontend can show a copy button.
|
|
||||||
render_spec = {
|
|
||||||
"node_id": cls.hidden.unique_id,
|
|
||||||
"component": "ChatHistoryWidget",
|
|
||||||
"props": {
|
|
||||||
"history": json.dumps(
|
|
||||||
[
|
|
||||||
{
|
|
||||||
"prompt": prompt,
|
|
||||||
"response": output_text,
|
|
||||||
"response_id": str(uuid.uuid4()),
|
|
||||||
"timestamp": time.time(),
|
|
||||||
}
|
|
||||||
]
|
|
||||||
),
|
|
||||||
},
|
|
||||||
}
|
|
||||||
PromptServer.instance.send_sync(
|
|
||||||
"display_component",
|
|
||||||
render_spec,
|
|
||||||
)
|
|
||||||
|
|
||||||
return IO.NodeOutput(output_text or "Empty response from Gemini model...")
|
return IO.NodeOutput(output_text or "Empty response from Gemini model...")
|
||||||
|
|
||||||
|
|
||||||
@ -530,6 +546,13 @@ class GeminiImage(IO.ComfyNode):
|
|||||||
"'IMAGE+TEXT' to return both the generated image and a text response.",
|
"'IMAGE+TEXT' to return both the generated image and a text response.",
|
||||||
optional=True,
|
optional=True,
|
||||||
),
|
),
|
||||||
|
IO.String.Input(
|
||||||
|
"system_prompt",
|
||||||
|
multiline=True,
|
||||||
|
default=GEMINI_IMAGE_SYS_PROMPT,
|
||||||
|
optional=True,
|
||||||
|
tooltip="Foundational instructions that dictate an AI's behavior.",
|
||||||
|
),
|
||||||
],
|
],
|
||||||
outputs=[
|
outputs=[
|
||||||
IO.Image.Output(),
|
IO.Image.Output(),
|
||||||
@ -549,10 +572,11 @@ class GeminiImage(IO.ComfyNode):
|
|||||||
prompt: str,
|
prompt: str,
|
||||||
model: str,
|
model: str,
|
||||||
seed: int,
|
seed: int,
|
||||||
images: torch.Tensor | None = None,
|
images: Input.Image | None = None,
|
||||||
files: list[GeminiPart] | None = None,
|
files: list[GeminiPart] | None = None,
|
||||||
aspect_ratio: str = "auto",
|
aspect_ratio: str = "auto",
|
||||||
response_modalities: str = "IMAGE+TEXT",
|
response_modalities: str = "IMAGE+TEXT",
|
||||||
|
system_prompt: str = "",
|
||||||
) -> IO.NodeOutput:
|
) -> IO.NodeOutput:
|
||||||
validate_string(prompt, strip_whitespace=True, min_length=1)
|
validate_string(prompt, strip_whitespace=True, min_length=1)
|
||||||
parts: list[GeminiPart] = [GeminiPart(text=prompt)]
|
parts: list[GeminiPart] = [GeminiPart(text=prompt)]
|
||||||
@ -562,11 +586,14 @@ class GeminiImage(IO.ComfyNode):
|
|||||||
image_config = GeminiImageConfig(aspectRatio=aspect_ratio)
|
image_config = GeminiImageConfig(aspectRatio=aspect_ratio)
|
||||||
|
|
||||||
if images is not None:
|
if images is not None:
|
||||||
image_parts = create_image_parts(images)
|
parts.extend(await create_image_parts(cls, images))
|
||||||
parts.extend(image_parts)
|
|
||||||
if files is not None:
|
if files is not None:
|
||||||
parts.extend(files)
|
parts.extend(files)
|
||||||
|
|
||||||
|
gemini_system_prompt = None
|
||||||
|
if system_prompt:
|
||||||
|
gemini_system_prompt = GeminiSystemInstructionContent(parts=[GeminiTextPart(text=system_prompt)], role=None)
|
||||||
|
|
||||||
response = await sync_op(
|
response = await sync_op(
|
||||||
cls,
|
cls,
|
||||||
endpoint=ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model}", method="POST"),
|
endpoint=ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model}", method="POST"),
|
||||||
@ -578,34 +605,12 @@ class GeminiImage(IO.ComfyNode):
|
|||||||
responseModalities=(["IMAGE"] if response_modalities == "IMAGE" else ["TEXT", "IMAGE"]),
|
responseModalities=(["IMAGE"] if response_modalities == "IMAGE" else ["TEXT", "IMAGE"]),
|
||||||
imageConfig=None if aspect_ratio == "auto" else image_config,
|
imageConfig=None if aspect_ratio == "auto" else image_config,
|
||||||
),
|
),
|
||||||
|
systemInstruction=gemini_system_prompt,
|
||||||
),
|
),
|
||||||
response_model=GeminiGenerateContentResponse,
|
response_model=GeminiGenerateContentResponse,
|
||||||
price_extractor=calculate_tokens_price,
|
price_extractor=calculate_tokens_price,
|
||||||
)
|
)
|
||||||
|
return IO.NodeOutput(get_image_from_response(response), get_text_from_response(response))
|
||||||
output_text = get_text_from_response(response)
|
|
||||||
if output_text:
|
|
||||||
render_spec = {
|
|
||||||
"node_id": cls.hidden.unique_id,
|
|
||||||
"component": "ChatHistoryWidget",
|
|
||||||
"props": {
|
|
||||||
"history": json.dumps(
|
|
||||||
[
|
|
||||||
{
|
|
||||||
"prompt": prompt,
|
|
||||||
"response": output_text,
|
|
||||||
"response_id": str(uuid.uuid4()),
|
|
||||||
"timestamp": time.time(),
|
|
||||||
}
|
|
||||||
]
|
|
||||||
),
|
|
||||||
},
|
|
||||||
}
|
|
||||||
PromptServer.instance.send_sync(
|
|
||||||
"display_component",
|
|
||||||
render_spec,
|
|
||||||
)
|
|
||||||
return IO.NodeOutput(get_image_from_response(response), output_text)
|
|
||||||
|
|
||||||
|
|
||||||
class GeminiImage2(IO.ComfyNode):
|
class GeminiImage2(IO.ComfyNode):
|
||||||
@ -671,6 +676,13 @@ class GeminiImage2(IO.ComfyNode):
|
|||||||
tooltip="Optional file(s) to use as context for the model. "
|
tooltip="Optional file(s) to use as context for the model. "
|
||||||
"Accepts inputs from the Gemini Generate Content Input Files node.",
|
"Accepts inputs from the Gemini Generate Content Input Files node.",
|
||||||
),
|
),
|
||||||
|
IO.String.Input(
|
||||||
|
"system_prompt",
|
||||||
|
multiline=True,
|
||||||
|
default=GEMINI_IMAGE_SYS_PROMPT,
|
||||||
|
optional=True,
|
||||||
|
tooltip="Foundational instructions that dictate an AI's behavior.",
|
||||||
|
),
|
||||||
],
|
],
|
||||||
outputs=[
|
outputs=[
|
||||||
IO.Image.Output(),
|
IO.Image.Output(),
|
||||||
@ -693,8 +705,9 @@ class GeminiImage2(IO.ComfyNode):
|
|||||||
aspect_ratio: str,
|
aspect_ratio: str,
|
||||||
resolution: str,
|
resolution: str,
|
||||||
response_modalities: str,
|
response_modalities: str,
|
||||||
images: torch.Tensor | None = None,
|
images: Input.Image | None = None,
|
||||||
files: list[GeminiPart] | None = None,
|
files: list[GeminiPart] | None = None,
|
||||||
|
system_prompt: str = "",
|
||||||
) -> IO.NodeOutput:
|
) -> IO.NodeOutput:
|
||||||
validate_string(prompt, strip_whitespace=True, min_length=1)
|
validate_string(prompt, strip_whitespace=True, min_length=1)
|
||||||
|
|
||||||
@ -702,7 +715,7 @@ class GeminiImage2(IO.ComfyNode):
|
|||||||
if images is not None:
|
if images is not None:
|
||||||
if get_number_of_images(images) > 14:
|
if get_number_of_images(images) > 14:
|
||||||
raise ValueError("The current maximum number of supported images is 14.")
|
raise ValueError("The current maximum number of supported images is 14.")
|
||||||
parts.extend(create_image_parts(images))
|
parts.extend(await create_image_parts(cls, images))
|
||||||
if files is not None:
|
if files is not None:
|
||||||
parts.extend(files)
|
parts.extend(files)
|
||||||
|
|
||||||
@ -710,6 +723,10 @@ class GeminiImage2(IO.ComfyNode):
|
|||||||
if aspect_ratio != "auto":
|
if aspect_ratio != "auto":
|
||||||
image_config.aspectRatio = aspect_ratio
|
image_config.aspectRatio = aspect_ratio
|
||||||
|
|
||||||
|
gemini_system_prompt = None
|
||||||
|
if system_prompt:
|
||||||
|
gemini_system_prompt = GeminiSystemInstructionContent(parts=[GeminiTextPart(text=system_prompt)], role=None)
|
||||||
|
|
||||||
response = await sync_op(
|
response = await sync_op(
|
||||||
cls,
|
cls,
|
||||||
ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model}", method="POST"),
|
ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model}", method="POST"),
|
||||||
@ -721,34 +738,12 @@ class GeminiImage2(IO.ComfyNode):
|
|||||||
responseModalities=(["IMAGE"] if response_modalities == "IMAGE" else ["TEXT", "IMAGE"]),
|
responseModalities=(["IMAGE"] if response_modalities == "IMAGE" else ["TEXT", "IMAGE"]),
|
||||||
imageConfig=image_config,
|
imageConfig=image_config,
|
||||||
),
|
),
|
||||||
|
systemInstruction=gemini_system_prompt,
|
||||||
),
|
),
|
||||||
response_model=GeminiGenerateContentResponse,
|
response_model=GeminiGenerateContentResponse,
|
||||||
price_extractor=calculate_tokens_price,
|
price_extractor=calculate_tokens_price,
|
||||||
)
|
)
|
||||||
|
return IO.NodeOutput(get_image_from_response(response), get_text_from_response(response))
|
||||||
output_text = get_text_from_response(response)
|
|
||||||
if output_text:
|
|
||||||
render_spec = {
|
|
||||||
"node_id": cls.hidden.unique_id,
|
|
||||||
"component": "ChatHistoryWidget",
|
|
||||||
"props": {
|
|
||||||
"history": json.dumps(
|
|
||||||
[
|
|
||||||
{
|
|
||||||
"prompt": prompt,
|
|
||||||
"response": output_text,
|
|
||||||
"response_id": str(uuid.uuid4()),
|
|
||||||
"timestamp": time.time(),
|
|
||||||
}
|
|
||||||
]
|
|
||||||
),
|
|
||||||
},
|
|
||||||
}
|
|
||||||
PromptServer.instance.send_sync(
|
|
||||||
"display_component",
|
|
||||||
render_spec,
|
|
||||||
)
|
|
||||||
return IO.NodeOutput(get_image_from_response(response), output_text)
|
|
||||||
|
|
||||||
|
|
||||||
class GeminiExtension(ComfyExtension):
|
class GeminiExtension(ComfyExtension):
|
||||||
|
|||||||
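
The rewritten create_image_parts splits references into two groups: the first min(n, 10) images are uploaded and sent as fileData URIs, and anything past that limit is inlined as base64. A rough standalone sketch of just that split (the upload call itself is omitted here):

def split_image_parts(total_images: int, image_limit: int = 0) -> tuple[range, range]:
    # image_limit == 0 means "use every image"; otherwise clamp to the limit.
    effective_max = total_images if image_limit == 0 else min(total_images, image_limit)
    num_url_images = min(effective_max, 10)  # URL/fileData slots, per the limit noted above
    return range(0, num_url_images), range(num_url_images, effective_max)

url_idx, inline_idx = split_image_parts(total_images=14)
# images 0..9 become fileData parts, images 10..13 become inline base64 parts
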
@ -4,15 +4,14 @@ For source of truth on the allowed permutations of request fields, please refere
|
|||||||
- [Compatibility Table](https://app.klingai.com/global/dev/document-api/apiReference/model/skillsMap)
|
- [Compatibility Table](https://app.klingai.com/global/dev/document-api/apiReference/model/skillsMap)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
from typing import Optional, TypeVar
|
|
||||||
import math
|
|
||||||
import logging
|
import logging
|
||||||
|
import math
|
||||||
from typing_extensions import override
|
import re
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
from typing_extensions import override
|
||||||
|
|
||||||
|
from comfy_api.latest import IO, ComfyExtension, Input, InputImpl
|
||||||
from comfy_api_nodes.apis import (
    KlingCameraControl,
    KlingCameraConfig,
@@ -50,25 +49,35 @@ from comfy_api_nodes.apis import (
    KlingCharacterEffectModelName,
    KlingSingleImageEffectModelName,
)
+from comfy_api_nodes.apis.kling_api import (
+    ImageToVideoWithAudioRequest,
+    OmniImageParamImage,
+    OmniParamImage,
+    OmniParamVideo,
+    OmniProFirstLastFrameRequest,
+    OmniProImageRequest,
+    OmniProReferences2VideoRequest,
+    OmniProText2VideoRequest,
+    TaskStatusResponse,
+    TextToVideoWithAudioRequest,
+)
from comfy_api_nodes.util import (
-    validate_image_dimensions,
+    ApiEndpoint,
+    download_url_to_image_tensor,
+    download_url_to_video_output,
+    get_number_of_images,
+    poll_op,
+    sync_op,
+    tensor_to_base64_string,
+    upload_audio_to_comfyapi,
+    upload_images_to_comfyapi,
+    upload_video_to_comfyapi,
    validate_image_aspect_ratio,
+    validate_image_dimensions,
+    validate_string,
    validate_video_dimensions,
    validate_video_duration,
-    tensor_to_base64_string,
-    validate_string,
-    upload_audio_to_comfyapi,
-    download_url_to_image_tensor,
-    upload_video_to_comfyapi,
-    download_url_to_video_output,
-    sync_op,
-    ApiEndpoint,
-    poll_op,
)
-from comfy_api.input_impl import VideoFromFile
-from comfy_api.input.basic_types import AudioInput
-from comfy_api.input.video_types import VideoInput
-from comfy_api.latest import ComfyExtension, IO

KLING_API_VERSION = "v1"
PATH_TEXT_TO_VIDEO = f"/proxy/kling/{KLING_API_VERSION}/videos/text2video"
@@ -94,14 +103,8 @@ AVERAGE_DURATION_IMAGE_GEN = 32
AVERAGE_DURATION_VIDEO_EFFECTS = 320
AVERAGE_DURATION_VIDEO_EXTEND = 320

-R = TypeVar("R")


MODE_TEXT2VIDEO = {
-    "standard mode / 5s duration / kling-v1": ("std", "5", "kling-v1"),
-    "standard mode / 10s duration / kling-v1": ("std", "10", "kling-v1"),
-    "pro mode / 5s duration / kling-v1": ("pro", "5", "kling-v1"),
-    "pro mode / 10s duration / kling-v1": ("pro", "10", "kling-v1"),
    "standard mode / 5s duration / kling-v1-6": ("std", "5", "kling-v1-6"),
    "standard mode / 10s duration / kling-v1-6": ("std", "10", "kling-v1-6"),
    "pro mode / 5s duration / kling-v2-master": ("pro", "5", "kling-v2-master"),
@@ -122,14 +125,14 @@ See: [Kling API Docs Capability Map](https://app.klingai.com/global/dev/document


MODE_START_END_FRAME = {
-    "standard mode / 5s duration / kling-v1": ("std", "5", "kling-v1"),
-    "pro mode / 5s duration / kling-v1": ("pro", "5", "kling-v1"),
    "pro mode / 5s duration / kling-v1-5": ("pro", "5", "kling-v1-5"),
    "pro mode / 10s duration / kling-v1-5": ("pro", "10", "kling-v1-5"),
    "pro mode / 5s duration / kling-v1-6": ("pro", "5", "kling-v1-6"),
    "pro mode / 10s duration / kling-v1-6": ("pro", "10", "kling-v1-6"),
    "pro mode / 5s duration / kling-v2-1": ("pro", "5", "kling-v2-1"),
    "pro mode / 10s duration / kling-v2-1": ("pro", "10", "kling-v2-1"),
+    "pro mode / 5s duration / kling-v2-5-turbo": ("pro", "5", "kling-v2-5-turbo"),
+    "pro mode / 10s duration / kling-v2-5-turbo": ("pro", "10", "kling-v2-5-turbo"),
}
"""
Returns a mapping of mode strings to their corresponding (mode, duration, model_name) tuples.
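Each entry in these mode tables unpacks into the (mode, duration, model_name) triple that the request builders below consume. A minimal sketch of that lookup, using only the MODE_START_END_FRAME entries shown in this hunk:

# Minimal sketch: how a selected combo string resolves into request fields.
# Uses only the MODE_START_END_FRAME mapping listed above.
mode, duration, model_name = MODE_START_END_FRAME["pro mode / 5s duration / kling-v2-5-turbo"]
assert (mode, duration, model_name) == ("pro", "5", "kling-v2-5-turbo")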
@@ -206,6 +209,50 @@ VOICES_CONFIG = {
}


+def normalize_omni_prompt_references(prompt: str) -> str:
+    """
+    Rewrites Kling Omni-style placeholders used in the app, like:
+
+        @image, @image1, @image2, ... @imageN
+        @video, @video1, @video2, ... @videoN
+
+    into the API-compatible form:
+
+        <<<image_1>>>, <<<image_2>>>, ...
+        <<<video_1>>>, <<<video_2>>>, ...
+
+    This is a UX shim for ComfyUI so users can type the same syntax as in the Kling app.
+    """
+    if not prompt:
+        return prompt
+
+    def _image_repl(match):
+        return f"<<<image_{match.group('idx') or '1'}>>>"
+
+    def _video_repl(match):
+        return f"<<<video_{match.group('idx') or '1'}>>>"
+
+    # (?<!\w) avoids matching e.g. "test@image.com"
+    # (?!\w) makes sure we only match @image / @image<digits> and not @imageFoo
+    prompt = re.sub(r"(?<!\w)@image(?P<idx>\d*)(?!\w)", _image_repl, prompt)
+    return re.sub(r"(?<!\w)@video(?P<idx>\d*)(?!\w)", _video_repl, prompt)
+
+
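The placeholder rewrite above is easiest to see on concrete inputs. A minimal sketch of the expected behavior, exercising only the function defined in this hunk:

# Illustrative behavior of normalize_omni_prompt_references:
# bare "@image"/"@video" default to index 1, numbered placeholders keep their index,
# and e-mail-like text such as "test@image.com" is left untouched by the (?<!\w) guard.
assert normalize_omni_prompt_references("@image walks past @video2") == "<<<image_1>>> walks past <<<video_2>>>"
assert normalize_omni_prompt_references("contact test@image.com") == "contact test@image.com"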
+async def finish_omni_video_task(cls: type[IO.ComfyNode], response: TaskStatusResponse) -> IO.NodeOutput:
+    if response.code:
+        raise RuntimeError(
+            f"Kling request failed. Code: {response.code}, Message: {response.message}, Data: {response.data}"
+        )
+    final_response = await poll_op(
+        cls,
+        ApiEndpoint(path=f"/proxy/kling/v1/videos/omni-video/{response.data.task_id}"),
+        response_model=TaskStatusResponse,
+        status_extractor=lambda r: (r.data.task_status if r.data else None),
+        max_poll_attempts=160,
+    )
+    return IO.NodeOutput(await download_url_to_video_output(final_response.data.task_result.videos[0].url))
+
+
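The Omni nodes added further down all share the same submit-then-poll flow: create the task with sync_op against the omni-video endpoint, then hand the TaskStatusResponse to finish_omni_video_task, which raises on a non-zero error code, polls the task status, and downloads the first resulting video. A condensed sketch of that shared flow (the request object stands in for whichever concrete request model a node builds):

# Condensed sketch of the submit-then-poll pattern used by the Omni video nodes below.
async def _run_omni_video(cls: type[IO.ComfyNode], request) -> IO.NodeOutput:
    response = await sync_op(
        cls,
        ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"),
        response_model=TaskStatusResponse,
        data=request,  # e.g. an OmniProText2VideoRequest built by the calling node
    )
    # finish_omni_video_task (defined above) validates the response, polls the task,
    # and returns the first generated video as a node output.
    return await finish_omni_video_task(cls, response)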
def is_valid_camera_control_configs(configs: list[float]) -> bool:
    """Verifies that at least one camera control configuration is non-zero."""
    return any(not math.isclose(value, 0.0) for value in configs)
@@ -296,7 +343,7 @@ def get_video_from_response(response) -> KlingVideoResult:
    return video


-def get_video_url_from_response(response) -> Optional[str]:
+def get_video_url_from_response(response) -> str | None:
    """Returns the first video url from the Kling video generation task result.
    Will not raise an error if the response is not valid.
    """
@@ -315,7 +362,7 @@ def get_images_from_response(response) -> list[KlingImageResult]:
    return images


-def get_images_urls_from_response(response) -> Optional[str]:
+def get_images_urls_from_response(response) -> str | None:
    """Returns the list of image urls from the Kling image generation task result.
    Will not raise an error if the response is not valid. If there is only one image, returns the url as a string. If there are multiple images, returns a list of urls.
    """
@@ -349,7 +396,7 @@ async def execute_text2video(
    model_mode: str,
    duration: str,
    aspect_ratio: str,
-    camera_control: Optional[KlingCameraControl] = None,
+    camera_control: KlingCameraControl | None = None,
) -> IO.NodeOutput:
    validate_prompts(prompt, negative_prompt, MAX_PROMPT_LENGTH_T2V)
    task_creation_response = await sync_op(
@@ -394,8 +441,8 @@ async def execute_image2video(
    model_mode: str,
    aspect_ratio: str,
    duration: str,
-    camera_control: Optional[KlingCameraControl] = None,
+    camera_control: KlingCameraControl | None = None,
-    end_frame: Optional[torch.Tensor] = None,
+    end_frame: torch.Tensor | None = None,
) -> IO.NodeOutput:
    validate_prompts(prompt, negative_prompt, MAX_PROMPT_LENGTH_I2V)
    validate_input_image(start_frame)
@@ -432,12 +479,12 @@ async def execute_image2video(
    task_id = task_creation_response.data.task_id

    final_response = await poll_op(
        cls,
        ApiEndpoint(path=f"{PATH_IMAGE_TO_VIDEO}/{task_id}"),
        response_model=KlingImage2VideoResponse,
        estimated_duration=AVERAGE_DURATION_I2V,
        status_extractor=lambda r: (r.data.task_status.value if r.data and r.data.task_status else None),
    )
    validate_video_result_response(final_response)

    video = get_video_from_response(final_response)
@@ -451,9 +498,9 @@ async def execute_video_effect(
    model_name: str,
    duration: KlingVideoGenDuration,
    image_1: torch.Tensor,
-    image_2: Optional[torch.Tensor] = None,
+    image_2: torch.Tensor | None = None,
-    model_mode: Optional[KlingVideoGenMode] = None,
+    model_mode: KlingVideoGenMode | None = None,
-) -> tuple[VideoFromFile, str, str]:
+) -> tuple[InputImpl.VideoFromFile, str, str]:
    if dual_character:
        request_input_field = KlingDualCharacterEffectInput(
            model_name=model_name,
@@ -499,13 +546,13 @@ async def execute_video_effect(

async def execute_lipsync(
    cls: type[IO.ComfyNode],
-    video: VideoInput,
+    video: Input.Video,
-    audio: Optional[AudioInput] = None,
+    audio: Input.Audio | None = None,
-    voice_language: Optional[str] = None,
+    voice_language: str | None = None,
-    model_mode: Optional[str] = None,
+    model_mode: str | None = None,
-    text: Optional[str] = None,
+    text: str | None = None,
-    voice_speed: Optional[float] = None,
+    voice_speed: float | None = None,
-    voice_id: Optional[str] = None,
+    voice_id: str | None = None,
) -> IO.NodeOutput:
    if text:
        validate_string(text, field_name="Text", max_length=MAX_PROMPT_LENGTH_LIP_SYNC)
@@ -701,7 +748,7 @@ class KlingTextToVideoNode(IO.ComfyNode):
                IO.Combo.Input(
                    "mode",
                    options=modes,
-                    default=modes[4],
+                    default=modes[8],
                    tooltip="The configuration to use for the video generation following the format: mode / duration / model_name.",
                ),
            ],
@@ -740,6 +787,474 @@ class KlingTextToVideoNode(IO.ComfyNode):
        )


+class OmniProTextToVideoNode(IO.ComfyNode):
+
+    @classmethod
+    def define_schema(cls) -> IO.Schema:
+        return IO.Schema(
+            node_id="KlingOmniProTextToVideoNode",
+            display_name="Kling Omni Text to Video (Pro)",
+            category="api node/video/Kling",
+            description="Use text prompts to generate videos with the latest Kling model.",
+            inputs=[
+                IO.Combo.Input("model_name", options=["kling-video-o1"]),
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    tooltip="A text prompt describing the video content. "
+                    "This can include both positive and negative descriptions.",
+                ),
+                IO.Combo.Input("aspect_ratio", options=["16:9", "9:16", "1:1"]),
+                IO.Combo.Input("duration", options=[5, 10]),
+            ],
+            outputs=[
+                IO.Video.Output(),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        model_name: str,
+        prompt: str,
+        aspect_ratio: str,
+        duration: int,
+    ) -> IO.NodeOutput:
+        validate_string(prompt, min_length=1, max_length=2500)
+        response = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"),
+            response_model=TaskStatusResponse,
+            data=OmniProText2VideoRequest(
+                model_name=model_name,
+                prompt=prompt,
+                aspect_ratio=aspect_ratio,
+                duration=str(duration),
+            ),
+        )
+        return await finish_omni_video_task(cls, response)


+class OmniProFirstLastFrameNode(IO.ComfyNode):
+
+    @classmethod
+    def define_schema(cls) -> IO.Schema:
+        return IO.Schema(
+            node_id="KlingOmniProFirstLastFrameNode",
+            display_name="Kling Omni First-Last-Frame to Video (Pro)",
+            category="api node/video/Kling",
+            description="Use a start frame, an optional end frame, or reference images with the latest Kling model.",
+            inputs=[
+                IO.Combo.Input("model_name", options=["kling-video-o1"]),
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    tooltip="A text prompt describing the video content. "
+                    "This can include both positive and negative descriptions.",
+                ),
+                IO.Combo.Input("duration", options=["5", "10"]),
+                IO.Image.Input("first_frame"),
+                IO.Image.Input(
+                    "end_frame",
+                    optional=True,
+                    tooltip="An optional end frame for the video. "
+                    "This cannot be used simultaneously with 'reference_images'.",
+                ),
+                IO.Image.Input(
+                    "reference_images",
+                    optional=True,
+                    tooltip="Up to 6 additional reference images.",
+                ),
+            ],
+            outputs=[
+                IO.Video.Output(),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        model_name: str,
+        prompt: str,
+        duration: int,
+        first_frame: Input.Image,
+        end_frame: Input.Image | None = None,
+        reference_images: Input.Image | None = None,
+    ) -> IO.NodeOutput:
+        prompt = normalize_omni_prompt_references(prompt)
+        validate_string(prompt, min_length=1, max_length=2500)
+        if end_frame is not None and reference_images is not None:
+            raise ValueError("The 'end_frame' input cannot be used simultaneously with 'reference_images'.")
+        validate_image_dimensions(first_frame, min_width=300, min_height=300)
+        validate_image_aspect_ratio(first_frame, (1, 2.5), (2.5, 1))
+        image_list: list[OmniParamImage] = [
+            OmniParamImage(
+                image_url=(await upload_images_to_comfyapi(cls, first_frame, wait_label="Uploading first frame"))[0],
+                type="first_frame",
+            )
+        ]
+        if end_frame is not None:
+            validate_image_dimensions(end_frame, min_width=300, min_height=300)
+            validate_image_aspect_ratio(end_frame, (1, 2.5), (2.5, 1))
+            image_list.append(
+                OmniParamImage(
+                    image_url=(await upload_images_to_comfyapi(cls, end_frame, wait_label="Uploading end frame"))[0],
+                    type="end_frame",
+                )
+            )
+        if reference_images is not None:
+            if get_number_of_images(reference_images) > 6:
+                raise ValueError("The maximum number of reference images allowed is 6.")
+            for i in reference_images:
+                validate_image_dimensions(i, min_width=300, min_height=300)
+                validate_image_aspect_ratio(i, (1, 2.5), (2.5, 1))
+            for i in await upload_images_to_comfyapi(cls, reference_images, wait_label="Uploading reference frame(s)"):
+                image_list.append(OmniParamImage(image_url=i))
+        response = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"),
+            response_model=TaskStatusResponse,
+            data=OmniProFirstLastFrameRequest(
+                model_name=model_name,
+                prompt=prompt,
+                duration=str(duration),
+                image_list=image_list,
+            ),
+        )
+        return await finish_omni_video_task(cls, response)


+class OmniProImageToVideoNode(IO.ComfyNode):
+
+    @classmethod
+    def define_schema(cls) -> IO.Schema:
+        return IO.Schema(
+            node_id="KlingOmniProImageToVideoNode",
+            display_name="Kling Omni Image to Video (Pro)",
+            category="api node/video/Kling",
+            description="Use up to 7 reference images to generate a video with the latest Kling model.",
+            inputs=[
+                IO.Combo.Input("model_name", options=["kling-video-o1"]),
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    tooltip="A text prompt describing the video content. "
+                    "This can include both positive and negative descriptions.",
+                ),
+                IO.Combo.Input("aspect_ratio", options=["16:9", "9:16", "1:1"]),
+                IO.Int.Input("duration", default=3, min=3, max=10, display_mode=IO.NumberDisplay.slider),
+                IO.Image.Input(
+                    "reference_images",
+                    tooltip="Up to 7 reference images.",
+                ),
+            ],
+            outputs=[
+                IO.Video.Output(),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        model_name: str,
+        prompt: str,
+        aspect_ratio: str,
+        duration: int,
+        reference_images: Input.Image,
+    ) -> IO.NodeOutput:
+        prompt = normalize_omni_prompt_references(prompt)
+        validate_string(prompt, min_length=1, max_length=2500)
+        if get_number_of_images(reference_images) > 7:
+            raise ValueError("The maximum number of reference images is 7.")
+        for i in reference_images:
+            validate_image_dimensions(i, min_width=300, min_height=300)
+            validate_image_aspect_ratio(i, (1, 2.5), (2.5, 1))
+        image_list: list[OmniParamImage] = []
+        for i in await upload_images_to_comfyapi(cls, reference_images, wait_label="Uploading reference image"):
+            image_list.append(OmniParamImage(image_url=i))
+        response = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"),
+            response_model=TaskStatusResponse,
+            data=OmniProReferences2VideoRequest(
+                model_name=model_name,
+                prompt=prompt,
+                aspect_ratio=aspect_ratio,
+                duration=str(duration),
+                image_list=image_list,
+            ),
+        )
+        return await finish_omni_video_task(cls, response)


+class OmniProVideoToVideoNode(IO.ComfyNode):
+
+    @classmethod
+    def define_schema(cls) -> IO.Schema:
+        return IO.Schema(
+            node_id="KlingOmniProVideoToVideoNode",
+            display_name="Kling Omni Video to Video (Pro)",
+            category="api node/video/Kling",
+            description="Use a video and up to 4 reference images to generate a video with the latest Kling model.",
+            inputs=[
+                IO.Combo.Input("model_name", options=["kling-video-o1"]),
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    tooltip="A text prompt describing the video content. "
+                    "This can include both positive and negative descriptions.",
+                ),
+                IO.Combo.Input("aspect_ratio", options=["16:9", "9:16", "1:1"]),
+                IO.Int.Input("duration", default=3, min=3, max=10, display_mode=IO.NumberDisplay.slider),
+                IO.Video.Input("reference_video", tooltip="Video to use as a reference."),
+                IO.Boolean.Input("keep_original_sound", default=True),
+                IO.Image.Input(
+                    "reference_images",
+                    tooltip="Up to 4 additional reference images.",
+                    optional=True,
+                ),
+            ],
+            outputs=[
+                IO.Video.Output(),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        model_name: str,
+        prompt: str,
+        aspect_ratio: str,
+        duration: int,
+        reference_video: Input.Video,
+        keep_original_sound: bool,
+        reference_images: Input.Image | None = None,
+    ) -> IO.NodeOutput:
+        prompt = normalize_omni_prompt_references(prompt)
+        validate_string(prompt, min_length=1, max_length=2500)
+        validate_video_duration(reference_video, min_duration=3.0, max_duration=10.05)
+        validate_video_dimensions(reference_video, min_width=720, min_height=720, max_width=2160, max_height=2160)
+        image_list: list[OmniParamImage] = []
+        if reference_images is not None:
+            if get_number_of_images(reference_images) > 4:
+                raise ValueError("The maximum number of reference images allowed with a video input is 4.")
+            for i in reference_images:
+                validate_image_dimensions(i, min_width=300, min_height=300)
+                validate_image_aspect_ratio(i, (1, 2.5), (2.5, 1))
+            for i in await upload_images_to_comfyapi(cls, reference_images, wait_label="Uploading reference image"):
+                image_list.append(OmniParamImage(image_url=i))
+        video_list = [
+            OmniParamVideo(
+                video_url=await upload_video_to_comfyapi(cls, reference_video, wait_label="Uploading reference video"),
+                refer_type="feature",
+                keep_original_sound="yes" if keep_original_sound else "no",
+            )
+        ]
+        response = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"),
+            response_model=TaskStatusResponse,
+            data=OmniProReferences2VideoRequest(
+                model_name=model_name,
+                prompt=prompt,
+                aspect_ratio=aspect_ratio,
+                duration=str(duration),
+                image_list=image_list if image_list else None,
+                video_list=video_list,
+            ),
+        )
+        return await finish_omni_video_task(cls, response)


+class OmniProEditVideoNode(IO.ComfyNode):
+
+    @classmethod
+    def define_schema(cls) -> IO.Schema:
+        return IO.Schema(
+            node_id="KlingOmniProEditVideoNode",
+            display_name="Kling Omni Edit Video (Pro)",
+            category="api node/video/Kling",
+            description="Edit an existing video with the latest model from Kling.",
+            inputs=[
+                IO.Combo.Input("model_name", options=["kling-video-o1"]),
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    tooltip="A text prompt describing the video content. "
+                    "This can include both positive and negative descriptions.",
+                ),
+                IO.Video.Input("video", tooltip="Video for editing. The output video length will be the same."),
+                IO.Boolean.Input("keep_original_sound", default=True),
+                IO.Image.Input(
+                    "reference_images",
+                    tooltip="Up to 4 additional reference images.",
+                    optional=True,
+                ),
+            ],
+            outputs=[
+                IO.Video.Output(),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        model_name: str,
+        prompt: str,
+        video: Input.Video,
+        keep_original_sound: bool,
+        reference_images: Input.Image | None = None,
+    ) -> IO.NodeOutput:
+        prompt = normalize_omni_prompt_references(prompt)
+        validate_string(prompt, min_length=1, max_length=2500)
+        validate_video_duration(video, min_duration=3.0, max_duration=10.05)
+        validate_video_dimensions(video, min_width=720, min_height=720, max_width=2160, max_height=2160)
+        image_list: list[OmniParamImage] = []
+        if reference_images is not None:
+            if get_number_of_images(reference_images) > 4:
+                raise ValueError("The maximum number of reference images allowed with a video input is 4.")
+            for i in reference_images:
+                validate_image_dimensions(i, min_width=300, min_height=300)
+                validate_image_aspect_ratio(i, (1, 2.5), (2.5, 1))
+            for i in await upload_images_to_comfyapi(cls, reference_images, wait_label="Uploading reference image"):
+                image_list.append(OmniParamImage(image_url=i))
+        video_list = [
+            OmniParamVideo(
+                video_url=await upload_video_to_comfyapi(cls, video, wait_label="Uploading base video"),
+                refer_type="base",
+                keep_original_sound="yes" if keep_original_sound else "no",
+            )
+        ]
+        response = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"),
+            response_model=TaskStatusResponse,
+            data=OmniProReferences2VideoRequest(
+                model_name=model_name,
+                prompt=prompt,
+                aspect_ratio=None,
+                duration=None,
+                image_list=image_list if image_list else None,
+                video_list=video_list,
+            ),
+        )
+        return await finish_omni_video_task(cls, response)


+class OmniProImageNode(IO.ComfyNode):
+
+    @classmethod
+    def define_schema(cls) -> IO.Schema:
+        return IO.Schema(
+            node_id="KlingOmniProImageNode",
+            display_name="Kling Omni Image (Pro)",
+            category="api node/image/Kling",
+            description="Create or edit images with the latest model from Kling.",
+            inputs=[
+                IO.Combo.Input("model_name", options=["kling-image-o1"]),
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    tooltip="A text prompt describing the image content. "
+                    "This can include both positive and negative descriptions.",
+                ),
+                IO.Combo.Input("resolution", options=["1K", "2K"]),
+                IO.Combo.Input(
+                    "aspect_ratio",
+                    options=["16:9", "9:16", "1:1", "4:3", "3:4", "3:2", "2:3", "21:9"],
+                ),
+                IO.Image.Input(
+                    "reference_images",
+                    tooltip="Up to 10 additional reference images.",
+                    optional=True,
+                ),
+            ],
+            outputs=[
+                IO.Image.Output(),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        model_name: str,
+        prompt: str,
+        resolution: str,
+        aspect_ratio: str,
+        reference_images: Input.Image | None = None,
+    ) -> IO.NodeOutput:
+        prompt = normalize_omni_prompt_references(prompt)
+        validate_string(prompt, min_length=1, max_length=2500)
+        image_list: list[OmniImageParamImage] = []
+        if reference_images is not None:
+            if get_number_of_images(reference_images) > 10:
+                raise ValueError("The maximum number of reference images is 10.")
+            for i in reference_images:
+                validate_image_dimensions(i, min_width=300, min_height=300)
+                validate_image_aspect_ratio(i, (1, 2.5), (2.5, 1))
+            for i in await upload_images_to_comfyapi(cls, reference_images, wait_label="Uploading reference image"):
+                image_list.append(OmniImageParamImage(image=i))
+        response = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/kling/v1/images/omni-image", method="POST"),
+            response_model=TaskStatusResponse,
+            data=OmniProImageRequest(
+                model_name=model_name,
+                prompt=prompt,
+                resolution=resolution.lower(),
+                aspect_ratio=aspect_ratio,
+                image_list=image_list if image_list else None,
+            ),
+        )
+        if response.code:
+            raise RuntimeError(
+                f"Kling request failed. Code: {response.code}, Message: {response.message}, Data: {response.data}"
+            )
+        final_response = await poll_op(
+            cls,
+            ApiEndpoint(path=f"/proxy/kling/v1/images/omni-image/{response.data.task_id}"),
+            response_model=TaskStatusResponse,
+            status_extractor=lambda r: (r.data.task_status if r.data else None),
+        )
+        return IO.NodeOutput(await download_url_to_image_tensor(final_response.data.task_result.images[0].url))


class KlingCameraControlT2VNode(IO.ComfyNode):
    """
    Kling Text to Video Camera Control Node. This node is a text to video node, but it supports controlling the camera.
@@ -787,7 +1302,7 @@ class KlingCameraControlT2VNode(IO.ComfyNode):
        negative_prompt: str,
        cfg_scale: float,
        aspect_ratio: str,
-        camera_control: Optional[KlingCameraControl] = None,
+        camera_control: KlingCameraControl | None = None,
    ) -> IO.NodeOutput:
        return await execute_text2video(
            cls,
@@ -809,9 +1324,8 @@ class KlingImage2VideoNode(IO.ComfyNode):
    def define_schema(cls) -> IO.Schema:
        return IO.Schema(
            node_id="KlingImage2VideoNode",
-            display_name="Kling Image to Video",
+            display_name="Kling Image(First Frame) to Video",
            category="api node/video/Kling",
-            description="Kling Image to Video Node",
            inputs=[
                IO.Image.Input("start_frame", tooltip="The reference image used to generate the video."),
                IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt"),
@@ -854,8 +1368,8 @@ class KlingImage2VideoNode(IO.ComfyNode):
        mode: str,
        aspect_ratio: str,
        duration: str,
-        camera_control: Optional[KlingCameraControl] = None,
+        camera_control: KlingCameraControl | None = None,
-        end_frame: Optional[torch.Tensor] = None,
+        end_frame: torch.Tensor | None = None,
    ) -> IO.NodeOutput:
        return await execute_image2video(
            cls,
@@ -965,15 +1479,11 @@ class KlingStartEndFrameNode(IO.ComfyNode):
                IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt"),
                IO.String.Input("negative_prompt", multiline=True, tooltip="Negative text prompt"),
                IO.Float.Input("cfg_scale", default=0.5, min=0.0, max=1.0),
-                IO.Combo.Input(
+                IO.Combo.Input("aspect_ratio", options=["16:9", "9:16", "1:1"]),
-                    "aspect_ratio",
-                    options=[i.value for i in KlingVideoGenAspectRatio],
-                    default="16:9",
-                ),
                IO.Combo.Input(
                    "mode",
                    options=modes,
-                    default=modes[2],
+                    default=modes[6],
                    tooltip="The configuration to use for the video generation following the format: mode / duration / model_name.",
                ),
            ],
@@ -1170,7 +1680,10 @@ class KlingSingleImageVideoEffectNode(IO.ComfyNode):
            category="api node/video/Kling",
            description="Achieve different special effects when generating a video based on the effect_scene.",
            inputs=[
-                IO.Image.Input("image", tooltip=" Reference Image. URL or Base64 encoded string (without data:image prefix). File size cannot exceed 10MB, resolution not less than 300*300px, aspect ratio between 1:2.5 ~ 2.5:1"),
+                IO.Image.Input(
+                    "image",
+                    tooltip=" Reference Image. URL or Base64 encoded string (without data:image prefix). File size cannot exceed 10MB, resolution not less than 300*300px, aspect ratio between 1:2.5 ~ 2.5:1",
+                ),
                IO.Combo.Input(
                    "effect_scene",
                    options=[i.value for i in KlingSingleImageEffectsScene],
@@ -1254,8 +1767,8 @@ class KlingLipSyncAudioToVideoNode(IO.ComfyNode):
    @classmethod
    async def execute(
        cls,
-        video: VideoInput,
+        video: Input.Video,
-        audio: AudioInput,
+        audio: Input.Audio,
        voice_language: str,
    ) -> IO.NodeOutput:
        return await execute_lipsync(
@@ -1314,7 +1827,7 @@ class KlingLipSyncTextToVideoNode(IO.ComfyNode):
    @classmethod
    async def execute(
        cls,
-        video: VideoInput,
+        video: Input.Video,
        text: str,
        voice: str,
        voice_speed: float,
@@ -1433,7 +1946,7 @@ class KlingImageGenerationNode(IO.ComfyNode):
                IO.Combo.Input(
                    "model_name",
                    options=[i.value for i in KlingImageGenModelName],
-                    default="kling-v1",
+                    default="kling-v2",
                ),
                IO.Combo.Input(
                    "aspect_ratio",
@@ -1471,7 +1984,7 @@ class KlingImageGenerationNode(IO.ComfyNode):
        human_fidelity: float,
        n: int,
        aspect_ratio: KlingImageGenAspectRatio,
-        image: Optional[torch.Tensor] = None,
+        image: torch.Tensor | None = None,
    ) -> IO.NodeOutput:
        validate_string(prompt, field_name="prompt", min_length=1, max_length=MAX_PROMPT_LENGTH_IMAGE_GEN)
        validate_string(negative_prompt, field_name="negative_prompt", max_length=MAX_PROMPT_LENGTH_IMAGE_GEN)
@@ -1516,6 +2029,136 @@ class KlingImageGenerationNode(IO.ComfyNode):
        return IO.NodeOutput(await image_result_to_node_output(images))


+class TextToVideoWithAudio(IO.ComfyNode):
+
+    @classmethod
+    def define_schema(cls) -> IO.Schema:
+        return IO.Schema(
+            node_id="KlingTextToVideoWithAudio",
+            display_name="Kling Text to Video with Audio",
+            category="api node/video/Kling",
+            inputs=[
+                IO.Combo.Input("model_name", options=["kling-v2-6"]),
+                IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt."),
+                IO.Combo.Input("mode", options=["pro"]),
+                IO.Combo.Input("aspect_ratio", options=["16:9", "9:16", "1:1"]),
+                IO.Combo.Input("duration", options=[5, 10]),
+                IO.Boolean.Input("generate_audio", default=True),
+            ],
+            outputs=[
+                IO.Video.Output(),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        model_name: str,
+        prompt: str,
+        mode: str,
+        aspect_ratio: str,
+        duration: int,
+        generate_audio: bool,
+    ) -> IO.NodeOutput:
+        validate_string(prompt, min_length=1, max_length=2500)
+        response = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/kling/v1/videos/text2video", method="POST"),
+            response_model=TaskStatusResponse,
+            data=TextToVideoWithAudioRequest(
+                model_name=model_name,
+                prompt=prompt,
+                mode=mode,
+                aspect_ratio=aspect_ratio,
+                duration=str(duration),
+                sound="on" if generate_audio else "off",
+            ),
+        )
+        if response.code:
+            raise RuntimeError(
+                f"Kling request failed. Code: {response.code}, Message: {response.message}, Data: {response.data}"
+            )
+        final_response = await poll_op(
+            cls,
+            ApiEndpoint(path=f"/proxy/kling/v1/videos/text2video/{response.data.task_id}"),
+            response_model=TaskStatusResponse,
+            status_extractor=lambda r: (r.data.task_status if r.data else None),
+        )
+        return IO.NodeOutput(await download_url_to_video_output(final_response.data.task_result.videos[0].url))


+class ImageToVideoWithAudio(IO.ComfyNode):
+
+    @classmethod
+    def define_schema(cls) -> IO.Schema:
+        return IO.Schema(
+            node_id="KlingImageToVideoWithAudio",
+            display_name="Kling Image(First Frame) to Video with Audio",
+            category="api node/video/Kling",
+            inputs=[
+                IO.Combo.Input("model_name", options=["kling-v2-6"]),
+                IO.Image.Input("start_frame"),
+                IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt."),
+                IO.Combo.Input("mode", options=["pro"]),
+                IO.Combo.Input("duration", options=[5, 10]),
+                IO.Boolean.Input("generate_audio", default=True),
+            ],
+            outputs=[
+                IO.Video.Output(),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        model_name: str,
+        start_frame: Input.Image,
+        prompt: str,
+        mode: str,
+        duration: int,
+        generate_audio: bool,
+    ) -> IO.NodeOutput:
+        validate_string(prompt, min_length=1, max_length=2500)
+        validate_image_dimensions(start_frame, min_width=300, min_height=300)
+        validate_image_aspect_ratio(start_frame, (1, 2.5), (2.5, 1))
+        response = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/kling/v1/videos/image2video", method="POST"),
+            response_model=TaskStatusResponse,
+            data=ImageToVideoWithAudioRequest(
+                model_name=model_name,
+                image=(await upload_images_to_comfyapi(cls, start_frame))[0],
+                prompt=prompt,
+                mode=mode,
+                duration=str(duration),
+                sound="on" if generate_audio else "off",
+            ),
+        )
+        if response.code:
+            raise RuntimeError(
+                f"Kling request failed. Code: {response.code}, Message: {response.message}, Data: {response.data}"
+            )
+        final_response = await poll_op(
+            cls,
+            ApiEndpoint(path=f"/proxy/kling/v1/videos/image2video/{response.data.task_id}"),
+            response_model=TaskStatusResponse,
+            status_extractor=lambda r: (r.data.task_status if r.data else None),
+        )
+        return IO.NodeOutput(await download_url_to_video_output(final_response.data.task_result.videos[0].url))


class KlingExtension(ComfyExtension):
    @override
    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
@@ -1533,6 +2176,14 @@ class KlingExtension(ComfyExtension):
            KlingImageGenerationNode,
            KlingSingleImageVideoEffectNode,
            KlingDualCharacterVideoEffectNode,
+            OmniProTextToVideoNode,
+            OmniProFirstLastFrameNode,
+            OmniProImageToVideoNode,
+            OmniProVideoToVideoNode,
+            OmniProEditVideoNode,
+            OmniProImageNode,
+            TextToVideoWithAudio,
+            ImageToVideoWithAudio,
        ]

@@ -1,12 +1,9 @@
from io import BytesIO
-from typing import Optional

-import torch
from pydantic import BaseModel, Field
from typing_extensions import override

-from comfy_api.input_impl import VideoFromFile
+from comfy_api.latest import IO, ComfyExtension, Input, InputImpl
-from comfy_api.latest import IO, ComfyExtension
from comfy_api_nodes.util import (
    ApiEndpoint,
    get_number_of_images,
@@ -26,9 +23,9 @@ class ExecuteTaskRequest(BaseModel):
    model: str = Field(...)
    duration: int = Field(...)
    resolution: str = Field(...)
-    fps: Optional[int] = Field(25)
+    fps: int | None = Field(25)
-    generate_audio: Optional[bool] = Field(True)
+    generate_audio: bool | None = Field(True)
-    image_uri: Optional[str] = Field(None)
+    image_uri: str | None = Field(None)


class TextToVideoNode(IO.ComfyNode):
@@ -103,7 +100,7 @@ class TextToVideoNode(IO.ComfyNode):
            as_binary=True,
            max_retries=1,
        )
-        return IO.NodeOutput(VideoFromFile(BytesIO(response)))
+        return IO.NodeOutput(InputImpl.VideoFromFile(BytesIO(response)))


class ImageToVideoNode(IO.ComfyNode):
@@ -153,7 +150,7 @@ class ImageToVideoNode(IO.ComfyNode):
    @classmethod
    async def execute(
        cls,
-        image: torch.Tensor,
+        image: Input.Image,
        model: str,
        prompt: str,
        duration: int,
@@ -183,7 +180,7 @@ class ImageToVideoNode(IO.ComfyNode):
            as_binary=True,
            max_retries=1,
        )
-        return IO.NodeOutput(VideoFromFile(BytesIO(response)))
+        return IO.NodeOutput(InputImpl.VideoFromFile(BytesIO(response)))


class LtxvApiExtension(ComfyExtension):
@@ -1,11 +1,8 @@
import logging
-from typing import Optional

-import torch
from typing_extensions import override

-from comfy_api.input import VideoInput
+from comfy_api.latest import IO, ComfyExtension, Input
-from comfy_api.latest import IO, ComfyExtension
from comfy_api_nodes.apis import (
    MoonvalleyPromptResponse,
    MoonvalleyTextToVideoInferenceParams,
@@ -61,7 +58,7 @@ def validate_task_creation_response(response) -> None:
        raise RuntimeError(error_msg)


-def validate_video_to_video_input(video: VideoInput) -> VideoInput:
+def validate_video_to_video_input(video: Input.Video) -> Input.Video:
    """
    Validates and processes video input for Moonvalley Video-to-Video generation.

@@ -82,7 +79,7 @@ def validate_video_to_video_input(video: VideoInput) -> VideoInput:
    return _validate_and_trim_duration(video)


-def _get_video_dimensions(video: VideoInput) -> tuple[int, int]:
+def _get_video_dimensions(video: Input.Video) -> tuple[int, int]:
    """Extracts video dimensions with error handling."""
    try:
        return video.get_dimensions()
@@ -106,7 +103,7 @@ def _validate_video_dimensions(width: int, height: int) -> None:
        raise ValueError(f"Resolution {width}x{height} not supported. Supported: {supported_list}")


-def _validate_and_trim_duration(video: VideoInput) -> VideoInput:
+def _validate_and_trim_duration(video: Input.Video) -> Input.Video:
    """Validates video duration and trims to 5 seconds if needed."""
    duration = video.get_duration()
    _validate_minimum_duration(duration)
@@ -119,7 +116,7 @@ def _validate_minimum_duration(duration: float) -> None:
        raise ValueError("Input video must be at least 5 seconds long.")


-def _trim_if_too_long(video: VideoInput, duration: float) -> VideoInput:
+def _trim_if_too_long(video: Input.Video, duration: float) -> Input.Video:
    """Trims video to 5 seconds if longer."""
    if duration > 5:
        return trim_video(video, 5)
@@ -241,7 +238,7 @@ class MoonvalleyImg2VideoNode(IO.ComfyNode):
    @classmethod
    async def execute(
        cls,
-        image: torch.Tensor,
+        image: Input.Image,
        prompt: str,
        negative_prompt: str,
        resolution: str,
@@ -362,9 +359,9 @@ class MoonvalleyVideo2VideoNode(IO.ComfyNode):
        prompt: str,
        negative_prompt: str,
        seed: int,
-        video: Optional[VideoInput] = None,
+        video: Input.Video | None = None,
        control_type: str = "Motion Transfer",
-        motion_intensity: Optional[int] = 100,
+        motion_intensity: int | None = 100,
        steps=33,
        prompt_adherence=4.5,
    ) -> IO.NodeOutput:
@@ -1,15 +1,10 @@
from io import BytesIO
-from typing import Optional, Union
-import json
import os
-import time
-import uuid
from enum import Enum
from inspect import cleandoc
import numpy as np
import torch
from PIL import Image
-from server import PromptServer
import folder_paths
import base64
from comfy_api.latest import IO, ComfyExtension
@@ -587,11 +582,11 @@ class OpenAIChatNode(IO.ComfyNode):
    def create_input_message_contents(
        cls,
        prompt: str,
-        image: Optional[torch.Tensor] = None,
+        image: torch.Tensor | None = None,
-        files: Optional[list[InputFileContent]] = None,
+        files: list[InputFileContent] | None = None,
    ) -> InputMessageContentList:
        """Create a list of input message contents from prompt and optional image."""
-        content_list: list[Union[InputContent, InputTextContent, InputImageContent, InputFileContent]] = [
+        content_list: list[InputContent | InputTextContent | InputImageContent | InputFileContent] = [
            InputTextContent(text=prompt, type="input_text"),
        ]
        if image is not None:
@@ -617,9 +612,9 @@ class OpenAIChatNode(IO.ComfyNode):
        prompt: str,
        persist_context: bool = False,
        model: SupportedOpenAIModel = SupportedOpenAIModel.gpt_5.value,
-        images: Optional[torch.Tensor] = None,
+        images: torch.Tensor | None = None,
-        files: Optional[list[InputFileContent]] = None,
+        files: list[InputFileContent] | None = None,
-        advanced_options: Optional[CreateModelResponseProperties] = None,
+        advanced_options: CreateModelResponseProperties | None = None,
    ) -> IO.NodeOutput:
        validate_string(prompt, strip_whitespace=False)

@@ -660,30 +655,7 @@ class OpenAIChatNode(IO.ComfyNode):
            status_extractor=lambda response: response.status,
            completed_statuses=["incomplete", "completed"]
        )
-        output_text = cls.get_text_from_message_content(cls.get_message_content_from_response(result_response))
+        return IO.NodeOutput(cls.get_text_from_message_content(cls.get_message_content_from_response(result_response)))

-        # Update history
-        render_spec = {
-            "node_id": cls.hidden.unique_id,
-            "component": "ChatHistoryWidget",
-            "props": {
-                "history": json.dumps(
-                    [
-                        {
-                            "prompt": prompt,
-                            "response": output_text,
-                            "response_id": str(uuid.uuid4()),
-                            "timestamp": time.time(),
-                        }
-                    ]
-                ),
-            },
-        }
-        PromptServer.instance.send_sync(
-            "display_component",
-            render_spec,
-        )
-        return IO.NodeOutput(output_text)


class OpenAIInputFiles(IO.ComfyNode):
@@ -790,8 +762,8 @@ class OpenAIChatConfig(IO.ComfyNode):
    def execute(
        cls,
        truncation: bool,
-        instructions: Optional[str] = None,
+        instructions: str | None = None,
-        max_output_tokens: Optional[int] = None,
+        max_output_tokens: int | None = None,
    ) -> IO.NodeOutput:
        """
        Configure advanced options for the OpenAI Chat Node.
@ -1,568 +0,0 @@
|
|||||||
"""
|
|
||||||
Pika x ComfyUI API Nodes
|
|
||||||
|
|
||||||
Pika API docs: https://pika-827374fb.mintlify.app/api-reference
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from io import BytesIO
|
|
||||||
import logging
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
import torch
|
|
||||||
|
|
||||||
from typing_extensions import override
|
|
||||||
from comfy_api.latest import ComfyExtension, IO
|
|
||||||
from comfy_api.input_impl.video_types import VideoCodec, VideoContainer, VideoInput
|
|
||||||
from comfy_api_nodes.apis import pika_api as pika_defs
|
|
||||||
from comfy_api_nodes.util import (
|
|
||||||
validate_string,
|
|
||||||
download_url_to_video_output,
|
|
||||||
tensor_to_bytesio,
|
|
||||||
ApiEndpoint,
|
|
||||||
sync_op,
|
|
||||||
poll_op,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
PATH_PIKADDITIONS = "/proxy/pika/generate/pikadditions"
|
|
||||||
PATH_PIKASWAPS = "/proxy/pika/generate/pikaswaps"
|
|
||||||
PATH_PIKAFFECTS = "/proxy/pika/generate/pikaffects"
|
|
||||||
|
|
||||||
PIKA_API_VERSION = "2.2"
|
|
||||||
PATH_TEXT_TO_VIDEO = f"/proxy/pika/generate/{PIKA_API_VERSION}/t2v"
|
|
||||||
PATH_IMAGE_TO_VIDEO = f"/proxy/pika/generate/{PIKA_API_VERSION}/i2v"
|
|
||||||
PATH_PIKAFRAMES = f"/proxy/pika/generate/{PIKA_API_VERSION}/pikaframes"
|
|
||||||
PATH_PIKASCENES = f"/proxy/pika/generate/{PIKA_API_VERSION}/pikascenes"
|
|
||||||
|
|
||||||
PATH_VIDEO_GET = "/proxy/pika/videos"
|
|
||||||
|
|
||||||
|
|
||||||
async def execute_task(
|
|
||||||
task_id: str,
|
|
||||||
cls: type[IO.ComfyNode],
|
|
||||||
) -> IO.NodeOutput:
|
|
||||||
final_response: pika_defs.PikaVideoResponse = await poll_op(
|
|
||||||
cls,
|
|
||||||
ApiEndpoint(path=f"{PATH_VIDEO_GET}/{task_id}"),
|
|
||||||
response_model=pika_defs.PikaVideoResponse,
|
|
||||||
status_extractor=lambda response: (response.status.value if response.status else None),
|
|
||||||
progress_extractor=lambda response: (response.progress if hasattr(response, "progress") else None),
|
|
||||||
estimated_duration=60,
|
|
||||||
max_poll_attempts=240,
|
|
||||||
)
|
|
||||||
if not final_response.url:
|
|
||||||
error_msg = f"Pika task {task_id} succeeded but no video data found in response:\n{final_response}"
|
|
||||||
logging.error(error_msg)
|
|
||||||
raise Exception(error_msg)
|
|
||||||
video_url = final_response.url
|
|
||||||
logging.info("Pika task %s succeeded. Video URL: %s", task_id, video_url)
|
|
||||||
return IO.NodeOutput(await download_url_to_video_output(video_url))
|
|
||||||
|
|
||||||
|
|
||||||
def get_base_inputs_types() -> list[IO.Input]:
|
|
||||||
"""Get the base required inputs types common to all Pika nodes."""
|
|
||||||
return [
|
|
||||||
IO.String.Input("prompt_text", multiline=True),
|
|
||||||
IO.String.Input("negative_prompt", multiline=True),
|
|
||||||
IO.Int.Input("seed", min=0, max=0xFFFFFFFF, control_after_generate=True),
|
|
||||||
IO.Combo.Input("resolution", options=["1080p", "720p"], default="1080p"),
|
|
||||||
IO.Combo.Input("duration", options=[5, 10], default=5),
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||


class PikaImageToVideo(IO.ComfyNode):
    """Pika 2.2 Image to Video Node."""

    @classmethod
    def define_schema(cls) -> IO.Schema:
        return IO.Schema(
            node_id="PikaImageToVideoNode2_2",
            display_name="Pika Image to Video",
            description="Sends an image and prompt to the Pika API v2.2 to generate a video.",
            category="api node/video/Pika",
            inputs=[
                IO.Image.Input("image", tooltip="The image to convert to video"),
                *get_base_inputs_types(),
            ],
            outputs=[IO.Video.Output()],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
        )

    @classmethod
    async def execute(
        cls,
        image: torch.Tensor,
        prompt_text: str,
        negative_prompt: str,
        seed: int,
        resolution: str,
        duration: int,
    ) -> IO.NodeOutput:
        image_bytes_io = tensor_to_bytesio(image)
        pika_files = {"image": ("image.png", image_bytes_io, "image/png")}
        pika_request_data = pika_defs.PikaBodyGenerate22I2vGenerate22I2vPost(
            promptText=prompt_text,
            negativePrompt=negative_prompt,
            seed=seed,
            resolution=resolution,
            duration=duration,
        )
        initial_operation = await sync_op(
            cls,
            ApiEndpoint(path=PATH_IMAGE_TO_VIDEO, method="POST"),
            response_model=pika_defs.PikaGenerateResponse,
            data=pika_request_data,
            files=pika_files,
            content_type="multipart/form-data",
        )
        return await execute_task(initial_operation.video_id, cls)


class PikaTextToVideoNode(IO.ComfyNode):
    """Pika Text2Video v2.2 Node."""

    @classmethod
    def define_schema(cls) -> IO.Schema:
        return IO.Schema(
            node_id="PikaTextToVideoNode2_2",
            display_name="Pika Text to Video",
            description="Sends a text prompt to the Pika API v2.2 to generate a video.",
            category="api node/video/Pika",
            inputs=[
                *get_base_inputs_types(),
                IO.Float.Input(
                    "aspect_ratio",
                    step=0.001,
                    min=0.4,
                    max=2.5,
                    default=1.7777777777777777,
                    tooltip="Aspect ratio (width / height)",
                )
            ],
            outputs=[IO.Video.Output()],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
        )

    @classmethod
    async def execute(
        cls,
        prompt_text: str,
        negative_prompt: str,
        seed: int,
        resolution: str,
        duration: int,
        aspect_ratio: float,
    ) -> IO.NodeOutput:
        initial_operation = await sync_op(
            cls,
            ApiEndpoint(path=PATH_TEXT_TO_VIDEO, method="POST"),
            response_model=pika_defs.PikaGenerateResponse,
            data=pika_defs.PikaBodyGenerate22T2vGenerate22T2vPost(
                promptText=prompt_text,
                negativePrompt=negative_prompt,
                seed=seed,
                resolution=resolution,
                duration=duration,
                aspectRatio=aspect_ratio,
            ),
            content_type="application/x-www-form-urlencoded",
        )
        return await execute_task(initial_operation.video_id, cls)


class PikaScenes(IO.ComfyNode):
    """PikaScenes v2.2 Node."""

    @classmethod
    def define_schema(cls) -> IO.Schema:
        return IO.Schema(
            node_id="PikaScenesV2_2",
            display_name="Pika Scenes (Video Image Composition)",
            description="Combine your images to create a video with the objects in them. Upload multiple images as ingredients and generate a high-quality video that incorporates all of them.",
            category="api node/video/Pika",
            inputs=[
                *get_base_inputs_types(),
                IO.Combo.Input(
                    "ingredients_mode",
                    options=["creative", "precise"],
                    default="creative",
                ),
                IO.Float.Input(
                    "aspect_ratio",
                    step=0.001,
                    min=0.4,
                    max=2.5,
                    default=1.7777777777777777,
                    tooltip="Aspect ratio (width / height)",
                ),
                IO.Image.Input(
                    "image_ingredient_1",
                    optional=True,
                    tooltip="Image that will be used as ingredient to create a video.",
                ),
                IO.Image.Input(
                    "image_ingredient_2",
                    optional=True,
                    tooltip="Image that will be used as ingredient to create a video.",
                ),
                IO.Image.Input(
                    "image_ingredient_3",
                    optional=True,
                    tooltip="Image that will be used as ingredient to create a video.",
                ),
                IO.Image.Input(
                    "image_ingredient_4",
                    optional=True,
                    tooltip="Image that will be used as ingredient to create a video.",
                ),
                IO.Image.Input(
                    "image_ingredient_5",
                    optional=True,
                    tooltip="Image that will be used as ingredient to create a video.",
                ),
            ],
            outputs=[IO.Video.Output()],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
        )

    @classmethod
    async def execute(
        cls,
        prompt_text: str,
        negative_prompt: str,
        seed: int,
        resolution: str,
        duration: int,
        ingredients_mode: str,
        aspect_ratio: float,
        image_ingredient_1: Optional[torch.Tensor] = None,
        image_ingredient_2: Optional[torch.Tensor] = None,
        image_ingredient_3: Optional[torch.Tensor] = None,
        image_ingredient_4: Optional[torch.Tensor] = None,
        image_ingredient_5: Optional[torch.Tensor] = None,
    ) -> IO.NodeOutput:
        all_image_bytes_io = []
        for image in [
            image_ingredient_1,
            image_ingredient_2,
            image_ingredient_3,
            image_ingredient_4,
            image_ingredient_5,
        ]:
            if image is not None:
                all_image_bytes_io.append(tensor_to_bytesio(image))

        pika_files = [
            ("images", (f"image_{i}.png", image_bytes_io, "image/png"))
            for i, image_bytes_io in enumerate(all_image_bytes_io)
        ]

        pika_request_data = pika_defs.PikaBodyGenerate22C2vGenerate22PikascenesPost(
            ingredientsMode=ingredients_mode,
            promptText=prompt_text,
            negativePrompt=negative_prompt,
            seed=seed,
            resolution=resolution,
            duration=duration,
            aspectRatio=aspect_ratio,
        )
        initial_operation = await sync_op(
            cls,
            ApiEndpoint(path=PATH_PIKASCENES, method="POST"),
            response_model=pika_defs.PikaGenerateResponse,
            data=pika_request_data,
            files=pika_files,
            content_type="multipart/form-data",
        )

        return await execute_task(initial_operation.video_id, cls)


class PikAdditionsNode(IO.ComfyNode):
    """Pika Pikadditions Node. Add an image into a video."""

    @classmethod
    def define_schema(cls) -> IO.Schema:
        return IO.Schema(
            node_id="Pikadditions",
            display_name="Pikadditions (Video Object Insertion)",
            description="Add any object or image into your video. Upload a video and specify what you'd like to add to create a seamlessly integrated result.",
            category="api node/video/Pika",
            inputs=[
                IO.Video.Input("video", tooltip="The video to add an image to."),
                IO.Image.Input("image", tooltip="The image to add to the video."),
                IO.String.Input("prompt_text", multiline=True),
                IO.String.Input("negative_prompt", multiline=True),
                IO.Int.Input(
                    "seed",
                    min=0,
                    max=0xFFFFFFFF,
                    control_after_generate=True,
                ),
            ],
            outputs=[IO.Video.Output()],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
        )

    @classmethod
    async def execute(
        cls,
        video: VideoInput,
        image: torch.Tensor,
        prompt_text: str,
        negative_prompt: str,
        seed: int,
    ) -> IO.NodeOutput:
        video_bytes_io = BytesIO()
        video.save_to(video_bytes_io, format=VideoContainer.MP4, codec=VideoCodec.H264)
        video_bytes_io.seek(0)

        image_bytes_io = tensor_to_bytesio(image)
        pika_files = {
            "video": ("video.mp4", video_bytes_io, "video/mp4"),
            "image": ("image.png", image_bytes_io, "image/png"),
        }
        pika_request_data = pika_defs.PikaBodyGeneratePikadditionsGeneratePikadditionsPost(
            promptText=prompt_text,
            negativePrompt=negative_prompt,
            seed=seed,
        )
        initial_operation = await sync_op(
            cls,
            ApiEndpoint(path=PATH_PIKADDITIONS, method="POST"),
            response_model=pika_defs.PikaGenerateResponse,
            data=pika_request_data,
            files=pika_files,
            content_type="multipart/form-data",
        )

        return await execute_task(initial_operation.video_id, cls)


class PikaSwapsNode(IO.ComfyNode):
    """Pika Pikaswaps Node."""

    @classmethod
    def define_schema(cls) -> IO.Schema:
        return IO.Schema(
            node_id="Pikaswaps",
            display_name="Pika Swaps (Video Object Replacement)",
            description="Swap out any object or region of your video with a new image or object. Define areas to replace either with a mask or coordinates.",
            category="api node/video/Pika",
            inputs=[
                IO.Video.Input("video", tooltip="The video to swap an object in."),
                IO.Image.Input(
                    "image",
                    tooltip="The image used to replace the masked object in the video.",
                    optional=True,
                ),
                IO.Mask.Input(
                    "mask",
                    tooltip="Use the mask to define areas in the video to replace.",
                    optional=True,
                ),
                IO.String.Input("prompt_text", multiline=True, optional=True),
                IO.String.Input("negative_prompt", multiline=True, optional=True),
                IO.Int.Input("seed", min=0, max=0xFFFFFFFF, control_after_generate=True, optional=True),
                IO.String.Input(
                    "region_to_modify",
                    multiline=True,
                    optional=True,
                    tooltip="Plaintext description of the object / region to modify.",
                ),
            ],
            outputs=[IO.Video.Output()],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
        )

    @classmethod
    async def execute(
        cls,
        video: VideoInput,
        image: Optional[torch.Tensor] = None,
        mask: Optional[torch.Tensor] = None,
        prompt_text: str = "",
        negative_prompt: str = "",
        seed: int = 0,
        region_to_modify: str = "",
    ) -> IO.NodeOutput:
        video_bytes_io = BytesIO()
        video.save_to(video_bytes_io, format=VideoContainer.MP4, codec=VideoCodec.H264)
        video_bytes_io.seek(0)
        pika_files = {
            "video": ("video.mp4", video_bytes_io, "video/mp4"),
        }
        if mask is not None:
            pika_files["modifyRegionMask"] = ("mask.png", tensor_to_bytesio(mask), "image/png")
        if image is not None:
            pika_files["image"] = ("image.png", tensor_to_bytesio(image), "image/png")

        pika_request_data = pika_defs.PikaBodyGeneratePikaswapsGeneratePikaswapsPost(
            promptText=prompt_text,
            negativePrompt=negative_prompt,
            seed=seed,
            modifyRegionRoi=region_to_modify if region_to_modify else None,
        )
        initial_operation = await sync_op(
            cls,
            ApiEndpoint(path=PATH_PIKASWAPS, method="POST"),
            response_model=pika_defs.PikaGenerateResponse,
            data=pika_request_data,
            files=pika_files,
            content_type="multipart/form-data",
        )
        return await execute_task(initial_operation.video_id, cls)


class PikaffectsNode(IO.ComfyNode):
    """Pika Pikaffects Node."""

    @classmethod
    def define_schema(cls) -> IO.Schema:
        return IO.Schema(
            node_id="Pikaffects",
            display_name="Pikaffects (Video Effects)",
            description="Generate a video with a specific Pikaffect. Supported Pikaffects: Cake-ify, Crumble, Crush, Decapitate, Deflate, Dissolve, Explode, Eye-pop, Inflate, Levitate, Melt, Peel, Poke, Squish, Ta-da, Tear",
            category="api node/video/Pika",
            inputs=[
                IO.Image.Input("image", tooltip="The reference image to apply the Pikaffect to."),
                IO.Combo.Input(
                    "pikaffect", options=pika_defs.Pikaffect, default="Cake-ify"
                ),
                IO.String.Input("prompt_text", multiline=True),
                IO.String.Input("negative_prompt", multiline=True),
                IO.Int.Input("seed", min=0, max=0xFFFFFFFF, control_after_generate=True),
            ],
            outputs=[IO.Video.Output()],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
        )

    @classmethod
    async def execute(
        cls,
        image: torch.Tensor,
        pikaffect: str,
        prompt_text: str,
        negative_prompt: str,
        seed: int,
    ) -> IO.NodeOutput:
        initial_operation = await sync_op(
            cls,
            ApiEndpoint(path=PATH_PIKAFFECTS, method="POST"),
            response_model=pika_defs.PikaGenerateResponse,
            data=pika_defs.PikaBodyGeneratePikaffectsGeneratePikaffectsPost(
                pikaffect=pikaffect,
                promptText=prompt_text,
                negativePrompt=negative_prompt,
                seed=seed,
            ),
            files={"image": ("image.png", tensor_to_bytesio(image), "image/png")},
            content_type="multipart/form-data",
        )
        return await execute_task(initial_operation.video_id, cls)


class PikaStartEndFrameNode(IO.ComfyNode):
    """PikaFrames v2.2 Node."""

    @classmethod
    def define_schema(cls) -> IO.Schema:
        return IO.Schema(
            node_id="PikaStartEndFrameNode2_2",
            display_name="Pika Start and End Frame to Video",
            description="Generate a video by combining your first and last frame. Upload two images to define the start and end points, and let the AI create a smooth transition between them.",
            category="api node/video/Pika",
            inputs=[
                IO.Image.Input("image_start", tooltip="The first image to combine."),
                IO.Image.Input("image_end", tooltip="The last image to combine."),
                *get_base_inputs_types(),
            ],
            outputs=[IO.Video.Output()],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
        )

    @classmethod
    async def execute(
        cls,
        image_start: torch.Tensor,
        image_end: torch.Tensor,
        prompt_text: str,
        negative_prompt: str,
        seed: int,
        resolution: str,
        duration: int,
    ) -> IO.NodeOutput:
        validate_string(prompt_text, field_name="prompt_text", min_length=1)
        pika_files = [
            ("keyFrames", ("image_start.png", tensor_to_bytesio(image_start), "image/png")),
            ("keyFrames", ("image_end.png", tensor_to_bytesio(image_end), "image/png")),
        ]
        initial_operation = await sync_op(
            cls,
            ApiEndpoint(path=PATH_PIKAFRAMES, method="POST"),
            response_model=pika_defs.PikaGenerateResponse,
            data=pika_defs.PikaBodyGenerate22KeyframeGenerate22PikaframesPost(
                promptText=prompt_text,
                negativePrompt=negative_prompt,
                seed=seed,
                resolution=resolution,
                duration=duration,
            ),
            files=pika_files,
            content_type="multipart/form-data",
        )
        return await execute_task(initial_operation.video_id, cls)


class PikaApiNodesExtension(ComfyExtension):
    @override
    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
        return [
            PikaImageToVideo,
            PikaTextToVideoNode,
            PikaScenes,
            PikAdditionsNode,
            PikaSwapsNode,
            PikaffectsNode,
            PikaStartEndFrameNode,
        ]


async def comfy_entrypoint() -> PikaApiNodesExtension:
    return PikaApiNodesExtension()
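All of the Pika nodes above follow the same two-step contract: a POST to one of the generate endpoints returns a video_id, and execute_task then polls PATH_VIDEO_GET until a downloadable URL is available. As a hedged illustration only, a further node could reuse get_base_inputs_types and execute_task as shown below; the node id is hypothetical, the aspect ratio is fixed for the sketch, and the class would still need to be appended to PikaApiNodesExtension.get_node_list to be registered.

# Hedged sketch only: "PikaExampleTextToVideo" is a hypothetical node id, not part of this module.
class PikaExampleTextToVideo(IO.ComfyNode):
    @classmethod
    def define_schema(cls) -> IO.Schema:
        return IO.Schema(
            node_id="PikaExampleTextToVideo",  # hypothetical
            display_name="Pika Example (sketch)",
            description="Illustrative only: reuses the shared Pika helpers defined above.",
            category="api node/video/Pika",
            inputs=[*get_base_inputs_types()],
            outputs=[IO.Video.Output()],
            hidden=[IO.Hidden.auth_token_comfy_org, IO.Hidden.api_key_comfy_org, IO.Hidden.unique_id],
            is_api_node=True,
        )

    @classmethod
    async def execute(cls, prompt_text, negative_prompt, seed, resolution, duration) -> IO.NodeOutput:
        # Step 1: submit the job and receive a video_id.
        initial = await sync_op(
            cls,
            ApiEndpoint(path=PATH_TEXT_TO_VIDEO, method="POST"),
            response_model=pika_defs.PikaGenerateResponse,
            data=pika_defs.PikaBodyGenerate22T2vGenerate22T2vPost(
                promptText=prompt_text,
                negativePrompt=negative_prompt,
                seed=seed,
                resolution=resolution,
                duration=duration,
                aspectRatio=1.7777777777777777,  # fixed 16:9 for this sketch
            ),
            content_type="application/x-www-form-urlencoded",
        )
        # Step 2: poll /proxy/pika/videos/{id} and return the downloaded video.
        return await execute_task(initial.video_id, cls)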
@@ -11,12 +11,11 @@ User Guides:
 
 """
 
-from typing import Union, Optional
-from typing_extensions import override
 from enum import Enum
 
-import torch
+from typing_extensions import override
 
+from comfy_api.latest import IO, ComfyExtension, Input, InputImpl
 from comfy_api_nodes.apis import (
     RunwayImageToVideoRequest,
     RunwayImageToVideoResponse,
@@ -44,8 +43,6 @@ from comfy_api_nodes.util import (
     sync_op,
     poll_op,
 )
-from comfy_api.input_impl import VideoFromFile
-from comfy_api.latest import ComfyExtension, IO
 
 PATH_IMAGE_TO_VIDEO = "/proxy/runway/image_to_video"
 PATH_TEXT_TO_IMAGE = "/proxy/runway/text_to_image"
@@ -80,7 +77,7 @@ class RunwayGen3aAspectRatio(str, Enum):
     field_1280_768 = "1280:768"
 
 
-def get_video_url_from_task_status(response: TaskStatusResponse) -> Union[str, None]:
+def get_video_url_from_task_status(response: TaskStatusResponse) -> str | None:
     """Returns the video URL from the task status response if it exists."""
     if hasattr(response, "output") and len(response.output) > 0:
         return response.output[0]
@@ -89,13 +86,13 @@ def get_video_url_from_task_status(response: TaskStatusResponse) -> Union[str, N
 
 def extract_progress_from_task_status(
     response: TaskStatusResponse,
-) -> Union[float, None]:
+) -> float | None:
     if hasattr(response, "progress") and response.progress is not None:
         return response.progress * 100
     return None
 
 
-def get_image_url_from_task_status(response: TaskStatusResponse) -> Union[str, None]:
+def get_image_url_from_task_status(response: TaskStatusResponse) -> str | None:
     """Returns the image URL from the task status response if it exists."""
     if hasattr(response, "output") and len(response.output) > 0:
         return response.output[0]
@@ -103,7 +100,7 @@ def get_image_url_from_task_status(response: TaskStatusResponse) -> Union[str, N
 
 
 async def get_response(
-    cls: type[IO.ComfyNode], task_id: str, estimated_duration: Optional[int] = None
+    cls: type[IO.ComfyNode], task_id: str, estimated_duration: int | None = None
 ) -> TaskStatusResponse:
     """Poll the task status until it is finished then get the response."""
     return await poll_op(
@@ -119,8 +116,8 @@ async def get_response(
 async def generate_video(
     cls: type[IO.ComfyNode],
     request: RunwayImageToVideoRequest,
-    estimated_duration: Optional[int] = None,
-) -> VideoFromFile:
+    estimated_duration: int | None = None,
+) -> InputImpl.VideoFromFile:
     initial_response = await sync_op(
         cls,
         endpoint=ApiEndpoint(path=PATH_IMAGE_TO_VIDEO, method="POST"),
@@ -193,7 +190,7 @@ class RunwayImageToVideoNodeGen3a(IO.ComfyNode):
     async def execute(
         cls,
         prompt: str,
-        start_frame: torch.Tensor,
+        start_frame: Input.Image,
         duration: str,
         ratio: str,
         seed: int,
@@ -283,7 +280,7 @@ class RunwayImageToVideoNodeGen4(IO.ComfyNode):
     async def execute(
         cls,
         prompt: str,
-        start_frame: torch.Tensor,
+        start_frame: Input.Image,
         duration: str,
         ratio: str,
         seed: int,
@@ -381,8 +378,8 @@ class RunwayFirstLastFrameNode(IO.ComfyNode):
     async def execute(
         cls,
         prompt: str,
-        start_frame: torch.Tensor,
-        end_frame: torch.Tensor,
+        start_frame: Input.Image,
+        end_frame: Input.Image,
         duration: str,
         ratio: str,
         seed: int,
@@ -467,7 +464,7 @@ class RunwayTextToImageNode(IO.ComfyNode):
         cls,
         prompt: str,
         ratio: str,
-        reference_image: Optional[torch.Tensor] = None,
+        reference_image: Input.Image | None = None,
     ) -> IO.NodeOutput:
         validate_string(prompt, min_length=1)
 
@@ -102,8 +102,9 @@ class TripoTextToModelNode(IO.ComfyNode):
                 IO.Int.Input("model_seed", default=42, optional=True),
                 IO.Int.Input("texture_seed", default=42, optional=True),
                 IO.Combo.Input("texture_quality", default="standard", options=["standard", "detailed"], optional=True),
-                IO.Int.Input("face_limit", default=-1, min=-1, max=500000, optional=True),
+                IO.Int.Input("face_limit", default=-1, min=-1, max=2000000, optional=True),
                 IO.Boolean.Input("quad", default=False, optional=True),
+                IO.Combo.Input("geometry_quality", default="standard", options=["standard", "detailed"], optional=True),
             ],
             outputs=[
                 IO.String.Output(display_name="model_file"),
@@ -131,6 +132,7 @@ class TripoTextToModelNode(IO.ComfyNode):
         model_seed: Optional[int] = None,
         texture_seed: Optional[int] = None,
         texture_quality: Optional[str] = None,
+        geometry_quality: Optional[str] = None,
         face_limit: Optional[int] = None,
         quad: Optional[bool] = None,
     ) -> IO.NodeOutput:
@@ -154,6 +156,7 @@ class TripoTextToModelNode(IO.ComfyNode):
                 texture_seed=texture_seed,
                 texture_quality=texture_quality,
                 face_limit=face_limit,
+                geometry_quality=geometry_quality,
                 auto_size=True,
                 quad=quad,
             ),
@@ -194,6 +197,7 @@ class TripoImageToModelNode(IO.ComfyNode):
                 ),
                 IO.Int.Input("face_limit", default=-1, min=-1, max=500000, optional=True),
                 IO.Boolean.Input("quad", default=False, optional=True),
+                IO.Combo.Input("geometry_quality", default="standard", options=["standard", "detailed"], optional=True),
             ],
             outputs=[
                 IO.String.Output(display_name="model_file"),
@@ -220,6 +224,7 @@ class TripoImageToModelNode(IO.ComfyNode):
         orientation=None,
         texture_seed: Optional[int] = None,
         texture_quality: Optional[str] = None,
+        geometry_quality: Optional[str] = None,
         texture_alignment: Optional[str] = None,
         face_limit: Optional[int] = None,
         quad: Optional[bool] = None,
@@ -246,6 +251,7 @@ class TripoImageToModelNode(IO.ComfyNode):
                 pbr=pbr,
                 model_seed=model_seed,
                 orientation=orientation,
+                geometry_quality=geometry_quality,
                 texture_alignment=texture_alignment,
                 texture_seed=texture_seed,
                 texture_quality=texture_quality,
@@ -295,6 +301,7 @@ class TripoMultiviewToModelNode(IO.ComfyNode):
                 ),
                 IO.Int.Input("face_limit", default=-1, min=-1, max=500000, optional=True),
                 IO.Boolean.Input("quad", default=False, optional=True),
+                IO.Combo.Input("geometry_quality", default="standard", options=["standard", "detailed"], optional=True),
             ],
             outputs=[
                 IO.String.Output(display_name="model_file"),
@@ -323,6 +330,7 @@ class TripoMultiviewToModelNode(IO.ComfyNode):
         model_seed: Optional[int] = None,
         texture_seed: Optional[int] = None,
         texture_quality: Optional[str] = None,
+        geometry_quality: Optional[str] = None,
         texture_alignment: Optional[str] = None,
         face_limit: Optional[int] = None,
         quad: Optional[bool] = None,
@@ -359,6 +367,7 @@ class TripoMultiviewToModelNode(IO.ComfyNode):
                 model_seed=model_seed,
                 texture_seed=texture_seed,
                 texture_quality=texture_quality,
+                geometry_quality=geometry_quality,
                 texture_alignment=texture_alignment,
                 face_limit=face_limit,
                 quad=quad,
@@ -508,6 +517,8 @@ class TripoRetargetNode(IO.ComfyNode):
                     options=[
                         "preset:idle",
                         "preset:walk",
+                        "preset:run",
+                        "preset:dive",
                         "preset:climb",
                         "preset:jump",
                         "preset:slash",
@@ -515,6 +526,11 @@ class TripoRetargetNode(IO.ComfyNode):
                         "preset:hurt",
                         "preset:fall",
                         "preset:turn",
+                        "preset:quadruped:walk",
+                        "preset:hexapod:walk",
+                        "preset:octopod:walk",
+                        "preset:serpentine:march",
+                        "preset:aquatic:march"
                     ],
                 ),
             ],
@@ -563,7 +579,7 @@ class TripoConversionNode(IO.ComfyNode):
                     "face_limit",
                     default=-1,
                     min=-1,
-                    max=500000,
+                    max=2000000,
                     optional=True,
                 ),
                 IO.Int.Input(
@@ -579,6 +595,40 @@ class TripoConversionNode(IO.ComfyNode):
                     default="JPEG",
                     optional=True,
                 ),
+                IO.Boolean.Input("force_symmetry", default=False, optional=True),
+                IO.Boolean.Input("flatten_bottom", default=False, optional=True),
+                IO.Float.Input(
+                    "flatten_bottom_threshold",
+                    default=0.0,
+                    min=0.0,
+                    max=1.0,
+                    optional=True,
+                ),
+                IO.Boolean.Input("pivot_to_center_bottom", default=False, optional=True),
+                IO.Float.Input(
+                    "scale_factor",
+                    default=1.0,
+                    min=0.0,
+                    optional=True,
+                ),
+                IO.Boolean.Input("with_animation", default=False, optional=True),
+                IO.Boolean.Input("pack_uv", default=False, optional=True),
+                IO.Boolean.Input("bake", default=False, optional=True),
+                IO.String.Input("part_names", default="", optional=True), # comma-separated list
+                IO.Combo.Input(
+                    "fbx_preset",
+                    options=["blender", "mixamo", "3dsmax"],
+                    default="blender",
+                    optional=True,
+                ),
+                IO.Boolean.Input("export_vertex_colors", default=False, optional=True),
+                IO.Combo.Input(
+                    "export_orientation",
+                    options=["align_image", "default"],
+                    default="default",
+                    optional=True,
+                ),
+                IO.Boolean.Input("animate_in_place", default=False, optional=True),
             ],
             outputs=[],
             hidden=[
@@ -604,12 +654,31 @@ class TripoConversionNode(IO.ComfyNode):
         original_model_task_id,
         format: str,
         quad: bool,
+        force_symmetry: bool,
         face_limit: int,
+        flatten_bottom: bool,
+        flatten_bottom_threshold: float,
         texture_size: int,
         texture_format: str,
+        pivot_to_center_bottom: bool,
+        scale_factor: float,
+        with_animation: bool,
+        pack_uv: bool,
+        bake: bool,
+        part_names: str,
+        fbx_preset: str,
+        export_vertex_colors: bool,
+        export_orientation: str,
+        animate_in_place: bool,
     ) -> IO.NodeOutput:
         if not original_model_task_id:
             raise RuntimeError("original_model_task_id is required")
+
+        # Parse part_names from comma-separated string to list
+        part_names_list = None
+        if part_names and part_names.strip():
+            part_names_list = [name.strip() for name in part_names.split(',') if name.strip()]
+
         response = await sync_op(
             cls,
             endpoint=ApiEndpoint(path="/proxy/tripo/v2/openapi/task", method="POST"),
@@ -618,9 +687,22 @@ class TripoConversionNode(IO.ComfyNode):
                 original_model_task_id=original_model_task_id,
                 format=format,
                 quad=quad if quad else None,
+                force_symmetry=force_symmetry if force_symmetry else None,
                 face_limit=face_limit if face_limit != -1 else None,
+                flatten_bottom=flatten_bottom if flatten_bottom else None,
+                flatten_bottom_threshold=flatten_bottom_threshold if flatten_bottom_threshold != 0.0 else None,
                 texture_size=texture_size if texture_size != 4096 else None,
                 texture_format=texture_format if texture_format != "JPEG" else None,
+                pivot_to_center_bottom=pivot_to_center_bottom if pivot_to_center_bottom else None,
+                scale_factor=scale_factor if scale_factor != 1.0 else None,
+                with_animation=with_animation if with_animation else None,
+                pack_uv=pack_uv if pack_uv else None,
+                bake=bake if bake else None,
+                part_names=part_names_list,
+                fbx_preset=fbx_preset if fbx_preset != "blender" else None,
+                export_vertex_colors=export_vertex_colors if export_vertex_colors else None,
+                export_orientation=export_orientation if export_orientation != "default" else None,
+                animate_in_place=animate_in_place if animate_in_place else None,
             ),
         )
         return await poll_until_finished(cls, response, average_duration=30)
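A recurring convention in the TripoConversionNode changes above: a widget value is forwarded to the API only when it differs from its default, so untouched fields are omitted from the request, and the comma-separated part_names string is parsed into a list. A small self-contained sketch of that convention follows; the helper names are illustrative and do not exist in the codebase.

def non_default(value, default):
    # Return None so the field is dropped from the request when the widget is still at its default.
    return value if value != default else None


def parse_part_names(part_names: str):
    # "head, torso, " -> ["head", "torso"]; empty input -> None.
    names = [name.strip() for name in part_names.split(",") if name.strip()]
    return names or None


assert non_default(2000000, -1) == 2000000
assert non_default(-1, -1) is None
assert parse_part_names("head, torso, ") == ["head", "torso"]
assert parse_part_names("") is None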
@@ -3,13 +3,15 @@ from io import BytesIO
 
 from typing_extensions import override
 
-from comfy_api.input_impl.video_types import VideoFromFile
-from comfy_api.latest import IO, ComfyExtension
+from comfy_api.latest import IO, ComfyExtension, Input, InputImpl
 from comfy_api_nodes.apis.veo_api import (
     VeoGenVidPollRequest,
     VeoGenVidPollResponse,
     VeoGenVidRequest,
     VeoGenVidResponse,
+    VeoRequestInstance,
+    VeoRequestInstanceImage,
+    VeoRequestParameters,
 )
 from comfy_api_nodes.util import (
     ApiEndpoint,
@@ -228,7 +230,7 @@ class VeoVideoGenerationNode(IO.ComfyNode):
 
         # Check if video is provided as base64 or URL
         if hasattr(video, "bytesBase64Encoded") and video.bytesBase64Encoded:
-            return IO.NodeOutput(VideoFromFile(BytesIO(base64.b64decode(video.bytesBase64Encoded))))
+            return IO.NodeOutput(InputImpl.VideoFromFile(BytesIO(base64.b64decode(video.bytesBase64Encoded))))
 
         if hasattr(video, "gcsUri") and video.gcsUri:
             return IO.NodeOutput(await download_url_to_video_output(video.gcsUri))
@@ -346,12 +348,163 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode):
         )
 
 
+class Veo3FirstLastFrameNode(IO.ComfyNode):
+
+    @classmethod
+    def define_schema(cls):
+        return IO.Schema(
+            node_id="Veo3FirstLastFrameNode",
+            display_name="Google Veo 3 First-Last-Frame to Video",
+            category="api node/video/Veo",
+            description="Generate video using prompt and first and last frames.",
+            inputs=[
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    default="",
+                    tooltip="Text description of the video",
+                ),
+                IO.String.Input(
+                    "negative_prompt",
+                    multiline=True,
+                    default="",
+                    tooltip="Negative text prompt to guide what to avoid in the video",
+                ),
+                IO.Combo.Input("resolution", options=["720p", "1080p"]),
+                IO.Combo.Input(
+                    "aspect_ratio",
+                    options=["16:9", "9:16"],
+                    default="16:9",
+                    tooltip="Aspect ratio of the output video",
+                ),
+                IO.Int.Input(
+                    "duration",
+                    default=8,
+                    min=4,
+                    max=8,
+                    step=2,
+                    display_mode=IO.NumberDisplay.slider,
+                    tooltip="Duration of the output video in seconds",
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=0xFFFFFFFF,
+                    step=1,
+                    display_mode=IO.NumberDisplay.number,
+                    control_after_generate=True,
+                    tooltip="Seed for video generation",
+                ),
+                IO.Image.Input("first_frame", tooltip="Start frame"),
+                IO.Image.Input("last_frame", tooltip="End frame"),
+                IO.Combo.Input(
+                    "model",
+                    options=["veo-3.1-generate", "veo-3.1-fast-generate"],
+                    default="veo-3.1-fast-generate",
+                ),
+                IO.Boolean.Input(
+                    "generate_audio",
+                    default=True,
+                    tooltip="Generate audio for the video.",
+                ),
+            ],
+            outputs=[
+                IO.Video.Output(),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        prompt: str,
+        negative_prompt: str,
+        resolution: str,
+        aspect_ratio: str,
+        duration: int,
+        seed: int,
+        first_frame: Input.Image,
+        last_frame: Input.Image,
+        model: str,
+        generate_audio: bool,
+    ):
+        model = MODELS_MAP[model]
+        initial_response = await sync_op(
+            cls,
+            ApiEndpoint(path=f"/proxy/veo/{model}/generate", method="POST"),
+            response_model=VeoGenVidResponse,
+            data=VeoGenVidRequest(
+                instances=[
+                    VeoRequestInstance(
+                        prompt=prompt,
+                        image=VeoRequestInstanceImage(
+                            bytesBase64Encoded=tensor_to_base64_string(first_frame), mimeType="image/png"
+                        ),
+                        lastFrame=VeoRequestInstanceImage(
+                            bytesBase64Encoded=tensor_to_base64_string(last_frame), mimeType="image/png"
+                        ),
+                    ),
+                ],
+                parameters=VeoRequestParameters(
+                    aspectRatio=aspect_ratio,
+                    personGeneration="ALLOW",
+                    durationSeconds=duration,
+                    enhancePrompt=True, # cannot be False for Veo3
+                    seed=seed,
+                    generateAudio=generate_audio,
+                    negativePrompt=negative_prompt,
+                    resolution=resolution,
+                ),
+            ),
+        )
+        poll_response = await poll_op(
+            cls,
+            ApiEndpoint(path=f"/proxy/veo/{model}/poll", method="POST"),
+            response_model=VeoGenVidPollResponse,
+            status_extractor=lambda r: "completed" if r.done else "pending",
+            data=VeoGenVidPollRequest(
+                operationName=initial_response.name,
+            ),
+            poll_interval=5.0,
+            estimated_duration=AVERAGE_DURATION_VIDEO_GEN,
+        )
+
+        if poll_response.error:
+            raise Exception(f"Veo API error: {poll_response.error.message} (code: {poll_response.error.code})")
+
+        response = poll_response.response
+        filtered_count = response.raiMediaFilteredCount
+        if filtered_count:
+            reasons = response.raiMediaFilteredReasons or []
+            reason_part = f": {reasons[0]}" if reasons else ""
+            raise Exception(
+                f"Content blocked by Google's Responsible AI filters{reason_part} "
+                f"({filtered_count} video{'s' if filtered_count != 1 else ''} filtered)."
+            )
+
+        if response.videos:
+            video = response.videos[0]
+            if video.bytesBase64Encoded:
+                return IO.NodeOutput(InputImpl.VideoFromFile(BytesIO(base64.b64decode(video.bytesBase64Encoded))))
+            if video.gcsUri:
+                return IO.NodeOutput(await download_url_to_video_output(video.gcsUri))
+            raise Exception("Video returned but no data or URL was provided")
+        raise Exception("Video generation completed but no video was returned")
+
+
 class VeoExtension(ComfyExtension):
     @override
     async def get_node_list(self) -> list[type[IO.ComfyNode]]:
         return [
             VeoVideoGenerationNode,
             Veo3VideoGenerationNode,
+            Veo3FirstLastFrameNode,
         ]
 
 
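The new Veo3FirstLastFrameNode adapts Google's long-running-operation response to the generic poller by mapping the boolean done flag onto the string statuses that poll_op understands. A minimal, self-contained sketch of that mapping; the response class below is a stand-in, not the real VeoGenVidPollResponse.

from dataclasses import dataclass


@dataclass
class FakePollResponse:  # stand-in for VeoGenVidPollResponse
    done: bool


def veo_status(r) -> str:
    # Same mapping as the lambda passed to poll_op above.
    return "completed" if r.done else "pending"


assert veo_status(FakePollResponse(done=False)) == "pending"
assert veo_status(FakePollResponse(done=True)) == "completed"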
@@ -47,6 +47,7 @@ from .validation_utils import (
     validate_string,
     validate_video_dimensions,
     validate_video_duration,
+    validate_video_frame_count,
 )
 
 __all__ = [
@@ -94,6 +95,7 @@ __all__ = [
     "validate_string",
     "validate_video_dimensions",
     "validate_video_duration",
+    "validate_video_frame_count",
     # Misc functions
     "get_fs_object_size",
 ]
@@ -2,8 +2,8 @@ import asyncio
 import contextlib
 import os
 import time
+from collections.abc import Callable
 from io import BytesIO
-from typing import Callable, Optional, Union
 
 from comfy.cli_args import args
 from comfy.model_management import processing_interrupted
@@ -35,12 +35,12 @@ def default_base_url() -> str:
 
 async def sleep_with_interrupt(
     seconds: float,
-    node_cls: Optional[type[IO.ComfyNode]],
-    label: Optional[str] = None,
-    start_ts: Optional[float] = None,
-    estimated_total: Optional[int] = None,
+    node_cls: type[IO.ComfyNode] | None,
+    label: str | None = None,
+    start_ts: float | None = None,
+    estimated_total: int | None = None,
     *,
-    display_callback: Optional[Callable[[type[IO.ComfyNode], str, int, Optional[int]], None]] = None,
+    display_callback: Callable[[type[IO.ComfyNode], str, int, int | None], None] | None = None,
 ):
     """
     Sleep in 1s slices while:
@@ -65,7 +65,7 @@ def mimetype_to_extension(mime_type: str) -> str:
     return mime_type.split("/")[-1].lower()
 
 
-def get_fs_object_size(path_or_object: Union[str, BytesIO]) -> int:
+def get_fs_object_size(path_or_object: str | BytesIO) -> int:
     if isinstance(path_or_object, str):
         return os.path.getsize(path_or_object)
     return len(path_or_object.getvalue())
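The utility hunks in this and the next file are a typing cleanup rather than a behavior change: Optional[X] becomes X | None, Union[A, B] becomes A | B, typing.Type becomes the builtin type, and Callable/Iterable move from typing to collections.abc. A short self-contained before/after sketch of the pattern; the wait functions are illustrative, not taken from these modules.

# Before: older typing-module spellings
from typing import Callable, Optional, Union


def wait_old(seconds: float, label: Optional[str] = None,
             cb: Optional[Callable[[str], None]] = None) -> Union[str, None]:
    return label


# After: PEP 604 unions and collections.abc, as used in the updated modules
from collections.abc import Callable


def wait_new(seconds: float, label: str | None = None,
             cb: Callable[[str], None] | None = None) -> str | None:
    return label


assert wait_old(1.0, "x") == wait_new(1.0, "x") == "x"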
@ -4,10 +4,11 @@ import json
|
|||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
import uuid
|
import uuid
|
||||||
|
from collections.abc import Callable, Iterable
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from typing import Any, Callable, Iterable, Literal, Optional, Type, TypeVar, Union
|
from typing import Any, Literal, TypeVar
|
||||||
from urllib.parse import urljoin, urlparse
|
from urllib.parse import urljoin, urlparse
|
||||||
|
|
||||||
import aiohttp
|
import aiohttp
|
||||||
@ -37,8 +38,8 @@ class ApiEndpoint:
|
|||||||
path: str,
|
path: str,
|
||||||
method: Literal["GET", "POST", "PUT", "DELETE", "PATCH"] = "GET",
|
method: Literal["GET", "POST", "PUT", "DELETE", "PATCH"] = "GET",
|
||||||
*,
|
*,
|
||||||
query_params: Optional[dict[str, Any]] = None,
|
query_params: dict[str, Any] | None = None,
|
||||||
headers: Optional[dict[str, str]] = None,
|
headers: dict[str, str] | None = None,
|
||||||
):
|
):
|
||||||
self.path = path
|
self.path = path
|
||||||
self.method = method
|
self.method = method
|
||||||
@ -52,18 +53,18 @@ class _RequestConfig:
|
|||||||
endpoint: ApiEndpoint
|
endpoint: ApiEndpoint
|
||||||
timeout: float
|
timeout: float
|
||||||
content_type: str
|
content_type: str
|
||||||
data: Optional[dict[str, Any]]
|
data: dict[str, Any] | None
|
||||||
files: Optional[Union[dict[str, Any], list[tuple[str, Any]]]]
|
files: dict[str, Any] | list[tuple[str, Any]] | None
|
||||||
multipart_parser: Optional[Callable]
|
multipart_parser: Callable | None
|
||||||
max_retries: int
|
max_retries: int
|
||||||
retry_delay: float
|
retry_delay: float
|
||||||
retry_backoff: float
|
retry_backoff: float
|
||||||
wait_label: str = "Waiting"
|
wait_label: str = "Waiting"
|
||||||
monitor_progress: bool = True
|
monitor_progress: bool = True
|
||||||
estimated_total: Optional[int] = None
|
estimated_total: int | None = None
|
||||||
final_label_on_success: Optional[str] = "Completed"
|
final_label_on_success: str | None = "Completed"
|
||||||
progress_origin_ts: Optional[float] = None
|
progress_origin_ts: float | None = None
|
||||||
price_extractor: Optional[Callable[[dict[str, Any]], Optional[float]]] = None
|
price_extractor: Callable[[dict[str, Any]], float | None] | None = None
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@ -71,10 +72,10 @@ class _PollUIState:
|
|||||||
started: float
|
started: float
|
||||||
status_label: str = "Queued"
|
status_label: str = "Queued"
|
||||||
is_queued: bool = True
|
is_queued: bool = True
|
||||||
price: Optional[float] = None
|
price: float | None = None
|
||||||
estimated_duration: Optional[int] = None
|
estimated_duration: int | None = None
|
||||||
base_processing_elapsed: float = 0.0 # sum of completed active intervals
|
base_processing_elapsed: float = 0.0 # sum of completed active intervals
|
||||||
active_since: Optional[float] = None # start time of current active interval (None if queued)
|
active_since: float | None = None # start time of current active interval (None if queued)
|
||||||
|
|
||||||
|
|
||||||
_RETRY_STATUS = {408, 429, 500, 502, 503, 504}
|
_RETRY_STATUS = {408, 429, 500, 502, 503, 504}
|
||||||
@ -87,20 +88,20 @@ async def sync_op(
|
|||||||
cls: type[IO.ComfyNode],
|
cls: type[IO.ComfyNode],
|
||||||
endpoint: ApiEndpoint,
|
endpoint: ApiEndpoint,
|
||||||
*,
|
*,
|
||||||
response_model: Type[M],
|
response_model: type[M],
|
||||||
price_extractor: Optional[Callable[[M], Optional[float]]] = None,
|
price_extractor: Callable[[M | Any], float | None] | None = None,
|
||||||
data: Optional[BaseModel] = None,
|
data: BaseModel | None = None,
|
||||||
files: Optional[Union[dict[str, Any], list[tuple[str, Any]]]] = None,
|
files: dict[str, Any] | list[tuple[str, Any]] | None = None,
|
||||||
content_type: str = "application/json",
|
content_type: str = "application/json",
|
||||||
timeout: float = 3600.0,
|
timeout: float = 3600.0,
|
||||||
multipart_parser: Optional[Callable] = None,
|
multipart_parser: Callable | None = None,
|
||||||
max_retries: int = 3,
|
max_retries: int = 3,
|
||||||
retry_delay: float = 1.0,
|
retry_delay: float = 1.0,
|
||||||
retry_backoff: float = 2.0,
|
retry_backoff: float = 2.0,
|
||||||
wait_label: str = "Waiting for server",
|
wait_label: str = "Waiting for server",
|
||||||
estimated_duration: Optional[int] = None,
|
estimated_duration: int | None = None,
|
||||||
final_label_on_success: Optional[str] = "Completed",
|
final_label_on_success: str | None = "Completed",
|
||||||
progress_origin_ts: Optional[float] = None,
|
progress_origin_ts: float | None = None,
|
||||||
monitor_progress: bool = True,
|
monitor_progress: bool = True,
|
||||||
) -> M:
|
) -> M:
|
||||||
raw = await sync_op_raw(
|
raw = await sync_op_raw(
|
||||||
@ -131,22 +132,22 @@ async def poll_op(
|
|||||||
cls: type[IO.ComfyNode],
|
cls: type[IO.ComfyNode],
|
||||||
poll_endpoint: ApiEndpoint,
|
poll_endpoint: ApiEndpoint,
|
||||||
*,
|
*,
|
||||||
response_model: Type[M],
|
response_model: type[M],
|
||||||
status_extractor: Callable[[M], Optional[Union[str, int]]],
|
status_extractor: Callable[[M | Any], str | int | None],
|
||||||
progress_extractor: Optional[Callable[[M], Optional[int]]] = None,
|
progress_extractor: Callable[[M | Any], int | None] | None = None,
|
||||||
price_extractor: Optional[Callable[[M], Optional[float]]] = None,
|
price_extractor: Callable[[M | Any], float | None] | None = None,
|
||||||
completed_statuses: Optional[list[Union[str, int]]] = None,
|
completed_statuses: list[str | int] | None = None,
|
||||||
failed_statuses: Optional[list[Union[str, int]]] = None,
|
failed_statuses: list[str | int] | None = None,
|
||||||
queued_statuses: Optional[list[Union[str, int]]] = None,
|
queued_statuses: list[str | int] | None = None,
|
||||||
data: Optional[BaseModel] = None,
|
data: BaseModel | None = None,
|
||||||
poll_interval: float = 5.0,
|
poll_interval: float = 5.0,
|
||||||
max_poll_attempts: int = 120,
|
max_poll_attempts: int = 120,
|
||||||
timeout_per_poll: float = 120.0,
|
timeout_per_poll: float = 120.0,
|
||||||
max_retries_per_poll: int = 3,
|
max_retries_per_poll: int = 3,
|
||||||
retry_delay_per_poll: float = 1.0,
|
retry_delay_per_poll: float = 1.0,
|
||||||
retry_backoff_per_poll: float = 2.0,
|
retry_backoff_per_poll: float = 2.0,
|
||||||
estimated_duration: Optional[int] = None,
|
estimated_duration: int | None = None,
|
||||||
cancel_endpoint: Optional[ApiEndpoint] = None,
|
cancel_endpoint: ApiEndpoint | None = None,
|
||||||
cancel_timeout: float = 10.0,
|
cancel_timeout: float = 10.0,
|
||||||
) -> M:
|
) -> M:
|
||||||
raw = await poll_op_raw(
|
raw = await poll_op_raw(
|
||||||
@ -178,22 +179,22 @@ async def sync_op_raw(
|
|||||||
cls: type[IO.ComfyNode],
|
cls: type[IO.ComfyNode],
|
||||||
endpoint: ApiEndpoint,
|
endpoint: ApiEndpoint,
|
||||||
*,
|
*,
|
||||||
price_extractor: Optional[Callable[[dict[str, Any]], Optional[float]]] = None,
|
price_extractor: Callable[[dict[str, Any]], float | None] | None = None,
|
||||||
data: Optional[Union[dict[str, Any], BaseModel]] = None,
|
data: dict[str, Any] | BaseModel | None = None,
|
||||||
files: Optional[Union[dict[str, Any], list[tuple[str, Any]]]] = None,
|
files: dict[str, Any] | list[tuple[str, Any]] | None = None,
|
||||||
content_type: str = "application/json",
|
content_type: str = "application/json",
|
||||||
timeout: float = 3600.0,
|
timeout: float = 3600.0,
|
||||||
multipart_parser: Optional[Callable] = None,
|
multipart_parser: Callable | None = None,
|
||||||
max_retries: int = 3,
|
max_retries: int = 3,
|
||||||
retry_delay: float = 1.0,
|
retry_delay: float = 1.0,
|
||||||
retry_backoff: float = 2.0,
|
retry_backoff: float = 2.0,
|
||||||
wait_label: str = "Waiting for server",
|
wait_label: str = "Waiting for server",
|
||||||
estimated_duration: Optional[int] = None,
|
estimated_duration: int | None = None,
|
||||||
as_binary: bool = False,
|
as_binary: bool = False,
|
||||||
final_label_on_success: Optional[str] = "Completed",
|
final_label_on_success: str | None = "Completed",
|
||||||
progress_origin_ts: Optional[float] = None,
|
progress_origin_ts: float | None = None,
|
||||||
monitor_progress: bool = True,
|
monitor_progress: bool = True,
|
||||||
) -> Union[dict[str, Any], bytes]:
|
) -> dict[str, Any] | bytes:
|
||||||
"""
|
"""
|
||||||
Make a single network request.
|
Make a single network request.
|
||||||
- If as_binary=False (default): returns JSON dict (or {'_raw': '<text>'} if non-JSON).
|
- If as_binary=False (default): returns JSON dict (or {'_raw': '<text>'} if non-JSON).
|
||||||
@ -229,21 +230,21 @@ async def poll_op_raw(
|
|||||||
cls: type[IO.ComfyNode],
|
cls: type[IO.ComfyNode],
|
||||||
poll_endpoint: ApiEndpoint,
|
poll_endpoint: ApiEndpoint,
|
||||||
*,
|
*,
|
||||||
status_extractor: Callable[[dict[str, Any]], Optional[Union[str, int]]],
|
status_extractor: Callable[[dict[str, Any]], str | int | None],
|
||||||
progress_extractor: Optional[Callable[[dict[str, Any]], Optional[int]]] = None,
|
progress_extractor: Callable[[dict[str, Any]], int | None] | None = None,
|
||||||
price_extractor: Optional[Callable[[dict[str, Any]], Optional[float]]] = None,
|
price_extractor: Callable[[dict[str, Any]], float | None] | None = None,
|
||||||
completed_statuses: Optional[list[Union[str, int]]] = None,
|
completed_statuses: list[str | int] | None = None,
|
||||||
failed_statuses: Optional[list[Union[str, int]]] = None,
|
failed_statuses: list[str | int] | None = None,
|
||||||
queued_statuses: Optional[list[Union[str, int]]] = None,
|
queued_statuses: list[str | int] | None = None,
|
||||||
data: Optional[Union[dict[str, Any], BaseModel]] = None,
|
data: dict[str, Any] | BaseModel | None = None,
|
||||||
poll_interval: float = 5.0,
|
poll_interval: float = 5.0,
|
||||||
max_poll_attempts: int = 120,
|
max_poll_attempts: int = 120,
|
||||||
timeout_per_poll: float = 120.0,
|
timeout_per_poll: float = 120.0,
|
||||||
max_retries_per_poll: int = 3,
|
max_retries_per_poll: int = 3,
|
||||||
retry_delay_per_poll: float = 1.0,
|
retry_delay_per_poll: float = 1.0,
|
||||||
retry_backoff_per_poll: float = 2.0,
|
retry_backoff_per_poll: float = 2.0,
|
||||||
estimated_duration: Optional[int] = None,
|
estimated_duration: int | None = None,
|
||||||
cancel_endpoint: Optional[ApiEndpoint] = None,
|
cancel_endpoint: ApiEndpoint | None = None,
|
||||||
cancel_timeout: float = 10.0,
|
cancel_timeout: float = 10.0,
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
@@ -261,7 +262,7 @@ async def poll_op_raw(
     consumed_attempts = 0  # counts only non-queued polls
 
     progress_bar = utils.ProgressBar(100) if progress_extractor else None
-    last_progress: Optional[int] = None
+    last_progress: int | None = None
 
     state = _PollUIState(started=started, estimated_duration=estimated_duration)
     stop_ticker = asyncio.Event()
@@ -420,10 +421,10 @@ async def poll_op_raw(
 
 def _display_text(
     node_cls: type[IO.ComfyNode],
-    text: Optional[str],
+    text: str | None,
     *,
-    status: Optional[Union[str, int]] = None,
-    price: Optional[float] = None,
+    status: str | int | None = None,
+    price: float | None = None,
 ) -> None:
     display_lines: list[str] = []
     if status:
@@ -440,13 +441,13 @@ def _display_text(
 
 def _display_time_progress(
     node_cls: type[IO.ComfyNode],
-    status: Optional[Union[str, int]],
+    status: str | int | None,
     elapsed_seconds: int,
-    estimated_total: Optional[int] = None,
+    estimated_total: int | None = None,
     *,
-    price: Optional[float] = None,
-    is_queued: Optional[bool] = None,
-    processing_elapsed_seconds: Optional[int] = None,
+    price: float | None = None,
+    is_queued: bool | None = None,
+    processing_elapsed_seconds: int | None = None,
 ) -> None:
     if estimated_total is not None and estimated_total > 0 and is_queued is False:
         pe = processing_elapsed_seconds if processing_elapsed_seconds is not None else elapsed_seconds
@@ -488,7 +489,7 @@ def _unpack_tuple(t: tuple) -> tuple[str, Any, str]:
     raise ValueError("files tuple must be (filename, file[, content_type])")
 
 
-def _merge_params(endpoint_params: dict[str, Any], method: str, data: Optional[dict[str, Any]]) -> dict[str, Any]:
+def _merge_params(endpoint_params: dict[str, Any], method: str, data: dict[str, Any] | None) -> dict[str, Any]:
     params = dict(endpoint_params or {})
     if method.upper() == "GET" and data:
         for k, v in data.items():
@@ -534,9 +535,9 @@ def _generate_operation_id(method: str, path: str, attempt: int) -> str:
 def _snapshot_request_body_for_logging(
     content_type: str,
     method: str,
-    data: Optional[dict[str, Any]],
-    files: Optional[Union[dict[str, Any], list[tuple[str, Any]]]],
-) -> Optional[Union[dict[str, Any], str]]:
+    data: dict[str, Any] | None,
+    files: dict[str, Any] | list[tuple[str, Any]] | None,
+) -> dict[str, Any] | str | None:
     if method.upper() == "GET":
         return None
     if content_type == "multipart/form-data":
@@ -586,13 +587,13 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool):
     attempt = 0
     delay = cfg.retry_delay
     operation_succeeded: bool = False
-    final_elapsed_seconds: Optional[int] = None
-    extracted_price: Optional[float] = None
+    final_elapsed_seconds: int | None = None
+    extracted_price: float | None = None
     while True:
         attempt += 1
         stop_event = asyncio.Event()
-        monitor_task: Optional[asyncio.Task] = None
-        sess: Optional[aiohttp.ClientSession] = None
+        monitor_task: asyncio.Task | None = None
+        sess: aiohttp.ClientSession | None = None
 
         operation_id = _generate_operation_id(method, cfg.endpoint.path, attempt)
         logging.debug("[DEBUG] HTTP %s %s (attempt %d)", method, url, attempt)
@@ -887,7 +888,7 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool):
     )
 
 
-def _validate_or_raise(response_model: Type[M], payload: Any) -> M:
+def _validate_or_raise(response_model: type[M], payload: Any) -> M:
     try:
         return response_model.model_validate(payload)
     except Exception as e:
@@ -902,9 +903,9 @@ def _validate_or_raise(response_model: Type[M], payload: Any) -> M:
 
 
 def _wrap_model_extractor(
-    response_model: Type[M],
-    extractor: Optional[Callable[[M], Any]],
-) -> Optional[Callable[[dict[str, Any]], Any]]:
+    response_model: type[M],
+    extractor: Callable[[M], Any] | None,
+) -> Callable[[dict[str, Any]], Any] | None:
     """Wrap a typed extractor so it can be used by the dict-based poller.
     Validates the dict into `response_model` before invoking `extractor`.
     Uses a small per-wrapper cache keyed by `id(dict)` to avoid re-validating
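The docstring above outlines a wrap-and-validate pattern: the poller passes plain dicts around, while nodes provide extractors typed against a pydantic model, so the wrapper validates once and caches by `id()` of the payload. The rest of the function body is not shown in this hunk; the following is only a rough sketch of that idea under those assumptions, with illustrative names, not the actual ComfyUI implementation:

    from typing import Any, Callable, TypeVar
    from pydantic import BaseModel

    M = TypeVar("M", bound=BaseModel)

    def wrap_model_extractor_sketch(
        response_model: type[M],
        extractor: Callable[[M], Any] | None,
    ) -> Callable[[dict[str, Any]], Any] | None:
        if extractor is None:
            return None
        cache: dict[int, M] = {}  # keyed by id() of the raw dict; keeps one entry at a time

        def _wrapped(payload: dict[str, Any]) -> Any:
            key = id(payload)
            if key not in cache:
                cache.clear()  # remember only the most recent payload
                cache[key] = response_model.model_validate(payload)
            return extractor(cache[key])

        return _wrapped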
@@ -929,10 +930,10 @@ def _wrap_model_extractor(
     return _wrapped
 
 
-def _normalize_statuses(values: Optional[Iterable[Union[str, int]]]) -> set[Union[str, int]]:
+def _normalize_statuses(values: Iterable[str | int] | None) -> set[str | int]:
     if not values:
         return set()
-    out: set[Union[str, int]] = set()
+    out: set[str | int] = set()
     for v in values:
         nv = _normalize_status_value(v)
         if nv is not None:
@@ -940,7 +941,7 @@ def _normalize_statuses(values: Optional[Iterable[Union[str, int]]]) -> set[Unio
     return out
 
 
-def _normalize_status_value(val: Union[str, int, None]) -> Union[str, int, None]:
+def _normalize_status_value(val: str | int | None) -> str | int | None:
     if isinstance(val, str):
         return val.strip().lower()
     return val
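Taken together, the two helpers above make status comparison case- and whitespace-insensitive for strings while leaving integer codes untouched, and collapse falsy input to an empty set. The expected behaviour, inferred directly from the code shown (the calls themselves are only illustrative):

    _normalize_status_value("  Completed ")            # -> "completed"
    _normalize_status_value(404)                       # -> 404
    _normalize_statuses(["Queued", "RUNNING", 202])    # -> {"queued", "running", 202}
    _normalize_statuses(None)                          # -> set()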
@@ -4,7 +4,6 @@ import math
 import mimetypes
 import uuid
 from io import BytesIO
-from typing import Optional
 
 import av
 import numpy as np
@@ -12,8 +11,7 @@ import torch
 from PIL import Image
 
 from comfy.utils import common_upscale
-from comfy_api.latest import Input, InputImpl
-from comfy_api.util import VideoCodec, VideoContainer
+from comfy_api.latest import Input, InputImpl, Types
 
 from ._helpers import mimetype_to_extension
 
@@ -57,7 +55,7 @@ def image_tensor_pair_to_batch(image1: torch.Tensor, image2: torch.Tensor) -> to
 
 def tensor_to_bytesio(
     image: torch.Tensor,
-    name: Optional[str] = None,
+    name: str | None = None,
     total_pixels: int = 2048 * 2048,
     mime_type: str = "image/png",
 ) -> BytesIO:
@@ -177,8 +175,8 @@ def audio_to_base64_string(audio: Input.Audio, container_format: str = "mp4", co
 
 def video_to_base64_string(
     video: Input.Video,
-    container_format: VideoContainer = None,
-    codec: VideoCodec = None
+    container_format: Types.VideoContainer | None = None,
+    codec: Types.VideoCodec | None = None,
 ) -> str:
     """
     Converts a video input to a base64 string.
@@ -189,12 +187,11 @@ def video_to_base64_string(
         codec: Optional codec to use (defaults to video.codec if available)
     """
     video_bytes_io = BytesIO()
-
-    # Use provided format/codec if specified, otherwise use video's own if available
-    format_to_use = container_format if container_format is not None else getattr(video, 'container', VideoContainer.MP4)
-    codec_to_use = codec if codec is not None else getattr(video, 'codec', VideoCodec.H264)
-
-    video.save_to(video_bytes_io, format=format_to_use, codec=codec_to_use)
+    video.save_to(
+        video_bytes_io,
+        format=container_format or getattr(video, "container", Types.VideoContainer.MP4),
+        codec=codec or getattr(video, "codec", Types.VideoCodec.H264),
+    )
     video_bytes_io.seek(0)
     return base64.b64encode(video_bytes_io.getvalue()).decode("utf-8")
 
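The refactored `video_to_base64_string` now hands the container and codec straight to `video.save_to`, falling back to the clip's own `container`/`codec` attributes (or MP4/H.264 when those are absent). A brief usage sketch, assuming an `Input.Video` object named `video` is already available:

    # Keep the clip's own container/codec (falling back to MP4/H.264):
    b64_default = video_to_base64_string(video)

    # Or force the encoding explicitly via the Types enums used in this diff:
    b64_mp4 = video_to_base64_string(
        video,
        container_format=Types.VideoContainer.MP4,
        codec=Types.VideoCodec.H264,
    )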
@@ -3,15 +3,15 @@ import contextlib
 import uuid
 from io import BytesIO
 from pathlib import Path
-from typing import IO, Optional, Union
+from typing import IO
 from urllib.parse import urljoin, urlparse
 
 import aiohttp
 import torch
 from aiohttp.client_exceptions import ClientError, ContentTypeError
 
-from comfy_api.input_impl import VideoFromFile
 from comfy_api.latest import IO as COMFY_IO
+from comfy_api.latest import InputImpl
 
 from . import request_logger
 from ._helpers import (
@@ -29,9 +29,9 @@ _RETRY_STATUS = {408, 429, 500, 502, 503, 504}
 
 async def download_url_to_bytesio(
     url: str,
-    dest: Optional[Union[BytesIO, IO[bytes], str, Path]],
+    dest: BytesIO | IO[bytes] | str | Path | None,
     *,
-    timeout: Optional[float] = None,
+    timeout: float | None = None,
     max_retries: int = 5,
     retry_delay: float = 1.0,
     retry_backoff: float = 2.0,
@@ -71,10 +71,10 @@ async def download_url_to_bytesio(
 
     is_path_sink = isinstance(dest, (str, Path))
     fhandle = None
-    session: Optional[aiohttp.ClientSession] = None
-    stop_evt: Optional[asyncio.Event] = None
-    monitor_task: Optional[asyncio.Task] = None
-    req_task: Optional[asyncio.Task] = None
+    session: aiohttp.ClientSession | None = None
+    stop_evt: asyncio.Event | None = None
+    monitor_task: asyncio.Task | None = None
+    req_task: asyncio.Task | None = None
 
     try:
         with contextlib.suppress(Exception):
@@ -234,11 +234,11 @@ async def download_url_to_video_output(
     timeout: float = None,
     max_retries: int = 5,
     cls: type[COMFY_IO.ComfyNode] = None,
-) -> VideoFromFile:
+) -> InputImpl.VideoFromFile:
     """Downloads a video from a URL and returns a `VIDEO` output."""
     result = BytesIO()
     await download_url_to_bytesio(video_url, result, timeout=timeout, max_retries=max_retries, cls=cls)
-    return VideoFromFile(result)
+    return InputImpl.VideoFromFile(result)
 
 
 async def download_url_as_bytesio(
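As the last hunk shows, the video download helper simply streams the URL into an in-memory buffer and wraps it in `InputImpl.VideoFromFile`, which is now imported from `comfy_api.latest` rather than `comfy_api.input_impl`. A hypothetical call from node code, assuming `video_url` and the node class `cls` are in scope:

    video_output = await download_url_to_video_output(video_url, timeout=60.0, cls=cls)
    # video_output is an InputImpl.VideoFromFile, suitable as a VIDEO socket result.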
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 import datetime
 import hashlib
 import json
@@ -4,15 +4,13 @@ import logging
 import time
 import uuid
 from io import BytesIO
-from typing import Optional, Union
 from urllib.parse import urlparse
 
 import aiohttp
 import torch
 from pydantic import BaseModel, Field
 
-from comfy_api.latest import IO, Input
-from comfy_api.util import VideoCodec, VideoContainer
+from comfy_api.latest import IO, Input, Types
 
 from . import request_logger
 from ._helpers import is_processing_interrupted, sleep_with_interrupt
@@ -32,7 +30,7 @@ from .conversions import (
 
 class UploadRequest(BaseModel):
     file_name: str = Field(..., description="Filename to upload")
-    content_type: Optional[str] = Field(
+    content_type: str | None = Field(
         None,
         description="Mime type of the file. For example: image/png, image/jpeg, video/mp4, etc.",
     )
@@ -48,22 +46,30 @@ async def upload_images_to_comfyapi(
     image: torch.Tensor,
     *,
     max_images: int = 8,
-    mime_type: Optional[str] = None,
-    wait_label: Optional[str] = "Uploading",
+    mime_type: str | None = None,
+    wait_label: str | None = "Uploading",
+    show_batch_index: bool = True,
 ) -> list[str]:
     """
     Uploads images to ComfyUI API and returns download URLs.
     To upload multiple images, stack them in the batch dimension first.
     """
-    # if batch, try to upload each file if max_images is greater than 0
+    # if batched, try to upload each file if max_images is greater than 0
     download_urls: list[str] = []
     is_batch = len(image.shape) > 3
     batch_len = image.shape[0] if is_batch else 1
+    num_to_upload = min(batch_len, max_images)
+    batch_start_ts = time.monotonic()
 
-    for idx in range(min(batch_len, max_images)):
+    for idx in range(num_to_upload):
         tensor = image[idx] if is_batch else image
         img_io = tensor_to_bytesio(tensor, mime_type=mime_type)
-        url = await upload_file_to_comfyapi(cls, img_io, img_io.name, mime_type, wait_label)
+        effective_label = wait_label
+        if wait_label and show_batch_index and num_to_upload > 1:
+            effective_label = f"{wait_label} ({idx + 1}/{num_to_upload})"
+
+        url = await upload_file_to_comfyapi(cls, img_io, img_io.name, mime_type, effective_label, batch_start_ts)
         download_urls.append(url)
     return download_urls
 
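The batched-upload change threads a per-item label and a shared start timestamp down to `upload_file_to_comfyapi`. Following the f-string in the hunk above, a four-image batch labelled "Uploading" would surface labels like these (illustrative values only):

    wait_label = "Uploading"
    num_to_upload = 4
    labels = [f"{wait_label} ({idx + 1}/{num_to_upload})" for idx in range(num_to_upload)]
    # -> ['Uploading (1/4)', 'Uploading (2/4)', 'Uploading (3/4)', 'Uploading (4/4)']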
@@ -92,9 +98,10 @@ async def upload_video_to_comfyapi(
     cls: type[IO.ComfyNode],
     video: Input.Video,
     *,
-    container: VideoContainer = VideoContainer.MP4,
-    codec: VideoCodec = VideoCodec.H264,
-    max_duration: Optional[int] = None,
+    container: Types.VideoContainer = Types.VideoContainer.MP4,
+    codec: Types.VideoCodec = Types.VideoCodec.H264,
+    max_duration: int | None = None,
+    wait_label: str | None = "Uploading",
 ) -> str:
     """
     Uploads a single video to ComfyUI API and returns its download URL.
@@ -119,15 +126,16 @@ async def upload_video_to_comfyapi(
     video.save_to(video_bytes_io, format=container, codec=codec)
     video_bytes_io.seek(0)
 
-    return await upload_file_to_comfyapi(cls, video_bytes_io, filename, upload_mime_type)
+    return await upload_file_to_comfyapi(cls, video_bytes_io, filename, upload_mime_type, wait_label)
 
 
 async def upload_file_to_comfyapi(
     cls: type[IO.ComfyNode],
     file_bytes_io: BytesIO,
     filename: str,
-    upload_mime_type: Optional[str],
-    wait_label: Optional[str] = "Uploading",
+    upload_mime_type: str | None,
+    wait_label: str | None = "Uploading",
+    progress_origin_ts: float | None = None,
 ) -> str:
     """Uploads a single file to ComfyUI API and returns its download URL."""
     if upload_mime_type is None:
@@ -148,6 +156,7 @@ async def upload_file_to_comfyapi(
         file_bytes_io,
         content_type=upload_mime_type,
         wait_label=wait_label,
+        progress_origin_ts=progress_origin_ts,
     )
     return create_resp.download_url
 
@@ -155,27 +164,18 @@ async def upload_file_to_comfyapi(
 async def upload_file(
     cls: type[IO.ComfyNode],
     upload_url: str,
-    file: Union[BytesIO, str],
+    file: BytesIO | str,
     *,
-    content_type: Optional[str] = None,
+    content_type: str | None = None,
     max_retries: int = 3,
     retry_delay: float = 1.0,
     retry_backoff: float = 2.0,
-    wait_label: Optional[str] = None,
+    wait_label: str | None = None,
+    progress_origin_ts: float | None = None,
 ) -> None:
     """
     Upload a file to a signed URL (e.g., S3 pre-signed PUT) with retries, Comfy progress display, and interruption.
 
-    Args:
-        cls: Node class (provides auth context + UI progress hooks).
-        upload_url: Pre-signed PUT URL.
-        file: BytesIO or path string.
-        content_type: Explicit MIME type. If None, we *suppress* Content-Type.
-        max_retries: Maximum retry attempts.
-        retry_delay: Initial delay in seconds.
-        retry_backoff: Exponential backoff factor.
-        wait_label: Progress label shown in Comfy UI.
-
     Raises:
         ProcessingInterrupted, LocalNetworkError, ApiServerError, Exception
     """
@@ -198,7 +198,7 @@ async def upload_file(
 
     attempt = 0
     delay = retry_delay
-    start_ts = time.monotonic()
+    start_ts = progress_origin_ts if progress_origin_ts is not None else time.monotonic()
     op_uuid = uuid.uuid4().hex[:8]
     while True:
         attempt += 1
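The new `progress_origin_ts` parameter lets callers anchor the elapsed-time display to an earlier moment; combined with the `batch_start_ts` captured in `upload_images_to_comfyapi`, later files in a batch keep counting from the batch start instead of resetting to zero. A small illustration with made-up timings:

    import time

    batch_start_ts = time.monotonic()              # captured once for the whole batch
    # ... roughly 12 s pass while the first file uploads ...
    start_ts = batch_start_ts                      # what upload_file now uses as its origin
    elapsed = int(time.monotonic() - start_ts)     # continues from ~12 s instead of 0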
@@ -218,7 +218,7 @@ async def upload_file(
                 return
 
         monitor_task = asyncio.create_task(_monitor())
-        sess: Optional[aiohttp.ClientSession] = None
+        sess: aiohttp.ClientSession | None = None
         try:
             try:
                 request_logger.log_request_response(
@@ -1,9 +1,7 @@
 import logging
-from typing import Optional
 
 import torch
 
-from comfy_api.input.video_types import VideoInput
 from comfy_api.latest import Input
 
 
@@ -18,10 +16,10 @@ def get_image_dimensions(image: torch.Tensor) -> tuple[int, int]:
 
 def validate_image_dimensions(
     image: torch.Tensor,
-    min_width: Optional[int] = None,
-    max_width: Optional[int] = None,
-    min_height: Optional[int] = None,
-    max_height: Optional[int] = None,
+    min_width: int | None = None,
+    max_width: int | None = None,
+    min_height: int | None = None,
+    max_height: int | None = None,
 ):
     height, width = get_image_dimensions(image)
 
@@ -37,8 +35,8 @@ def validate_image_dimensions(
 
 def validate_image_aspect_ratio(
     image: torch.Tensor,
-    min_ratio: Optional[tuple[float, float]] = None,  # e.g. (1, 4)
-    max_ratio: Optional[tuple[float, float]] = None,  # e.g. (4, 1)
+    min_ratio: tuple[float, float] | None = None,  # e.g. (1, 4)
+    max_ratio: tuple[float, float] | None = None,  # e.g. (4, 1)
     *,
     strict: bool = True,  # True -> (min, max); False -> [min, max]
 ) -> float:
@@ -54,8 +52,8 @@
 def validate_images_aspect_ratio_closeness(
     first_image: torch.Tensor,
     second_image: torch.Tensor,
     min_rel: float,  # e.g. 0.8
     max_rel: float,  # e.g. 1.25
     *,
     strict: bool = False,  # True -> (min, max); False -> [min, max]
 ) -> float:
@@ -84,8 +82,8 @@ def validate_images_aspect_ratio_closeness(
 
 def validate_aspect_ratio_string(
     aspect_ratio: str,
-    min_ratio: Optional[tuple[float, float]] = None,  # e.g. (1, 4)
-    max_ratio: Optional[tuple[float, float]] = None,  # e.g. (4, 1)
+    min_ratio: tuple[float, float] | None = None,  # e.g. (1, 4)
+    max_ratio: tuple[float, float] | None = None,  # e.g. (4, 1)
     *,
     strict: bool = False,  # True -> (min, max); False -> [min, max]
 ) -> float:
@@ -97,10 +95,10 @@ def validate_aspect_ratio_string(
 
 def validate_video_dimensions(
     video: Input.Video,
-    min_width: Optional[int] = None,
-    max_width: Optional[int] = None,
-    min_height: Optional[int] = None,
-    max_height: Optional[int] = None,
+    min_width: int | None = None,
+    max_width: int | None = None,
+    min_height: int | None = None,
+    max_height: int | None = None,
 ):
     try:
         width, height = video.get_dimensions()
@@ -120,8 +118,8 @@ def validate_video_dimensions(
 
 def validate_video_duration(
     video: Input.Video,
-    min_duration: Optional[float] = None,
-    max_duration: Optional[float] = None,
+    min_duration: float | None = None,
+    max_duration: float | None = None,
 ):
     try:
         duration = video.get_duration()
@@ -136,6 +134,23 @@ def validate_video_duration(
         raise ValueError(f"Video duration must be at most {max_duration}s, got {duration}s")
 
 
+def validate_video_frame_count(
+    video: Input.Video,
+    min_frame_count: int | None = None,
+    max_frame_count: int | None = None,
+):
+    try:
+        frame_count = video.get_frame_count()
+    except Exception as e:
+        logging.error("Error getting frame count of video: %s", e)
+        return
+
+    if min_frame_count is not None and min_frame_count > frame_count:
+        raise ValueError(f"Video frame count must be at least {min_frame_count}, got {frame_count}")
+    if max_frame_count is not None and frame_count > max_frame_count:
+        raise ValueError(f"Video frame count must be at most {max_frame_count}, got {frame_count}")
+
+
 def get_number_of_images(images):
     if isinstance(images, torch.Tensor):
         return images.shape[0] if images.ndim >= 4 else 1
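The added `validate_video_frame_count` follows the same conventions as the neighbouring validators: if the frame count cannot be read it logs the error and returns silently, otherwise it raises `ValueError` when a bound is violated. A hypothetical call from a node, with example bounds that are not taken from this diff:

    # video: Input.Video received by the node
    validate_video_frame_count(video, min_frame_count=16, max_frame_count=960)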
@@ -144,8 +159,8 @@ def get_number_of_images(images):
 
 def validate_audio_duration(
     audio: Input.Audio,
-    min_duration: Optional[float] = None,
-    max_duration: Optional[float] = None,
+    min_duration: float | None = None,
+    max_duration: float | None = None,
 ) -> None:
     sr = int(audio["sample_rate"])
     dur = int(audio["waveform"].shape[-1]) / sr
@@ -177,7 +192,7 @@ def validate_string(
     )
 
 
-def validate_container_format_is_mp4(video: VideoInput) -> None:
+def validate_container_format_is_mp4(video: Input.Video) -> None:
     """Validates video container format is MP4."""
     container_format = video.get_container_format()
     if container_format not in ["mp4", "mov,mp4,m4a,3gp,3g2,mj2"]:
@@ -194,8 +209,8 @@ def _ratio_from_tuple(r: tuple[float, float]) -> float:
 def _assert_ratio_bounds(
     ar: float,
     *,
-    min_ratio: Optional[tuple[float, float]] = None,
-    max_ratio: Optional[tuple[float, float]] = None,
+    min_ratio: tuple[float, float] | None = None,
+    max_ratio: tuple[float, float] | None = None,
     strict: bool = True,
 ) -> None:
     """Validate a numeric aspect ratio against optional min/max ratio bounds."""
Some files were not shown because too many files have changed in this diff.