diff --git a/.github/workflows/release-stable-all.yml b/.github/workflows/release-stable-all.yml index d72ece2ce..8f07a7b1c 100644 --- a/.github/workflows/release-stable-all.yml +++ b/.github/workflows/release-stable-all.yml @@ -20,7 +20,7 @@ jobs: git_tag: ${{ inputs.git_tag }} cache_tag: "cu130" python_minor: "13" - python_patch: "9" + python_patch: "11" rel_name: "nvidia" rel_extra_name: "" test_release: true @@ -65,11 +65,11 @@ jobs: contents: "write" packages: "write" pull-requests: "read" - name: "Release AMD ROCm 7.1.1" + name: "Release AMD ROCm 7.2" uses: ./.github/workflows/stable-release.yml with: git_tag: ${{ inputs.git_tag }} - cache_tag: "rocm711" + cache_tag: "rocm72" python_minor: "12" python_patch: "10" rel_name: "amd" diff --git a/.github/workflows/windows_release_dependencies.yml b/.github/workflows/windows_release_dependencies.yml index f61ee21a2..93e01ac93 100644 --- a/.github/workflows/windows_release_dependencies.yml +++ b/.github/workflows/windows_release_dependencies.yml @@ -29,7 +29,7 @@ on: description: 'python patch version' required: true type: string - default: "9" + default: "11" # push: # branches: # - master diff --git a/README.md b/README.md index c56e05d07..96dc2904b 100644 --- a/README.md +++ b/README.md @@ -208,7 +208,7 @@ comfy install ## Manual Install (Windows, Linux) -Python 3.14 works but you may encounter issues with the torch compile node. The free threaded variant is still missing some dependencies. +Python 3.14 works but some custom nodes may have issues. The free threaded variant works but some dependencies will enable the GIL so it's not fully supported. Python 3.13 is very well supported. If you have trouble with some custom node dependencies on 3.13 you can try 3.12 diff --git a/app/assets/api/routes.py b/app/assets/api/routes.py index 30e87a898..7676e50b4 100644 --- a/app/assets/api/routes.py +++ b/app/assets/api/routes.py @@ -1,5 +1,8 @@ import logging import uuid +import urllib.parse +import os +import contextlib from aiohttp import web from pydantic import ValidationError @@ -8,6 +11,9 @@ import app.assets.manager as manager from app import user_manager from app.assets.api import schemas_in from app.assets.helpers import get_query_dict +from app.assets.scanner import seed_assets + +import folder_paths ROUTES = web.RouteTableDef() USER_MANAGER: user_manager.UserManager | None = None @@ -15,6 +21,9 @@ USER_MANAGER: user_manager.UserManager | None = None # UUID regex (canonical hyphenated form, case-insensitive) UUID_RE = r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}" +# Note to any custom node developers reading this code: +# The assets system is not yet fully implemented, do not rely on the code in /app/assets remaining the same. + def register_assets_system(app: web.Application, user_manager_instance: user_manager.UserManager) -> None: global USER_MANAGER USER_MANAGER = user_manager_instance @@ -28,6 +37,18 @@ def _validation_error_response(code: str, ve: ValidationError) -> web.Response: return _error_response(400, code, "Validation failed.", {"errors": ve.json()}) +@ROUTES.head("/api/assets/hash/{hash}") +async def head_asset_by_hash(request: web.Request) -> web.Response: + hash_str = request.match_info.get("hash", "").strip().lower() + if not hash_str or ":" not in hash_str: + return _error_response(400, "INVALID_HASH", "hash must be like 'blake3:'") + algo, digest = hash_str.split(":", 1) + if algo != "blake3" or not digest or any(c for c in digest if c not in "0123456789abcdef"): + return _error_response(400, "INVALID_HASH", "hash must be like 'blake3:'") + exists = manager.asset_exists(asset_hash=hash_str) + return web.Response(status=200 if exists else 404) + + @ROUTES.get("/api/assets") async def list_assets(request: web.Request) -> web.Response: """ @@ -50,7 +71,7 @@ async def list_assets(request: web.Request) -> web.Response: order=q.order, owner_id=USER_MANAGER.get_request_user_id(request), ) - return web.json_response(payload.model_dump(mode="json")) + return web.json_response(payload.model_dump(mode="json", exclude_none=True)) @ROUTES.get(f"/api/assets/{{id:{UUID_RE}}}") @@ -76,6 +97,314 @@ async def get_asset(request: web.Request) -> web.Response: return web.json_response(result.model_dump(mode="json"), status=200) +@ROUTES.get(f"/api/assets/{{id:{UUID_RE}}}/content") +async def download_asset_content(request: web.Request) -> web.Response: + # question: do we need disposition? could we just stick with one of these? + disposition = request.query.get("disposition", "attachment").lower().strip() + if disposition not in {"inline", "attachment"}: + disposition = "attachment" + + try: + abs_path, content_type, filename = manager.resolve_asset_content_for_download( + asset_info_id=str(uuid.UUID(request.match_info["id"])), + owner_id=USER_MANAGER.get_request_user_id(request), + ) + except ValueError as ve: + return _error_response(404, "ASSET_NOT_FOUND", str(ve)) + except NotImplementedError as nie: + return _error_response(501, "BACKEND_UNSUPPORTED", str(nie)) + except FileNotFoundError: + return _error_response(404, "FILE_NOT_FOUND", "Underlying file not found on disk.") + + quoted = (filename or "").replace("\r", "").replace("\n", "").replace('"', "'") + cd = f'{disposition}; filename="{quoted}"; filename*=UTF-8\'\'{urllib.parse.quote(filename)}' + + file_size = os.path.getsize(abs_path) + logging.info( + "download_asset_content: path=%s, size=%d bytes (%.2f MB), content_type=%s, filename=%s", + abs_path, + file_size, + file_size / (1024 * 1024), + content_type, + filename, + ) + + async def file_sender(): + chunk_size = 64 * 1024 + with open(abs_path, "rb") as f: + while True: + chunk = f.read(chunk_size) + if not chunk: + break + yield chunk + + return web.Response( + body=file_sender(), + content_type=content_type, + headers={ + "Content-Disposition": cd, + "Content-Length": str(file_size), + }, + ) + + +@ROUTES.post("/api/assets/from-hash") +async def create_asset_from_hash(request: web.Request) -> web.Response: + try: + payload = await request.json() + body = schemas_in.CreateFromHashBody.model_validate(payload) + except ValidationError as ve: + return _validation_error_response("INVALID_BODY", ve) + except Exception: + return _error_response(400, "INVALID_JSON", "Request body must be valid JSON.") + + result = manager.create_asset_from_hash( + hash_str=body.hash, + name=body.name, + tags=body.tags, + user_metadata=body.user_metadata, + owner_id=USER_MANAGER.get_request_user_id(request), + ) + if result is None: + return _error_response(404, "ASSET_NOT_FOUND", f"Asset content {body.hash} does not exist") + return web.json_response(result.model_dump(mode="json"), status=201) + + +@ROUTES.post("/api/assets") +async def upload_asset(request: web.Request) -> web.Response: + """Multipart/form-data endpoint for Asset uploads.""" + if not (request.content_type or "").lower().startswith("multipart/"): + return _error_response(415, "UNSUPPORTED_MEDIA_TYPE", "Use multipart/form-data for uploads.") + + reader = await request.multipart() + + file_present = False + file_client_name: str | None = None + tags_raw: list[str] = [] + provided_name: str | None = None + user_metadata_raw: str | None = None + provided_hash: str | None = None + provided_hash_exists: bool | None = None + + file_written = 0 + tmp_path: str | None = None + while True: + field = await reader.next() + if field is None: + break + + fname = getattr(field, "name", "") or "" + + if fname == "hash": + try: + s = ((await field.text()) or "").strip().lower() + except Exception: + return _error_response(400, "INVALID_HASH", "hash must be like 'blake3:'") + + if s: + if ":" not in s: + return _error_response(400, "INVALID_HASH", "hash must be like 'blake3:'") + algo, digest = s.split(":", 1) + if algo != "blake3" or not digest or any(c for c in digest if c not in "0123456789abcdef"): + return _error_response(400, "INVALID_HASH", "hash must be like 'blake3:'") + provided_hash = f"{algo}:{digest}" + try: + provided_hash_exists = manager.asset_exists(asset_hash=provided_hash) + except Exception: + provided_hash_exists = None # do not fail the whole request here + + elif fname == "file": + file_present = True + file_client_name = (field.filename or "").strip() + + if provided_hash and provided_hash_exists is True: + # If client supplied a hash that we know exists, drain but do not write to disk + try: + while True: + chunk = await field.read_chunk(8 * 1024 * 1024) + if not chunk: + break + file_written += len(chunk) + except Exception: + return _error_response(500, "UPLOAD_IO_ERROR", "Failed to receive uploaded file.") + continue # Do not create temp file; we will create AssetInfo from the existing content + + # Otherwise, store to temp for hashing/ingest + uploads_root = os.path.join(folder_paths.get_temp_directory(), "uploads") + unique_dir = os.path.join(uploads_root, uuid.uuid4().hex) + os.makedirs(unique_dir, exist_ok=True) + tmp_path = os.path.join(unique_dir, ".upload.part") + + try: + with open(tmp_path, "wb") as f: + while True: + chunk = await field.read_chunk(8 * 1024 * 1024) + if not chunk: + break + f.write(chunk) + file_written += len(chunk) + except Exception: + try: + if os.path.exists(tmp_path or ""): + os.remove(tmp_path) + finally: + return _error_response(500, "UPLOAD_IO_ERROR", "Failed to receive and store uploaded file.") + elif fname == "tags": + tags_raw.append((await field.text()) or "") + elif fname == "name": + provided_name = (await field.text()) or None + elif fname == "user_metadata": + user_metadata_raw = (await field.text()) or None + + # If client did not send file, and we are not doing a from-hash fast path -> error + if not file_present and not (provided_hash and provided_hash_exists): + return _error_response(400, "MISSING_FILE", "Form must include a 'file' part or a known 'hash'.") + + if file_present and file_written == 0 and not (provided_hash and provided_hash_exists): + # Empty upload is only acceptable if we are fast-pathing from existing hash + try: + if tmp_path and os.path.exists(tmp_path): + os.remove(tmp_path) + finally: + return _error_response(400, "EMPTY_UPLOAD", "Uploaded file is empty.") + + try: + spec = schemas_in.UploadAssetSpec.model_validate({ + "tags": tags_raw, + "name": provided_name, + "user_metadata": user_metadata_raw, + "hash": provided_hash, + }) + except ValidationError as ve: + try: + if tmp_path and os.path.exists(tmp_path): + os.remove(tmp_path) + finally: + return _validation_error_response("INVALID_BODY", ve) + + # Validate models category against configured folders (consistent with previous behavior) + if spec.tags and spec.tags[0] == "models": + if len(spec.tags) < 2 or spec.tags[1] not in folder_paths.folder_names_and_paths: + if tmp_path and os.path.exists(tmp_path): + os.remove(tmp_path) + return _error_response( + 400, "INVALID_BODY", f"unknown models category '{spec.tags[1] if len(spec.tags) >= 2 else ''}'" + ) + + owner_id = USER_MANAGER.get_request_user_id(request) + + # Fast path: if a valid provided hash exists, create AssetInfo without writing anything + if spec.hash and provided_hash_exists is True: + try: + result = manager.create_asset_from_hash( + hash_str=spec.hash, + name=spec.name or (spec.hash.split(":", 1)[1]), + tags=spec.tags, + user_metadata=spec.user_metadata or {}, + owner_id=owner_id, + ) + except Exception: + logging.exception("create_asset_from_hash failed for hash=%s, owner_id=%s", spec.hash, owner_id) + return _error_response(500, "INTERNAL", "Unexpected server error.") + + if result is None: + return _error_response(404, "ASSET_NOT_FOUND", f"Asset content {spec.hash} does not exist") + + # Drain temp if we accidentally saved (e.g., hash field came after file) + if tmp_path and os.path.exists(tmp_path): + with contextlib.suppress(Exception): + os.remove(tmp_path) + + status = 200 if (not result.created_new) else 201 + return web.json_response(result.model_dump(mode="json"), status=status) + + # Otherwise, we must have a temp file path to ingest + if not tmp_path or not os.path.exists(tmp_path): + # The only case we reach here without a temp file is: client sent a hash that does not exist and no file + return _error_response(404, "ASSET_NOT_FOUND", "Provided hash not found and no file uploaded.") + + try: + created = manager.upload_asset_from_temp_path( + spec, + temp_path=tmp_path, + client_filename=file_client_name, + owner_id=owner_id, + expected_asset_hash=spec.hash, + ) + status = 201 if created.created_new else 200 + return web.json_response(created.model_dump(mode="json"), status=status) + except ValueError as e: + if tmp_path and os.path.exists(tmp_path): + os.remove(tmp_path) + msg = str(e) + if "HASH_MISMATCH" in msg or msg.strip().upper() == "HASH_MISMATCH": + return _error_response( + 400, + "HASH_MISMATCH", + "Uploaded file hash does not match provided hash.", + ) + return _error_response(400, "BAD_REQUEST", "Invalid inputs.") + except Exception: + if tmp_path and os.path.exists(tmp_path): + os.remove(tmp_path) + logging.exception("upload_asset_from_temp_path failed for tmp_path=%s, owner_id=%s", tmp_path, owner_id) + return _error_response(500, "INTERNAL", "Unexpected server error.") + + +@ROUTES.put(f"/api/assets/{{id:{UUID_RE}}}") +async def update_asset(request: web.Request) -> web.Response: + asset_info_id = str(uuid.UUID(request.match_info["id"])) + try: + body = schemas_in.UpdateAssetBody.model_validate(await request.json()) + except ValidationError as ve: + return _validation_error_response("INVALID_BODY", ve) + except Exception: + return _error_response(400, "INVALID_JSON", "Request body must be valid JSON.") + + try: + result = manager.update_asset( + asset_info_id=asset_info_id, + name=body.name, + user_metadata=body.user_metadata, + owner_id=USER_MANAGER.get_request_user_id(request), + ) + except (ValueError, PermissionError) as ve: + return _error_response(404, "ASSET_NOT_FOUND", str(ve), {"id": asset_info_id}) + except Exception: + logging.exception( + "update_asset failed for asset_info_id=%s, owner_id=%s", + asset_info_id, + USER_MANAGER.get_request_user_id(request), + ) + return _error_response(500, "INTERNAL", "Unexpected server error.") + return web.json_response(result.model_dump(mode="json"), status=200) + + +@ROUTES.delete(f"/api/assets/{{id:{UUID_RE}}}") +async def delete_asset(request: web.Request) -> web.Response: + asset_info_id = str(uuid.UUID(request.match_info["id"])) + delete_content = request.query.get("delete_content") + delete_content = True if delete_content is None else delete_content.lower() not in {"0", "false", "no"} + + try: + deleted = manager.delete_asset_reference( + asset_info_id=asset_info_id, + owner_id=USER_MANAGER.get_request_user_id(request), + delete_content_if_orphan=delete_content, + ) + except Exception: + logging.exception( + "delete_asset_reference failed for asset_info_id=%s, owner_id=%s", + asset_info_id, + USER_MANAGER.get_request_user_id(request), + ) + return _error_response(500, "INTERNAL", "Unexpected server error.") + + if not deleted: + return _error_response(404, "ASSET_NOT_FOUND", f"AssetInfo {asset_info_id} not found.") + return web.Response(status=204) + + @ROUTES.get("/api/tags") async def get_tags(request: web.Request) -> web.Response: """ @@ -100,3 +429,86 @@ async def get_tags(request: web.Request) -> web.Response: owner_id=USER_MANAGER.get_request_user_id(request), ) return web.json_response(result.model_dump(mode="json")) + + +@ROUTES.post(f"/api/assets/{{id:{UUID_RE}}}/tags") +async def add_asset_tags(request: web.Request) -> web.Response: + asset_info_id = str(uuid.UUID(request.match_info["id"])) + try: + payload = await request.json() + data = schemas_in.TagsAdd.model_validate(payload) + except ValidationError as ve: + return _error_response(400, "INVALID_BODY", "Invalid JSON body for tags add.", {"errors": ve.errors()}) + except Exception: + return _error_response(400, "INVALID_JSON", "Request body must be valid JSON.") + + try: + result = manager.add_tags_to_asset( + asset_info_id=asset_info_id, + tags=data.tags, + origin="manual", + owner_id=USER_MANAGER.get_request_user_id(request), + ) + except (ValueError, PermissionError) as ve: + return _error_response(404, "ASSET_NOT_FOUND", str(ve), {"id": asset_info_id}) + except Exception: + logging.exception( + "add_tags_to_asset failed for asset_info_id=%s, owner_id=%s", + asset_info_id, + USER_MANAGER.get_request_user_id(request), + ) + return _error_response(500, "INTERNAL", "Unexpected server error.") + + return web.json_response(result.model_dump(mode="json"), status=200) + + +@ROUTES.delete(f"/api/assets/{{id:{UUID_RE}}}/tags") +async def delete_asset_tags(request: web.Request) -> web.Response: + asset_info_id = str(uuid.UUID(request.match_info["id"])) + try: + payload = await request.json() + data = schemas_in.TagsRemove.model_validate(payload) + except ValidationError as ve: + return _error_response(400, "INVALID_BODY", "Invalid JSON body for tags remove.", {"errors": ve.errors()}) + except Exception: + return _error_response(400, "INVALID_JSON", "Request body must be valid JSON.") + + try: + result = manager.remove_tags_from_asset( + asset_info_id=asset_info_id, + tags=data.tags, + owner_id=USER_MANAGER.get_request_user_id(request), + ) + except ValueError as ve: + return _error_response(404, "ASSET_NOT_FOUND", str(ve), {"id": asset_info_id}) + except Exception: + logging.exception( + "remove_tags_from_asset failed for asset_info_id=%s, owner_id=%s", + asset_info_id, + USER_MANAGER.get_request_user_id(request), + ) + return _error_response(500, "INTERNAL", "Unexpected server error.") + + return web.json_response(result.model_dump(mode="json"), status=200) + + +@ROUTES.post("/api/assets/seed") +async def seed_assets_endpoint(request: web.Request) -> web.Response: + """Trigger asset seeding for specified roots (models, input, output).""" + try: + payload = await request.json() + roots = payload.get("roots", ["models", "input", "output"]) + except Exception: + roots = ["models", "input", "output"] + + valid_roots = [r for r in roots if r in ("models", "input", "output")] + if not valid_roots: + return _error_response(400, "INVALID_BODY", "No valid roots specified") + + try: + seed_assets(tuple(valid_roots)) + except Exception: + logging.exception("seed_assets failed for roots=%s", valid_roots) + return _error_response(500, "INTERNAL", "Seed operation failed") + + return web.json_response({"seeded": valid_roots}, status=200) diff --git a/app/assets/api/schemas_in.py b/app/assets/api/schemas_in.py index 200b41aef..6707ffb0c 100644 --- a/app/assets/api/schemas_in.py +++ b/app/assets/api/schemas_in.py @@ -1,5 +1,4 @@ import json -import uuid from typing import Any, Literal from pydantic import ( @@ -8,9 +7,9 @@ from pydantic import ( Field, conint, field_validator, + model_validator, ) - class ListAssetsQuery(BaseModel): include_tags: list[str] = Field(default_factory=list) exclude_tags: list[str] = Field(default_factory=list) @@ -57,6 +56,57 @@ class ListAssetsQuery(BaseModel): return None +class UpdateAssetBody(BaseModel): + name: str | None = None + user_metadata: dict[str, Any] | None = None + + @model_validator(mode="after") + def _at_least_one(self): + if self.name is None and self.user_metadata is None: + raise ValueError("Provide at least one of: name, user_metadata.") + return self + + +class CreateFromHashBody(BaseModel): + model_config = ConfigDict(extra="ignore", str_strip_whitespace=True) + + hash: str + name: str + tags: list[str] = Field(default_factory=list) + user_metadata: dict[str, Any] = Field(default_factory=dict) + + @field_validator("hash") + @classmethod + def _require_blake3(cls, v): + s = (v or "").strip().lower() + if ":" not in s: + raise ValueError("hash must be 'blake3:'") + algo, digest = s.split(":", 1) + if algo != "blake3": + raise ValueError("only canonical 'blake3:' is accepted here") + if not digest or any(c for c in digest if c not in "0123456789abcdef"): + raise ValueError("hash digest must be lowercase hex") + return s + + @field_validator("tags", mode="before") + @classmethod + def _tags_norm(cls, v): + if v is None: + return [] + if isinstance(v, list): + out = [str(t).strip().lower() for t in v if str(t).strip()] + seen = set() + dedup = [] + for t in out: + if t not in seen: + seen.add(t) + dedup.append(t) + return dedup + if isinstance(v, str): + return [t.strip().lower() for t in v.split(",") if t.strip()] + return [] + + class TagsListQuery(BaseModel): model_config = ConfigDict(extra="ignore", str_strip_whitespace=True) @@ -75,20 +125,140 @@ class TagsListQuery(BaseModel): return v.lower() or None -class SetPreviewBody(BaseModel): - """Set or clear the preview for an AssetInfo. Provide an Asset.id or null.""" - preview_id: str | None = None +class TagsAdd(BaseModel): + model_config = ConfigDict(extra="ignore") + tags: list[str] = Field(..., min_length=1) - @field_validator("preview_id", mode="before") + @field_validator("tags") @classmethod - def _norm_uuid(cls, v): + def normalize_tags(cls, v: list[str]) -> list[str]: + out = [] + for t in v: + if not isinstance(t, str): + raise TypeError("tags must be strings") + tnorm = t.strip().lower() + if tnorm: + out.append(tnorm) + seen = set() + deduplicated = [] + for x in out: + if x not in seen: + seen.add(x) + deduplicated.append(x) + return deduplicated + + +class TagsRemove(TagsAdd): + pass + + +class UploadAssetSpec(BaseModel): + """Upload Asset operation. + - tags: ordered; first is root ('models'|'input'|'output'); + if root == 'models', second must be a valid category from folder_paths.folder_names_and_paths + - name: display name + - user_metadata: arbitrary JSON object (optional) + - hash: optional canonical 'blake3:' provided by the client for validation / fast-path + + Files created via this endpoint are stored on disk using the **content hash** as the filename stem + and the original extension is preserved when available. + """ + model_config = ConfigDict(extra="ignore", str_strip_whitespace=True) + + tags: list[str] = Field(..., min_length=1) + name: str | None = Field(default=None, max_length=512, description="Display Name") + user_metadata: dict[str, Any] = Field(default_factory=dict) + hash: str | None = Field(default=None) + + @field_validator("hash", mode="before") + @classmethod + def _parse_hash(cls, v): if v is None: return None - s = str(v).strip() + s = str(v).strip().lower() if not s: return None - try: - uuid.UUID(s) - except Exception: - raise ValueError("preview_id must be a UUID") - return s + if ":" not in s: + raise ValueError("hash must be 'blake3:'") + algo, digest = s.split(":", 1) + if algo != "blake3": + raise ValueError("only canonical 'blake3:' is accepted here") + if not digest or any(c for c in digest if c not in "0123456789abcdef"): + raise ValueError("hash digest must be lowercase hex") + return f"{algo}:{digest}" + + @field_validator("tags", mode="before") + @classmethod + def _parse_tags(cls, v): + """ + Accepts a list of strings (possibly multiple form fields), + where each string can be: + - JSON array (e.g., '["models","loras","foo"]') + - comma-separated ('models, loras, foo') + - single token ('models') + Returns a normalized, deduplicated, ordered list. + """ + items: list[str] = [] + if v is None: + return [] + if isinstance(v, str): + v = [v] + + if isinstance(v, list): + for item in v: + if item is None: + continue + s = str(item).strip() + if not s: + continue + if s.startswith("["): + try: + arr = json.loads(s) + if isinstance(arr, list): + items.extend(str(x) for x in arr) + continue + except Exception: + pass # fallback to CSV parse below + items.extend([p for p in s.split(",") if p.strip()]) + else: + return [] + + # normalize + dedupe + norm = [] + seen = set() + for t in items: + tnorm = str(t).strip().lower() + if tnorm and tnorm not in seen: + seen.add(tnorm) + norm.append(tnorm) + return norm + + @field_validator("user_metadata", mode="before") + @classmethod + def _parse_metadata_json(cls, v): + if v is None or isinstance(v, dict): + return v or {} + if isinstance(v, str): + s = v.strip() + if not s: + return {} + try: + parsed = json.loads(s) + except Exception as e: + raise ValueError(f"user_metadata must be JSON: {e}") from e + if not isinstance(parsed, dict): + raise ValueError("user_metadata must be a JSON object") + return parsed + return {} + + @model_validator(mode="after") + def _validate_order(self): + if not self.tags: + raise ValueError("tags must be provided and non-empty") + root = self.tags[0] + if root not in {"models", "input", "output"}: + raise ValueError("first tag must be one of: models, input, output") + if root == "models": + if len(self.tags) < 2: + raise ValueError("models uploads require a category tag as the second tag") + return self diff --git a/app/assets/api/schemas_out.py b/app/assets/api/schemas_out.py index 9f8184f20..b6fb3da0c 100644 --- a/app/assets/api/schemas_out.py +++ b/app/assets/api/schemas_out.py @@ -29,6 +29,21 @@ class AssetsList(BaseModel): has_more: bool +class AssetUpdated(BaseModel): + id: str + name: str + asset_hash: str | None = None + tags: list[str] = Field(default_factory=list) + user_metadata: dict[str, Any] = Field(default_factory=dict) + updated_at: datetime | None = None + + model_config = ConfigDict(from_attributes=True) + + @field_serializer("updated_at") + def _ser_updated(self, v: datetime | None, _info): + return v.isoformat() if v else None + + class AssetDetail(BaseModel): id: str name: str @@ -48,6 +63,10 @@ class AssetDetail(BaseModel): return v.isoformat() if v else None +class AssetCreated(AssetDetail): + created_new: bool + + class TagUsage(BaseModel): name: str count: int @@ -58,3 +77,17 @@ class TagsList(BaseModel): tags: list[TagUsage] = Field(default_factory=list) total: int has_more: bool + + +class TagsAdd(BaseModel): + model_config = ConfigDict(str_strip_whitespace=True) + added: list[str] = Field(default_factory=list) + already_present: list[str] = Field(default_factory=list) + total_tags: list[str] = Field(default_factory=list) + + +class TagsRemove(BaseModel): + model_config = ConfigDict(str_strip_whitespace=True) + removed: list[str] = Field(default_factory=list) + not_present: list[str] = Field(default_factory=list) + total_tags: list[str] = Field(default_factory=list) diff --git a/app/assets/database/queries.py b/app/assets/database/queries.py index 0824c0c2f..d6b33ec7b 100644 --- a/app/assets/database/queries.py +++ b/app/assets/database/queries.py @@ -1,9 +1,17 @@ +import os +import logging import sqlalchemy as sa from collections import defaultdict -from sqlalchemy import select, exists, func +from datetime import datetime +from typing import Iterable, Any +from sqlalchemy import select, delete, exists, func +from sqlalchemy.dialects import sqlite +from sqlalchemy.exc import IntegrityError from sqlalchemy.orm import Session, contains_eager, noload -from app.assets.database.models import Asset, AssetInfo, AssetInfoMeta, AssetInfoTag, Tag -from app.assets.helpers import escape_like_prefix, normalize_tags +from app.assets.database.models import Asset, AssetInfo, AssetCacheState, AssetInfoMeta, AssetInfoTag, Tag +from app.assets.helpers import ( + compute_relative_filename, escape_like_prefix, normalize_tags, project_kv, utcnow +) from typing import Sequence @@ -15,6 +23,22 @@ def visible_owner_clause(owner_id: str) -> sa.sql.ClauseElement: return AssetInfo.owner_id.in_(["", owner_id]) +def pick_best_live_path(states: Sequence[AssetCacheState]) -> str: + """ + Return the best on-disk path among cache states: + 1) Prefer a path that exists with needs_verify == False (already verified). + 2) Otherwise, pick the first path that exists. + 3) Otherwise return empty string. + """ + alive = [s for s in states if getattr(s, "file_path", None) and os.path.isfile(s.file_path)] + if not alive: + return "" + for s in alive: + if not getattr(s, "needs_verify", False): + return s.file_path + return alive[0].file_path + + def apply_tag_filters( stmt: sa.sql.Select, include_tags: Sequence[str] | None = None, @@ -42,6 +66,7 @@ def apply_tag_filters( ) return stmt + def apply_metadata_filter( stmt: sa.sql.Select, metadata_filter: dict | None = None, @@ -94,7 +119,11 @@ def apply_metadata_filter( return stmt -def asset_exists_by_hash(session: Session, asset_hash: str) -> bool: +def asset_exists_by_hash( + session: Session, + *, + asset_hash: str, +) -> bool: """ Check if an asset with a given hash exists in database. """ @@ -105,9 +134,39 @@ def asset_exists_by_hash(session: Session, asset_hash: str) -> bool: ).first() return row is not None -def get_asset_info_by_id(session: Session, asset_info_id: str) -> AssetInfo | None: + +def asset_info_exists_for_asset_id( + session: Session, + *, + asset_id: str, +) -> bool: + q = ( + select(sa.literal(True)) + .select_from(AssetInfo) + .where(AssetInfo.asset_id == asset_id) + .limit(1) + ) + return (session.execute(q)).first() is not None + + +def get_asset_by_hash( + session: Session, + *, + asset_hash: str, +) -> Asset | None: + return ( + session.execute(select(Asset).where(Asset.hash == asset_hash).limit(1)) + ).scalars().first() + + +def get_asset_info_by_id( + session: Session, + *, + asset_info_id: str, +) -> AssetInfo | None: return session.get(AssetInfo, asset_info_id) + def list_asset_infos_page( session: Session, owner_id: str = "", @@ -171,12 +230,14 @@ def list_asset_infos_page( select(AssetInfoTag.asset_info_id, Tag.name) .join(Tag, Tag.name == AssetInfoTag.tag_name) .where(AssetInfoTag.asset_info_id.in_(id_list)) + .order_by(AssetInfoTag.added_at) ) for aid, tag_name in rows.all(): tag_map[aid].append(tag_name) return infos, tag_map, total + def fetch_asset_info_asset_and_tags( session: Session, asset_info_id: str, @@ -208,6 +269,494 @@ def fetch_asset_info_asset_and_tags( tags.append(tag_name) return first_info, first_asset, tags + +def fetch_asset_info_and_asset( + session: Session, + *, + asset_info_id: str, + owner_id: str = "", +) -> tuple[AssetInfo, Asset] | None: + stmt = ( + select(AssetInfo, Asset) + .join(Asset, Asset.id == AssetInfo.asset_id) + .where( + AssetInfo.id == asset_info_id, + visible_owner_clause(owner_id), + ) + .limit(1) + .options(noload(AssetInfo.tags)) + ) + row = session.execute(stmt) + pair = row.first() + if not pair: + return None + return pair[0], pair[1] + +def list_cache_states_by_asset_id( + session: Session, *, asset_id: str +) -> Sequence[AssetCacheState]: + return ( + session.execute( + select(AssetCacheState) + .where(AssetCacheState.asset_id == asset_id) + .order_by(AssetCacheState.id.asc()) + ) + ).scalars().all() + + +def touch_asset_info_by_id( + session: Session, + *, + asset_info_id: str, + ts: datetime | None = None, + only_if_newer: bool = True, +) -> None: + ts = ts or utcnow() + stmt = sa.update(AssetInfo).where(AssetInfo.id == asset_info_id) + if only_if_newer: + stmt = stmt.where( + sa.or_(AssetInfo.last_access_time.is_(None), AssetInfo.last_access_time < ts) + ) + session.execute(stmt.values(last_access_time=ts)) + + +def create_asset_info_for_existing_asset( + session: Session, + *, + asset_hash: str, + name: str, + user_metadata: dict | None = None, + tags: Sequence[str] | None = None, + tag_origin: str = "manual", + owner_id: str = "", +) -> AssetInfo: + """Create or return an existing AssetInfo for an Asset identified by asset_hash.""" + now = utcnow() + asset = get_asset_by_hash(session, asset_hash=asset_hash) + if not asset: + raise ValueError(f"Unknown asset hash {asset_hash}") + + info = AssetInfo( + owner_id=owner_id, + name=name, + asset_id=asset.id, + preview_id=None, + created_at=now, + updated_at=now, + last_access_time=now, + ) + try: + with session.begin_nested(): + session.add(info) + session.flush() + except IntegrityError: + existing = ( + session.execute( + select(AssetInfo) + .options(noload(AssetInfo.tags)) + .where( + AssetInfo.asset_id == asset.id, + AssetInfo.name == name, + AssetInfo.owner_id == owner_id, + ) + .limit(1) + ) + ).unique().scalars().first() + if not existing: + raise RuntimeError("AssetInfo upsert failed to find existing row after conflict.") + return existing + + # metadata["filename"] hack + new_meta = dict(user_metadata or {}) + computed_filename = None + try: + p = pick_best_live_path(list_cache_states_by_asset_id(session, asset_id=asset.id)) + if p: + computed_filename = compute_relative_filename(p) + except Exception: + computed_filename = None + if computed_filename: + new_meta["filename"] = computed_filename + if new_meta: + replace_asset_info_metadata_projection( + session, + asset_info_id=info.id, + user_metadata=new_meta, + ) + + if tags is not None: + set_asset_info_tags( + session, + asset_info_id=info.id, + tags=tags, + origin=tag_origin, + ) + return info + + +def set_asset_info_tags( + session: Session, + *, + asset_info_id: str, + tags: Sequence[str], + origin: str = "manual", +) -> dict: + desired = normalize_tags(tags) + + current = set( + tag_name for (tag_name,) in ( + session.execute(select(AssetInfoTag.tag_name).where(AssetInfoTag.asset_info_id == asset_info_id)) + ).all() + ) + + to_add = [t for t in desired if t not in current] + to_remove = [t for t in current if t not in desired] + + if to_add: + ensure_tags_exist(session, to_add, tag_type="user") + session.add_all([ + AssetInfoTag(asset_info_id=asset_info_id, tag_name=t, origin=origin, added_at=utcnow()) + for t in to_add + ]) + session.flush() + + if to_remove: + session.execute( + delete(AssetInfoTag) + .where(AssetInfoTag.asset_info_id == asset_info_id, AssetInfoTag.tag_name.in_(to_remove)) + ) + session.flush() + + return {"added": to_add, "removed": to_remove, "total": desired} + + +def replace_asset_info_metadata_projection( + session: Session, + *, + asset_info_id: str, + user_metadata: dict | None = None, +) -> None: + info = session.get(AssetInfo, asset_info_id) + if not info: + raise ValueError(f"AssetInfo {asset_info_id} not found") + + info.user_metadata = user_metadata or {} + info.updated_at = utcnow() + session.flush() + + session.execute(delete(AssetInfoMeta).where(AssetInfoMeta.asset_info_id == asset_info_id)) + session.flush() + + if not user_metadata: + return + + rows: list[AssetInfoMeta] = [] + for k, v in user_metadata.items(): + for r in project_kv(k, v): + rows.append( + AssetInfoMeta( + asset_info_id=asset_info_id, + key=r["key"], + ordinal=int(r["ordinal"]), + val_str=r.get("val_str"), + val_num=r.get("val_num"), + val_bool=r.get("val_bool"), + val_json=r.get("val_json"), + ) + ) + if rows: + session.add_all(rows) + session.flush() + + +def ingest_fs_asset( + session: Session, + *, + asset_hash: str, + abs_path: str, + size_bytes: int, + mtime_ns: int, + mime_type: str | None = None, + info_name: str | None = None, + owner_id: str = "", + preview_id: str | None = None, + user_metadata: dict | None = None, + tags: Sequence[str] = (), + tag_origin: str = "manual", + require_existing_tags: bool = False, +) -> dict: + """ + Idempotently upsert: + - Asset by content hash (create if missing) + - AssetCacheState(file_path) pointing to asset_id + - Optionally AssetInfo + tag links and metadata projection + Returns flags and ids. + """ + locator = os.path.abspath(abs_path) + now = utcnow() + + if preview_id: + if not session.get(Asset, preview_id): + preview_id = None + + out: dict[str, Any] = { + "asset_created": False, + "asset_updated": False, + "state_created": False, + "state_updated": False, + "asset_info_id": None, + } + + # 1) Asset by hash + asset = ( + session.execute(select(Asset).where(Asset.hash == asset_hash).limit(1)) + ).scalars().first() + if not asset: + vals = { + "hash": asset_hash, + "size_bytes": int(size_bytes), + "mime_type": mime_type, + "created_at": now, + } + res = session.execute( + sqlite.insert(Asset) + .values(**vals) + .on_conflict_do_nothing(index_elements=[Asset.hash]) + ) + if int(res.rowcount or 0) > 0: + out["asset_created"] = True + asset = ( + session.execute( + select(Asset).where(Asset.hash == asset_hash).limit(1) + ) + ).scalars().first() + if not asset: + raise RuntimeError("Asset row not found after upsert.") + else: + changed = False + if asset.size_bytes != int(size_bytes) and int(size_bytes) > 0: + asset.size_bytes = int(size_bytes) + changed = True + if mime_type and asset.mime_type != mime_type: + asset.mime_type = mime_type + changed = True + if changed: + out["asset_updated"] = True + + # 2) AssetCacheState upsert by file_path (unique) + vals = { + "asset_id": asset.id, + "file_path": locator, + "mtime_ns": int(mtime_ns), + } + ins = ( + sqlite.insert(AssetCacheState) + .values(**vals) + .on_conflict_do_nothing(index_elements=[AssetCacheState.file_path]) + ) + + res = session.execute(ins) + if int(res.rowcount or 0) > 0: + out["state_created"] = True + else: + upd = ( + sa.update(AssetCacheState) + .where(AssetCacheState.file_path == locator) + .where( + sa.or_( + AssetCacheState.asset_id != asset.id, + AssetCacheState.mtime_ns.is_(None), + AssetCacheState.mtime_ns != int(mtime_ns), + ) + ) + .values(asset_id=asset.id, mtime_ns=int(mtime_ns)) + ) + res2 = session.execute(upd) + if int(res2.rowcount or 0) > 0: + out["state_updated"] = True + + # 3) Optional AssetInfo + tags + metadata + if info_name: + try: + with session.begin_nested(): + info = AssetInfo( + owner_id=owner_id, + name=info_name, + asset_id=asset.id, + preview_id=preview_id, + created_at=now, + updated_at=now, + last_access_time=now, + ) + session.add(info) + session.flush() + out["asset_info_id"] = info.id + except IntegrityError: + pass + + existing_info = ( + session.execute( + select(AssetInfo) + .where( + AssetInfo.asset_id == asset.id, + AssetInfo.name == info_name, + (AssetInfo.owner_id == owner_id), + ) + .limit(1) + ) + ).unique().scalar_one_or_none() + if not existing_info: + raise RuntimeError("Failed to update or insert AssetInfo.") + + if preview_id and existing_info.preview_id != preview_id: + existing_info.preview_id = preview_id + + existing_info.updated_at = now + if existing_info.last_access_time < now: + existing_info.last_access_time = now + session.flush() + out["asset_info_id"] = existing_info.id + + norm = [t.strip().lower() for t in (tags or []) if (t or "").strip()] + if norm and out["asset_info_id"] is not None: + if not require_existing_tags: + ensure_tags_exist(session, norm, tag_type="user") + + existing_tag_names = set( + name for (name,) in (session.execute(select(Tag.name).where(Tag.name.in_(norm)))).all() + ) + missing = [t for t in norm if t not in existing_tag_names] + if missing and require_existing_tags: + raise ValueError(f"Unknown tags: {missing}") + + existing_links = set( + tag_name + for (tag_name,) in ( + session.execute( + select(AssetInfoTag.tag_name).where(AssetInfoTag.asset_info_id == out["asset_info_id"]) + ) + ).all() + ) + to_add = [t for t in norm if t in existing_tag_names and t not in existing_links] + if to_add: + session.add_all( + [ + AssetInfoTag( + asset_info_id=out["asset_info_id"], + tag_name=t, + origin=tag_origin, + added_at=now, + ) + for t in to_add + ] + ) + session.flush() + + # metadata["filename"] hack + if out["asset_info_id"] is not None: + primary_path = pick_best_live_path(list_cache_states_by_asset_id(session, asset_id=asset.id)) + computed_filename = compute_relative_filename(primary_path) if primary_path else None + + current_meta = existing_info.user_metadata or {} + new_meta = dict(current_meta) + if user_metadata is not None: + for k, v in user_metadata.items(): + new_meta[k] = v + if computed_filename: + new_meta["filename"] = computed_filename + + if new_meta != current_meta: + replace_asset_info_metadata_projection( + session, + asset_info_id=out["asset_info_id"], + user_metadata=new_meta, + ) + + try: + remove_missing_tag_for_asset_id(session, asset_id=asset.id) + except Exception: + logging.exception("Failed to clear 'missing' tag for asset %s", asset.id) + return out + + +def update_asset_info_full( + session: Session, + *, + asset_info_id: str, + name: str | None = None, + tags: Sequence[str] | None = None, + user_metadata: dict | None = None, + tag_origin: str = "manual", + asset_info_row: Any = None, +) -> AssetInfo: + if not asset_info_row: + info = session.get(AssetInfo, asset_info_id) + if not info: + raise ValueError(f"AssetInfo {asset_info_id} not found") + else: + info = asset_info_row + + touched = False + if name is not None and name != info.name: + info.name = name + touched = True + + computed_filename = None + try: + p = pick_best_live_path(list_cache_states_by_asset_id(session, asset_id=info.asset_id)) + if p: + computed_filename = compute_relative_filename(p) + except Exception: + computed_filename = None + + if user_metadata is not None: + new_meta = dict(user_metadata) + if computed_filename: + new_meta["filename"] = computed_filename + replace_asset_info_metadata_projection( + session, asset_info_id=asset_info_id, user_metadata=new_meta + ) + touched = True + else: + if computed_filename: + current_meta = info.user_metadata or {} + if current_meta.get("filename") != computed_filename: + new_meta = dict(current_meta) + new_meta["filename"] = computed_filename + replace_asset_info_metadata_projection( + session, asset_info_id=asset_info_id, user_metadata=new_meta + ) + touched = True + + if tags is not None: + set_asset_info_tags( + session, + asset_info_id=asset_info_id, + tags=tags, + origin=tag_origin, + ) + touched = True + + if touched and user_metadata is None: + info.updated_at = utcnow() + session.flush() + + return info + + +def delete_asset_info_by_id( + session: Session, + *, + asset_info_id: str, + owner_id: str, +) -> bool: + stmt = sa.delete(AssetInfo).where( + AssetInfo.id == asset_info_id, + visible_owner_clause(owner_id), + ) + return int((session.execute(stmt)).rowcount or 0) > 0 + + def list_tags_with_usage( session: Session, prefix: str | None = None, @@ -265,3 +814,163 @@ def list_tags_with_usage( rows_norm = [(name, ttype, int(count or 0)) for (name, ttype, count) in rows] return rows_norm, int(total or 0) + + +def ensure_tags_exist(session: Session, names: Iterable[str], tag_type: str = "user") -> None: + wanted = normalize_tags(list(names)) + if not wanted: + return + rows = [{"name": n, "tag_type": tag_type} for n in list(dict.fromkeys(wanted))] + ins = ( + sqlite.insert(Tag) + .values(rows) + .on_conflict_do_nothing(index_elements=[Tag.name]) + ) + session.execute(ins) + + +def get_asset_tags(session: Session, *, asset_info_id: str) -> list[str]: + return [ + tag_name for (tag_name,) in ( + session.execute( + select(AssetInfoTag.tag_name).where(AssetInfoTag.asset_info_id == asset_info_id) + ) + ).all() + ] + + +def add_tags_to_asset_info( + session: Session, + *, + asset_info_id: str, + tags: Sequence[str], + origin: str = "manual", + create_if_missing: bool = True, + asset_info_row: Any = None, +) -> dict: + if not asset_info_row: + info = session.get(AssetInfo, asset_info_id) + if not info: + raise ValueError(f"AssetInfo {asset_info_id} not found") + + norm = normalize_tags(tags) + if not norm: + total = get_asset_tags(session, asset_info_id=asset_info_id) + return {"added": [], "already_present": [], "total_tags": total} + + if create_if_missing: + ensure_tags_exist(session, norm, tag_type="user") + + current = { + tag_name + for (tag_name,) in ( + session.execute( + sa.select(AssetInfoTag.tag_name).where(AssetInfoTag.asset_info_id == asset_info_id) + ) + ).all() + } + + want = set(norm) + to_add = sorted(want - current) + + if to_add: + with session.begin_nested() as nested: + try: + session.add_all( + [ + AssetInfoTag( + asset_info_id=asset_info_id, + tag_name=t, + origin=origin, + added_at=utcnow(), + ) + for t in to_add + ] + ) + session.flush() + except IntegrityError: + nested.rollback() + + after = set(get_asset_tags(session, asset_info_id=asset_info_id)) + return { + "added": sorted(((after - current) & want)), + "already_present": sorted(want & current), + "total_tags": sorted(after), + } + + +def remove_tags_from_asset_info( + session: Session, + *, + asset_info_id: str, + tags: Sequence[str], +) -> dict: + info = session.get(AssetInfo, asset_info_id) + if not info: + raise ValueError(f"AssetInfo {asset_info_id} not found") + + norm = normalize_tags(tags) + if not norm: + total = get_asset_tags(session, asset_info_id=asset_info_id) + return {"removed": [], "not_present": [], "total_tags": total} + + existing = { + tag_name + for (tag_name,) in ( + session.execute( + sa.select(AssetInfoTag.tag_name).where(AssetInfoTag.asset_info_id == asset_info_id) + ) + ).all() + } + + to_remove = sorted(set(t for t in norm if t in existing)) + not_present = sorted(set(t for t in norm if t not in existing)) + + if to_remove: + session.execute( + delete(AssetInfoTag) + .where( + AssetInfoTag.asset_info_id == asset_info_id, + AssetInfoTag.tag_name.in_(to_remove), + ) + ) + session.flush() + + total = get_asset_tags(session, asset_info_id=asset_info_id) + return {"removed": to_remove, "not_present": not_present, "total_tags": total} + + +def remove_missing_tag_for_asset_id( + session: Session, + *, + asset_id: str, +) -> None: + session.execute( + sa.delete(AssetInfoTag).where( + AssetInfoTag.asset_info_id.in_(sa.select(AssetInfo.id).where(AssetInfo.asset_id == asset_id)), + AssetInfoTag.tag_name == "missing", + ) + ) + + +def set_asset_info_preview( + session: Session, + *, + asset_info_id: str, + preview_asset_id: str | None = None, +) -> None: + """Set or clear preview_id and bump updated_at. Raises on unknown IDs.""" + info = session.get(AssetInfo, asset_info_id) + if not info: + raise ValueError(f"AssetInfo {asset_info_id} not found") + + if preview_asset_id is None: + info.preview_id = None + else: + # validate preview asset exists + if not session.get(Asset, preview_asset_id): + raise ValueError(f"Preview Asset {preview_asset_id} not found") + info.preview_id = preview_asset_id + + info.updated_at = utcnow() + session.flush() diff --git a/app/assets/helpers.py b/app/assets/helpers.py index 08b465b5a..5030b123a 100644 --- a/app/assets/helpers.py +++ b/app/assets/helpers.py @@ -1,5 +1,6 @@ import contextlib import os +from decimal import Decimal from aiohttp import web from datetime import datetime, timezone from pathlib import Path @@ -87,6 +88,40 @@ def get_comfy_models_folders() -> list[tuple[str, list[str]]]: targets.append((name, paths)) return targets +def resolve_destination_from_tags(tags: list[str]) -> tuple[str, list[str]]: + """Validates and maps tags -> (base_dir, subdirs_for_fs)""" + root = tags[0] + if root == "models": + if len(tags) < 2: + raise ValueError("at least two tags required for model asset") + try: + bases = folder_paths.folder_names_and_paths[tags[1]][0] + except KeyError: + raise ValueError(f"unknown model category '{tags[1]}'") + if not bases: + raise ValueError(f"no base path configured for category '{tags[1]}'") + base_dir = os.path.abspath(bases[0]) + raw_subdirs = tags[2:] + else: + base_dir = os.path.abspath( + folder_paths.get_input_directory() if root == "input" else folder_paths.get_output_directory() + ) + raw_subdirs = tags[1:] + for i in raw_subdirs: + if i in (".", ".."): + raise ValueError("invalid path component in tags") + + return base_dir, raw_subdirs if raw_subdirs else [] + +def ensure_within_base(candidate: str, base: str) -> None: + cand_abs = os.path.abspath(candidate) + base_abs = os.path.abspath(base) + try: + if os.path.commonpath([cand_abs, base_abs]) != base_abs: + raise ValueError("destination escapes base directory") + except Exception: + raise ValueError("invalid destination path") + def compute_relative_filename(file_path: str) -> str | None: """ Return the model's path relative to the last well-known folder (the model category), @@ -113,7 +148,6 @@ def compute_relative_filename(file_path: str) -> str | None: return "/".join(inside) return "/".join(parts) # input/output: keep all parts - def get_relative_to_root_category_path_of_asset(file_path: str) -> tuple[Literal["input", "output", "models"], str]: """Given an absolute or relative file path, determine which root category the path belongs to: - 'input' if the file resides under `folder_paths.get_input_directory()` @@ -215,3 +249,64 @@ def collect_models_files() -> list[str]: if allowed: out.append(abs_path) return out + +def is_scalar(v): + if v is None: + return True + if isinstance(v, bool): + return True + if isinstance(v, (int, float, Decimal, str)): + return True + return False + +def project_kv(key: str, value): + """ + Turn a metadata key/value into typed projection rows. + Returns list[dict] with keys: + key, ordinal, and one of val_str / val_num / val_bool / val_json (others None) + """ + rows: list[dict] = [] + + def _null_row(ordinal: int) -> dict: + return { + "key": key, "ordinal": ordinal, + "val_str": None, "val_num": None, "val_bool": None, "val_json": None + } + + if value is None: + rows.append(_null_row(0)) + return rows + + if is_scalar(value): + if isinstance(value, bool): + rows.append({"key": key, "ordinal": 0, "val_bool": bool(value)}) + elif isinstance(value, (int, float, Decimal)): + num = value if isinstance(value, Decimal) else Decimal(str(value)) + rows.append({"key": key, "ordinal": 0, "val_num": num}) + elif isinstance(value, str): + rows.append({"key": key, "ordinal": 0, "val_str": value}) + else: + rows.append({"key": key, "ordinal": 0, "val_json": value}) + return rows + + if isinstance(value, list): + if all(is_scalar(x) for x in value): + for i, x in enumerate(value): + if x is None: + rows.append(_null_row(i)) + elif isinstance(x, bool): + rows.append({"key": key, "ordinal": i, "val_bool": bool(x)}) + elif isinstance(x, (int, float, Decimal)): + num = x if isinstance(x, Decimal) else Decimal(str(x)) + rows.append({"key": key, "ordinal": i, "val_num": num}) + elif isinstance(x, str): + rows.append({"key": key, "ordinal": i, "val_str": x}) + else: + rows.append({"key": key, "ordinal": i, "val_json": x}) + return rows + for i, x in enumerate(value): + rows.append({"key": key, "ordinal": i, "val_json": x}) + return rows + + rows.append({"key": key, "ordinal": 0, "val_json": value}) + return rows diff --git a/app/assets/manager.py b/app/assets/manager.py index 6425e7aa2..a68c8c8ae 100644 --- a/app/assets/manager.py +++ b/app/assets/manager.py @@ -1,13 +1,33 @@ +import os +import mimetypes +import contextlib from typing import Sequence from app.database.db import create_session -from app.assets.api import schemas_out +from app.assets.api import schemas_out, schemas_in from app.assets.database.queries import ( asset_exists_by_hash, + asset_info_exists_for_asset_id, + get_asset_by_hash, + get_asset_info_by_id, fetch_asset_info_asset_and_tags, + fetch_asset_info_and_asset, + create_asset_info_for_existing_asset, + touch_asset_info_by_id, + update_asset_info_full, + delete_asset_info_by_id, + list_cache_states_by_asset_id, list_asset_infos_page, list_tags_with_usage, + get_asset_tags, + add_tags_to_asset_info, + remove_tags_from_asset_info, + pick_best_live_path, + ingest_fs_asset, + set_asset_info_preview, ) +from app.assets.helpers import resolve_destination_from_tags, ensure_within_base +from app.assets.database.models import Asset def _safe_sort_field(requested: str | None) -> str: @@ -19,11 +39,28 @@ def _safe_sort_field(requested: str | None) -> str: return "created_at" -def asset_exists(asset_hash: str) -> bool: +def _get_size_mtime_ns(path: str) -> tuple[int, int]: + st = os.stat(path, follow_symlinks=True) + return st.st_size, getattr(st, "st_mtime_ns", int(st.st_mtime * 1_000_000_000)) + + +def _safe_filename(name: str | None, fallback: str) -> str: + n = os.path.basename((name or "").strip() or fallback) + if n: + return n + return fallback + + +def asset_exists(*, asset_hash: str) -> bool: + """ + Check if an asset with a given hash exists in database. + """ with create_session() as session: return asset_exists_by_hash(session, asset_hash=asset_hash) + def list_assets( + *, include_tags: Sequence[str] | None = None, exclude_tags: Sequence[str] | None = None, name_contains: str | None = None, @@ -63,7 +100,6 @@ def list_assets( size=int(asset.size_bytes) if asset else None, mime_type=asset.mime_type if asset else None, tags=tags, - preview_url=f"/api/assets/{info.id}/content", created_at=info.created_at, updated_at=info.updated_at, last_access_time=info.last_access_time, @@ -76,7 +112,12 @@ def list_assets( has_more=(offset + len(summaries)) < total, ) -def get_asset(asset_info_id: str, owner_id: str = "") -> schemas_out.AssetDetail: + +def get_asset( + *, + asset_info_id: str, + owner_id: str = "", +) -> schemas_out.AssetDetail: with create_session() as session: res = fetch_asset_info_asset_and_tags(session, asset_info_id=asset_info_id, owner_id=owner_id) if not res: @@ -97,6 +138,358 @@ def get_asset(asset_info_id: str, owner_id: str = "") -> schemas_out.AssetDetail last_access_time=info.last_access_time, ) + +def resolve_asset_content_for_download( + *, + asset_info_id: str, + owner_id: str = "", +) -> tuple[str, str, str]: + with create_session() as session: + pair = fetch_asset_info_and_asset(session, asset_info_id=asset_info_id, owner_id=owner_id) + if not pair: + raise ValueError(f"AssetInfo {asset_info_id} not found") + + info, asset = pair + states = list_cache_states_by_asset_id(session, asset_id=asset.id) + abs_path = pick_best_live_path(states) + if not abs_path: + raise FileNotFoundError + + touch_asset_info_by_id(session, asset_info_id=asset_info_id) + session.commit() + + ctype = asset.mime_type or mimetypes.guess_type(info.name or abs_path)[0] or "application/octet-stream" + download_name = info.name or os.path.basename(abs_path) + return abs_path, ctype, download_name + + +def upload_asset_from_temp_path( + spec: schemas_in.UploadAssetSpec, + *, + temp_path: str, + client_filename: str | None = None, + owner_id: str = "", + expected_asset_hash: str | None = None, +) -> schemas_out.AssetCreated: + """ + Create new asset or update existing asset from a temporary file path. + """ + try: + # NOTE: blake3 is not required right now, so this will fail if blake3 is not installed in local environment + import app.assets.hashing as hashing + digest = hashing.blake3_hash(temp_path) + except Exception as e: + raise RuntimeError(f"failed to hash uploaded file: {e}") + asset_hash = "blake3:" + digest + + if expected_asset_hash and asset_hash != expected_asset_hash.strip().lower(): + raise ValueError("HASH_MISMATCH") + + with create_session() as session: + existing = get_asset_by_hash(session, asset_hash=asset_hash) + if existing is not None: + with contextlib.suppress(Exception): + if temp_path and os.path.exists(temp_path): + os.remove(temp_path) + + display_name = _safe_filename(spec.name or (client_filename or ""), fallback=digest) + info = create_asset_info_for_existing_asset( + session, + asset_hash=asset_hash, + name=display_name, + user_metadata=spec.user_metadata or {}, + tags=spec.tags or [], + tag_origin="manual", + owner_id=owner_id, + ) + tag_names = get_asset_tags(session, asset_info_id=info.id) + session.commit() + + return schemas_out.AssetCreated( + id=info.id, + name=info.name, + asset_hash=existing.hash, + size=int(existing.size_bytes) if existing.size_bytes is not None else None, + mime_type=existing.mime_type, + tags=tag_names, + user_metadata=info.user_metadata or {}, + preview_id=info.preview_id, + created_at=info.created_at, + last_access_time=info.last_access_time, + created_new=False, + ) + + base_dir, subdirs = resolve_destination_from_tags(spec.tags) + dest_dir = os.path.join(base_dir, *subdirs) if subdirs else base_dir + os.makedirs(dest_dir, exist_ok=True) + + src_for_ext = (client_filename or spec.name or "").strip() + _ext = os.path.splitext(os.path.basename(src_for_ext))[1] if src_for_ext else "" + ext = _ext if 0 < len(_ext) <= 16 else "" + hashed_basename = f"{digest}{ext}" + dest_abs = os.path.abspath(os.path.join(dest_dir, hashed_basename)) + ensure_within_base(dest_abs, base_dir) + + content_type = ( + mimetypes.guess_type(os.path.basename(src_for_ext), strict=False)[0] + or mimetypes.guess_type(hashed_basename, strict=False)[0] + or "application/octet-stream" + ) + + try: + os.replace(temp_path, dest_abs) + except Exception as e: + raise RuntimeError(f"failed to move uploaded file into place: {e}") + + try: + size_bytes, mtime_ns = _get_size_mtime_ns(dest_abs) + except OSError as e: + raise RuntimeError(f"failed to stat destination file: {e}") + + with create_session() as session: + result = ingest_fs_asset( + session, + asset_hash=asset_hash, + abs_path=dest_abs, + size_bytes=size_bytes, + mtime_ns=mtime_ns, + mime_type=content_type, + info_name=_safe_filename(spec.name or (client_filename or ""), fallback=digest), + owner_id=owner_id, + preview_id=None, + user_metadata=spec.user_metadata or {}, + tags=spec.tags, + tag_origin="manual", + require_existing_tags=False, + ) + info_id = result["asset_info_id"] + if not info_id: + raise RuntimeError("failed to create asset metadata") + + pair = fetch_asset_info_and_asset(session, asset_info_id=info_id, owner_id=owner_id) + if not pair: + raise RuntimeError("inconsistent DB state after ingest") + info, asset = pair + tag_names = get_asset_tags(session, asset_info_id=info.id) + created_result = schemas_out.AssetCreated( + id=info.id, + name=info.name, + asset_hash=asset.hash, + size=int(asset.size_bytes), + mime_type=asset.mime_type, + tags=tag_names, + user_metadata=info.user_metadata or {}, + preview_id=info.preview_id, + created_at=info.created_at, + last_access_time=info.last_access_time, + created_new=result["asset_created"], + ) + session.commit() + + return created_result + + +def update_asset( + *, + asset_info_id: str, + name: str | None = None, + tags: list[str] | None = None, + user_metadata: dict | None = None, + owner_id: str = "", +) -> schemas_out.AssetUpdated: + with create_session() as session: + info_row = get_asset_info_by_id(session, asset_info_id=asset_info_id) + if not info_row: + raise ValueError(f"AssetInfo {asset_info_id} not found") + if info_row.owner_id and info_row.owner_id != owner_id: + raise PermissionError("not owner") + + info = update_asset_info_full( + session, + asset_info_id=asset_info_id, + name=name, + tags=tags, + user_metadata=user_metadata, + tag_origin="manual", + asset_info_row=info_row, + ) + + tag_names = get_asset_tags(session, asset_info_id=asset_info_id) + result = schemas_out.AssetUpdated( + id=info.id, + name=info.name, + asset_hash=info.asset.hash if info.asset else None, + tags=tag_names, + user_metadata=info.user_metadata or {}, + updated_at=info.updated_at, + ) + session.commit() + + return result + + +def set_asset_preview( + *, + asset_info_id: str, + preview_asset_id: str | None = None, + owner_id: str = "", +) -> schemas_out.AssetDetail: + with create_session() as session: + info_row = get_asset_info_by_id(session, asset_info_id=asset_info_id) + if not info_row: + raise ValueError(f"AssetInfo {asset_info_id} not found") + if info_row.owner_id and info_row.owner_id != owner_id: + raise PermissionError("not owner") + + set_asset_info_preview( + session, + asset_info_id=asset_info_id, + preview_asset_id=preview_asset_id, + ) + + res = fetch_asset_info_asset_and_tags(session, asset_info_id=asset_info_id, owner_id=owner_id) + if not res: + raise RuntimeError("State changed during preview update") + info, asset, tags = res + result = schemas_out.AssetDetail( + id=info.id, + name=info.name, + asset_hash=asset.hash if asset else None, + size=int(asset.size_bytes) if asset and asset.size_bytes is not None else None, + mime_type=asset.mime_type if asset else None, + tags=tags, + user_metadata=info.user_metadata or {}, + preview_id=info.preview_id, + created_at=info.created_at, + last_access_time=info.last_access_time, + ) + session.commit() + + return result + + +def delete_asset_reference(*, asset_info_id: str, owner_id: str, delete_content_if_orphan: bool = True) -> bool: + with create_session() as session: + info_row = get_asset_info_by_id(session, asset_info_id=asset_info_id) + asset_id = info_row.asset_id if info_row else None + deleted = delete_asset_info_by_id(session, asset_info_id=asset_info_id, owner_id=owner_id) + if not deleted: + session.commit() + return False + + if not delete_content_if_orphan or not asset_id: + session.commit() + return True + + still_exists = asset_info_exists_for_asset_id(session, asset_id=asset_id) + if still_exists: + session.commit() + return True + + states = list_cache_states_by_asset_id(session, asset_id=asset_id) + file_paths = [s.file_path for s in (states or []) if getattr(s, "file_path", None)] + + asset_row = session.get(Asset, asset_id) + if asset_row is not None: + session.delete(asset_row) + + session.commit() + for p in file_paths: + with contextlib.suppress(Exception): + if p and os.path.isfile(p): + os.remove(p) + return True + + +def create_asset_from_hash( + *, + hash_str: str, + name: str, + tags: list[str] | None = None, + user_metadata: dict | None = None, + owner_id: str = "", +) -> schemas_out.AssetCreated | None: + canonical = hash_str.strip().lower() + with create_session() as session: + asset = get_asset_by_hash(session, asset_hash=canonical) + if not asset: + return None + + info = create_asset_info_for_existing_asset( + session, + asset_hash=canonical, + name=_safe_filename(name, fallback=canonical.split(":", 1)[1]), + user_metadata=user_metadata or {}, + tags=tags or [], + tag_origin="manual", + owner_id=owner_id, + ) + tag_names = get_asset_tags(session, asset_info_id=info.id) + result = schemas_out.AssetCreated( + id=info.id, + name=info.name, + asset_hash=asset.hash, + size=int(asset.size_bytes), + mime_type=asset.mime_type, + tags=tag_names, + user_metadata=info.user_metadata or {}, + preview_id=info.preview_id, + created_at=info.created_at, + last_access_time=info.last_access_time, + created_new=False, + ) + session.commit() + + return result + + +def add_tags_to_asset( + *, + asset_info_id: str, + tags: list[str], + origin: str = "manual", + owner_id: str = "", +) -> schemas_out.TagsAdd: + with create_session() as session: + info_row = get_asset_info_by_id(session, asset_info_id=asset_info_id) + if not info_row: + raise ValueError(f"AssetInfo {asset_info_id} not found") + if info_row.owner_id and info_row.owner_id != owner_id: + raise PermissionError("not owner") + data = add_tags_to_asset_info( + session, + asset_info_id=asset_info_id, + tags=tags, + origin=origin, + create_if_missing=True, + asset_info_row=info_row, + ) + session.commit() + return schemas_out.TagsAdd(**data) + + +def remove_tags_from_asset( + *, + asset_info_id: str, + tags: list[str], + owner_id: str = "", +) -> schemas_out.TagsRemove: + with create_session() as session: + info_row = get_asset_info_by_id(session, asset_info_id=asset_info_id) + if not info_row: + raise ValueError(f"AssetInfo {asset_info_id} not found") + if info_row.owner_id and info_row.owner_id != owner_id: + raise PermissionError("not owner") + + data = remove_tags_from_asset_info( + session, + asset_info_id=asset_info_id, + tags=tags, + ) + session.commit() + return schemas_out.TagsRemove(**data) + + def list_tags( prefix: str | None = None, limit: int = 100, diff --git a/app/assets/scanner.py b/app/assets/scanner.py index a16e41d94..0172a5c2f 100644 --- a/app/assets/scanner.py +++ b/app/assets/scanner.py @@ -27,6 +27,7 @@ def seed_assets(roots: tuple[RootType, ...], enable_logging: bool = False) -> No t_start = time.perf_counter() created = 0 skipped_existing = 0 + orphans_pruned = 0 paths: list[str] = [] try: existing_paths: set[str] = set() @@ -38,6 +39,11 @@ def seed_assets(roots: tuple[RootType, ...], enable_logging: bool = False) -> No except Exception as e: logging.exception("fast DB scan failed for %s: %s", r, e) + try: + orphans_pruned = _prune_orphaned_assets(roots) + except Exception as e: + logging.exception("orphan pruning failed: %s", e) + if "models" in roots: paths.extend(collect_models_files()) if "input" in roots: @@ -85,15 +91,43 @@ def seed_assets(roots: tuple[RootType, ...], enable_logging: bool = False) -> No finally: if enable_logging: logging.info( - "Assets scan(roots=%s) completed in %.3fs (created=%d, skipped_existing=%d, total_seen=%d)", + "Assets scan(roots=%s) completed in %.3fs (created=%d, skipped_existing=%d, orphans_pruned=%d, total_seen=%d)", roots, time.perf_counter() - t_start, created, skipped_existing, + orphans_pruned, len(paths), ) +def _prune_orphaned_assets(roots: tuple[RootType, ...]) -> int: + """Prune cache states outside configured prefixes, then delete orphaned seed assets.""" + all_prefixes = [os.path.abspath(p) for r in roots for p in prefixes_for_root(r)] + if not all_prefixes: + return 0 + + def make_prefix_condition(prefix: str): + base = prefix if prefix.endswith(os.sep) else prefix + os.sep + escaped, esc = escape_like_prefix(base) + return AssetCacheState.file_path.like(escaped + "%", escape=esc) + + matches_valid_prefix = sqlalchemy.or_(*[make_prefix_condition(p) for p in all_prefixes]) + + orphan_subq = ( + sqlalchemy.select(Asset.id) + .outerjoin(AssetCacheState, AssetCacheState.asset_id == Asset.id) + .where(Asset.hash.is_(None), AssetCacheState.id.is_(None)) + ).scalar_subquery() + + with create_session() as sess: + sess.execute(sqlalchemy.delete(AssetCacheState).where(~matches_valid_prefix)) + sess.execute(sqlalchemy.delete(AssetInfo).where(AssetInfo.asset_id.in_(orphan_subq))) + result = sess.execute(sqlalchemy.delete(Asset).where(Asset.id.in_(orphan_subq))) + sess.commit() + return result.rowcount + + def _fast_db_consistency_pass( root: RootType, *, diff --git a/comfy/comfy_types/node_typing.py b/comfy/comfy_types/node_typing.py index 071b98332..0194b7d70 100644 --- a/comfy/comfy_types/node_typing.py +++ b/comfy/comfy_types/node_typing.py @@ -236,6 +236,8 @@ class ComfyNodeABC(ABC): """Flags a node as experimental, informing users that it may change or not work as expected.""" DEPRECATED: bool """Flags a node as deprecated, indicating to users that they should find alternatives to this node.""" + DEV_ONLY: bool + """Flags a node as dev-only, hiding it from search/menus unless dev mode is enabled.""" API_NODE: Optional[bool] """Flags a node as an API node. See: https://docs.comfy.org/tutorials/api-nodes/overview.""" diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py index cb4f52ce1..4b3a3798c 100644 --- a/comfy/latent_formats.py +++ b/comfy/latent_formats.py @@ -8,6 +8,7 @@ class LatentFormat: latent_rgb_factors_bias = None latent_rgb_factors_reshape = None taesd_decoder_name = None + spacial_downscale_ratio = 8 def process_in(self, latent): return latent * self.scale_factor @@ -80,6 +81,7 @@ class SD_X4(LatentFormat): class SC_Prior(LatentFormat): latent_channels = 16 + spacial_downscale_ratio = 42 def __init__(self): self.scale_factor = 1.0 self.latent_rgb_factors = [ @@ -102,6 +104,7 @@ class SC_Prior(LatentFormat): ] class SC_B(LatentFormat): + spacial_downscale_ratio = 4 def __init__(self): self.scale_factor = 1.0 / 0.43 self.latent_rgb_factors = [ @@ -181,6 +184,7 @@ class Flux(SD3): class Flux2(LatentFormat): latent_channels = 128 + spacial_downscale_ratio = 16 def __init__(self): self.latent_rgb_factors =[ @@ -272,6 +276,7 @@ class Mochi(LatentFormat): class LTXV(LatentFormat): latent_channels = 128 latent_dimensions = 3 + spacial_downscale_ratio = 32 def __init__(self): self.latent_rgb_factors = [ @@ -515,6 +520,7 @@ class Wan21(LatentFormat): class Wan22(Wan21): latent_channels = 48 latent_dimensions = 3 + spacial_downscale_ratio = 16 latent_rgb_factors = [ [ 0.0119, 0.0103, 0.0046], @@ -592,6 +598,7 @@ class Wan22(Wan21): class HunyuanImage21(LatentFormat): latent_channels = 64 latent_dimensions = 2 + spacial_downscale_ratio = 32 scale_factor = 0.75289 latent_rgb_factors = [ @@ -725,6 +732,7 @@ class HunyuanVideo15(LatentFormat): latent_rgb_factors_bias = [ 0.0456, -0.0202, -0.0644] latent_channels = 32 latent_dimensions = 3 + spacial_downscale_ratio = 16 scale_factor = 1.03682 taesd_decoder_name = "lighttaehy1_5" @@ -749,6 +757,7 @@ class ACEAudio(LatentFormat): class ChromaRadiance(LatentFormat): latent_channels = 3 + spacial_downscale_ratio = 1 def __init__(self): self.latent_rgb_factors = [ diff --git a/comfy/ldm/cosmos/predict2.py b/comfy/ldm/cosmos/predict2.py index 07a4fc79f..c270e6333 100644 --- a/comfy/ldm/cosmos/predict2.py +++ b/comfy/ldm/cosmos/predict2.py @@ -13,6 +13,7 @@ from torchvision import transforms import comfy.patcher_extension from comfy.ldm.modules.attention import optimized_attention +import comfy.ldm.common_dit def apply_rotary_pos_emb( t: torch.Tensor, @@ -835,6 +836,8 @@ class MiniTrainDIT(nn.Module): padding_mask: Optional[torch.Tensor] = None, **kwargs, ): + orig_shape = list(x.shape) + x = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_temporal, self.patch_spatial, self.patch_spatial)) x_B_C_T_H_W = x timesteps_B_T = timesteps crossattn_emb = context @@ -882,5 +885,5 @@ class MiniTrainDIT(nn.Module): ) x_B_T_H_W_O = self.final_layer(x_B_T_H_W_D, t_embedding_B_T_D, adaln_lora_B_T_3D=adaln_lora_B_T_3D) - x_B_C_Tt_Hp_Wp = self.unpatchify(x_B_T_H_W_O) + x_B_C_Tt_Hp_Wp = self.unpatchify(x_B_T_H_W_O)[:, :, :orig_shape[-3], :orig_shape[-2], :orig_shape[-1]] return x_B_C_Tt_Hp_Wp diff --git a/comfy/ldm/lightricks/av_model.py b/comfy/ldm/lightricks/av_model.py index c12ace241..2c6954ecd 100644 --- a/comfy/ldm/lightricks/av_model.py +++ b/comfy/ldm/lightricks/av_model.py @@ -18,12 +18,12 @@ class CompressedTimestep: def __init__(self, tensor: torch.Tensor, patches_per_frame: int): """ tensor: [batch_size, num_tokens, feature_dim] tensor where num_tokens = num_frames * patches_per_frame - patches_per_frame: Number of spatial patches per frame (height * width in latent space) + patches_per_frame: Number of spatial patches per frame (height * width in latent space), or None to disable compression """ self.batch_size, num_tokens, self.feature_dim = tensor.shape # Check if compression is valid (num_tokens must be divisible by patches_per_frame) - if num_tokens % patches_per_frame == 0 and num_tokens >= patches_per_frame: + if patches_per_frame is not None and num_tokens % patches_per_frame == 0 and num_tokens >= patches_per_frame: self.patches_per_frame = patches_per_frame self.num_frames = num_tokens // patches_per_frame @@ -215,22 +215,9 @@ class BasicAVTransformerBlock(nn.Module): return (*scale_shift_ada_values, *gate_ada_values) def forward( - self, - x: Tuple[torch.Tensor, torch.Tensor], - v_context=None, - a_context=None, - attention_mask=None, - v_timestep=None, - a_timestep=None, - v_pe=None, - a_pe=None, - v_cross_pe=None, - a_cross_pe=None, - v_cross_scale_shift_timestep=None, - a_cross_scale_shift_timestep=None, - v_cross_gate_timestep=None, - a_cross_gate_timestep=None, - transformer_options=None, + self, x: Tuple[torch.Tensor, torch.Tensor], v_context=None, a_context=None, attention_mask=None, v_timestep=None, a_timestep=None, + v_pe=None, a_pe=None, v_cross_pe=None, a_cross_pe=None, v_cross_scale_shift_timestep=None, a_cross_scale_shift_timestep=None, + v_cross_gate_timestep=None, a_cross_gate_timestep=None, transformer_options=None, ) -> Tuple[torch.Tensor, torch.Tensor]: run_vx = transformer_options.get("run_vx", True) run_ax = transformer_options.get("run_ax", True) @@ -240,144 +227,102 @@ class BasicAVTransformerBlock(nn.Module): run_a2v = run_vx and transformer_options.get("a2v_cross_attn", True) and ax.numel() > 0 run_v2a = run_ax and transformer_options.get("v2a_cross_attn", True) + # video if run_vx: - vshift_msa, vscale_msa, vgate_msa = ( - self.get_ada_values(self.scale_shift_table, vx.shape[0], v_timestep, slice(0, 3)) - ) - + # video self-attention + vshift_msa, vscale_msa = (self.get_ada_values(self.scale_shift_table, vx.shape[0], v_timestep, slice(0, 2))) norm_vx = comfy.ldm.common_dit.rms_norm(vx) * (1 + vscale_msa) + vshift_msa - vx += self.attn1(norm_vx, pe=v_pe, transformer_options=transformer_options) * vgate_msa - vx += self.attn2( - comfy.ldm.common_dit.rms_norm(vx), - context=v_context, - mask=attention_mask, - transformer_options=transformer_options, - ) - - del vshift_msa, vscale_msa, vgate_msa + del vshift_msa, vscale_msa + attn1_out = self.attn1(norm_vx, pe=v_pe, transformer_options=transformer_options) + del norm_vx + # video cross-attention + vgate_msa = self.get_ada_values(self.scale_shift_table, vx.shape[0], v_timestep, slice(2, 3))[0] + vx.addcmul_(attn1_out, vgate_msa) + del vgate_msa, attn1_out + vx.add_(self.attn2(comfy.ldm.common_dit.rms_norm(vx), context=v_context, mask=attention_mask, transformer_options=transformer_options)) + # audio if run_ax: - ashift_msa, ascale_msa, agate_msa = ( - self.get_ada_values(self.audio_scale_shift_table, ax.shape[0], a_timestep, slice(0, 3)) - ) - + # audio self-attention + ashift_msa, ascale_msa = (self.get_ada_values(self.audio_scale_shift_table, ax.shape[0], a_timestep, slice(0, 2))) norm_ax = comfy.ldm.common_dit.rms_norm(ax) * (1 + ascale_msa) + ashift_msa - ax += ( - self.audio_attn1(norm_ax, pe=a_pe, transformer_options=transformer_options) - * agate_msa - ) - ax += self.audio_attn2( - comfy.ldm.common_dit.rms_norm(ax), - context=a_context, - mask=attention_mask, - transformer_options=transformer_options, - ) + del ashift_msa, ascale_msa + attn1_out = self.audio_attn1(norm_ax, pe=a_pe, transformer_options=transformer_options) + del norm_ax + # audio cross-attention + agate_msa = self.get_ada_values(self.audio_scale_shift_table, ax.shape[0], a_timestep, slice(2, 3))[0] + ax.addcmul_(attn1_out, agate_msa) + del agate_msa, attn1_out + ax.add_(self.audio_attn2(comfy.ldm.common_dit.rms_norm(ax), context=a_context, mask=attention_mask, transformer_options=transformer_options)) - del ashift_msa, ascale_msa, agate_msa - - # Audio - Video cross attention. + # video - audio cross attention. if run_a2v or run_v2a: - # norm3 vx_norm3 = comfy.ldm.common_dit.rms_norm(vx) ax_norm3 = comfy.ldm.common_dit.rms_norm(ax) - ( - scale_ca_audio_hidden_states_a2v, - shift_ca_audio_hidden_states_a2v, - scale_ca_audio_hidden_states_v2a, - shift_ca_audio_hidden_states_v2a, - gate_out_v2a, - ) = self.get_av_ca_ada_values( - self.scale_shift_table_a2v_ca_audio, - ax.shape[0], - a_cross_scale_shift_timestep, - a_cross_gate_timestep, - ) - - ( - scale_ca_video_hidden_states_a2v, - shift_ca_video_hidden_states_a2v, - scale_ca_video_hidden_states_v2a, - shift_ca_video_hidden_states_v2a, - gate_out_a2v, - ) = self.get_av_ca_ada_values( - self.scale_shift_table_a2v_ca_video, - vx.shape[0], - v_cross_scale_shift_timestep, - v_cross_gate_timestep, - ) - + # audio to video cross attention if run_a2v: - vx_scaled = ( - vx_norm3 * (1 + scale_ca_video_hidden_states_a2v) - + shift_ca_video_hidden_states_a2v - ) - ax_scaled = ( - ax_norm3 * (1 + scale_ca_audio_hidden_states_a2v) - + shift_ca_audio_hidden_states_a2v - ) - vx += ( - self.audio_to_video_attn( - vx_scaled, - context=ax_scaled, - pe=v_cross_pe, - k_pe=a_cross_pe, - transformer_options=transformer_options, - ) - * gate_out_a2v - ) + scale_ca_audio_hidden_states_a2v, shift_ca_audio_hidden_states_a2v = self.get_ada_values( + self.scale_shift_table_a2v_ca_audio[:4, :], ax.shape[0], a_cross_scale_shift_timestep)[:2] + scale_ca_video_hidden_states_a2v_v, shift_ca_video_hidden_states_a2v_v = self.get_ada_values( + self.scale_shift_table_a2v_ca_video[:4, :], vx.shape[0], v_cross_scale_shift_timestep)[:2] - del gate_out_a2v - del scale_ca_video_hidden_states_a2v,\ - shift_ca_video_hidden_states_a2v,\ - scale_ca_audio_hidden_states_a2v,\ - shift_ca_audio_hidden_states_a2v,\ + vx_scaled = vx_norm3 * (1 + scale_ca_video_hidden_states_a2v_v) + shift_ca_video_hidden_states_a2v_v + ax_scaled = ax_norm3 * (1 + scale_ca_audio_hidden_states_a2v) + shift_ca_audio_hidden_states_a2v + del scale_ca_video_hidden_states_a2v_v, shift_ca_video_hidden_states_a2v_v, scale_ca_audio_hidden_states_a2v, shift_ca_audio_hidden_states_a2v + a2v_out = self.audio_to_video_attn(vx_scaled, context=ax_scaled, pe=v_cross_pe, k_pe=a_cross_pe, transformer_options=transformer_options) + del vx_scaled, ax_scaled + + gate_out_a2v = self.get_ada_values(self.scale_shift_table_a2v_ca_video[4:, :], vx.shape[0], v_cross_gate_timestep)[0] + vx.addcmul_(a2v_out, gate_out_a2v) + del gate_out_a2v, a2v_out + + # video to audio cross attention if run_v2a: - ax_scaled = ( - ax_norm3 * (1 + scale_ca_audio_hidden_states_v2a) - + shift_ca_audio_hidden_states_v2a - ) - vx_scaled = ( - vx_norm3 * (1 + scale_ca_video_hidden_states_v2a) - + shift_ca_video_hidden_states_v2a - ) - ax += ( - self.video_to_audio_attn( - ax_scaled, - context=vx_scaled, - pe=a_cross_pe, - k_pe=v_cross_pe, - transformer_options=transformer_options, - ) - * gate_out_v2a - ) + scale_ca_audio_hidden_states_v2a, shift_ca_audio_hidden_states_v2a = self.get_ada_values( + self.scale_shift_table_a2v_ca_audio[:4, :], ax.shape[0], a_cross_scale_shift_timestep)[2:4] + scale_ca_video_hidden_states_v2a, shift_ca_video_hidden_states_v2a = self.get_ada_values( + self.scale_shift_table_a2v_ca_video[:4, :], vx.shape[0], v_cross_scale_shift_timestep)[2:4] - del gate_out_v2a - del scale_ca_video_hidden_states_v2a,\ - shift_ca_video_hidden_states_v2a,\ - scale_ca_audio_hidden_states_v2a,\ - shift_ca_audio_hidden_states_v2a + ax_scaled = ax_norm3 * (1 + scale_ca_audio_hidden_states_v2a) + shift_ca_audio_hidden_states_v2a + vx_scaled = vx_norm3 * (1 + scale_ca_video_hidden_states_v2a) + shift_ca_video_hidden_states_v2a + del scale_ca_video_hidden_states_v2a, shift_ca_video_hidden_states_v2a, scale_ca_audio_hidden_states_v2a, shift_ca_audio_hidden_states_v2a + v2a_out = self.video_to_audio_attn(ax_scaled, context=vx_scaled, pe=a_cross_pe, k_pe=v_cross_pe, transformer_options=transformer_options) + del ax_scaled, vx_scaled + + gate_out_v2a = self.get_ada_values(self.scale_shift_table_a2v_ca_audio[4:, :], ax.shape[0], a_cross_gate_timestep)[0] + ax.addcmul_(v2a_out, gate_out_v2a) + del gate_out_v2a, v2a_out + + del vx_norm3, ax_norm3 + + # video feedforward if run_vx: - vshift_mlp, vscale_mlp, vgate_mlp = ( - self.get_ada_values(self.scale_shift_table, vx.shape[0], v_timestep, slice(3, None)) - ) - + vshift_mlp, vscale_mlp = self.get_ada_values(self.scale_shift_table, vx.shape[0], v_timestep, slice(3, 5)) vx_scaled = comfy.ldm.common_dit.rms_norm(vx) * (1 + vscale_mlp) + vshift_mlp - vx += self.ff(vx_scaled) * vgate_mlp - del vshift_mlp, vscale_mlp, vgate_mlp + del vshift_mlp, vscale_mlp + ff_out = self.ff(vx_scaled) + del vx_scaled + + vgate_mlp = self.get_ada_values(self.scale_shift_table, vx.shape[0], v_timestep, slice(5, 6))[0] + vx.addcmul_(ff_out, vgate_mlp) + del vgate_mlp, ff_out + + # audio feedforward if run_ax: - ashift_mlp, ascale_mlp, agate_mlp = ( - self.get_ada_values(self.audio_scale_shift_table, ax.shape[0], a_timestep, slice(3, None)) - ) - + ashift_mlp, ascale_mlp = self.get_ada_values(self.audio_scale_shift_table, ax.shape[0], a_timestep, slice(3, 5)) ax_scaled = comfy.ldm.common_dit.rms_norm(ax) * (1 + ascale_mlp) + ashift_mlp - ax += self.audio_ff(ax_scaled) * agate_mlp + del ashift_mlp, ascale_mlp - del ashift_mlp, ascale_mlp, agate_mlp + ff_out = self.audio_ff(ax_scaled) + del ax_scaled + agate_mlp = self.get_ada_values(self.audio_scale_shift_table, ax.shape[0], a_timestep, slice(5, 6))[0] + ax.addcmul_(ff_out, agate_mlp) + del agate_mlp, ff_out return vx, ax @@ -589,9 +534,20 @@ class LTXAVModel(LTXVModel): audio_length = kwargs.get("audio_length", 0) # Separate audio and video latents vx, ax = self.separate_audio_and_video_latents(x, audio_length) + + has_spatial_mask = False + if denoise_mask is not None: + # check if any frame has spatial variation (inpainting) + for frame_idx in range(denoise_mask.shape[2]): + frame_mask = denoise_mask[0, 0, frame_idx] + if frame_mask.numel() > 0 and frame_mask.min() != frame_mask.max(): + has_spatial_mask = True + break + [vx, v_pixel_coords, additional_args] = super()._process_input( vx, keyframe_idxs, denoise_mask, **kwargs ) + additional_args["has_spatial_mask"] = has_spatial_mask ax, a_latent_coords = self.a_patchifier.patchify(ax) ax = self.audio_patchify_proj(ax) @@ -618,8 +574,9 @@ class LTXAVModel(LTXVModel): # Calculate patches_per_frame from orig_shape: [batch, channels, frames, height, width] # Video tokens are arranged as (frames * height * width), so patches_per_frame = height * width orig_shape = kwargs.get("orig_shape") + has_spatial_mask = kwargs.get("has_spatial_mask", None) v_patches_per_frame = None - if orig_shape is not None and len(orig_shape) == 5: + if not has_spatial_mask and orig_shape is not None and len(orig_shape) == 5: # orig_shape[3] = height, orig_shape[4] = width (in latent space) v_patches_per_frame = orig_shape[3] * orig_shape[4] @@ -662,10 +619,11 @@ class LTXAVModel(LTXVModel): ) # Compress cross-attention timesteps (only video side, audio is too small to benefit) + # v_patches_per_frame is None for spatial masks, set for temporal masks or no mask cross_av_timestep_ss = [ av_ca_audio_scale_shift_timestep.view(batch_size, -1, av_ca_audio_scale_shift_timestep.shape[-1]), - CompressedTimestep(av_ca_video_scale_shift_timestep.view(batch_size, -1, av_ca_video_scale_shift_timestep.shape[-1]), v_patches_per_frame), # video - compressed - CompressedTimestep(av_ca_a2v_gate_noise_timestep.view(batch_size, -1, av_ca_a2v_gate_noise_timestep.shape[-1]), v_patches_per_frame), # video - compressed + CompressedTimestep(av_ca_video_scale_shift_timestep.view(batch_size, -1, av_ca_video_scale_shift_timestep.shape[-1]), v_patches_per_frame), # video - compressed if possible + CompressedTimestep(av_ca_a2v_gate_noise_timestep.view(batch_size, -1, av_ca_a2v_gate_noise_timestep.shape[-1]), v_patches_per_frame), # video - compressed if possible av_ca_v2a_gate_noise_timestep.view(batch_size, -1, av_ca_v2a_gate_noise_timestep.shape[-1]), ] diff --git a/comfy/ldm/lightricks/vae/causal_conv3d.py b/comfy/ldm/lightricks/vae/causal_conv3d.py index 70d612e86..b8341edbc 100644 --- a/comfy/ldm/lightricks/vae/causal_conv3d.py +++ b/comfy/ldm/lightricks/vae/causal_conv3d.py @@ -1,11 +1,11 @@ from typing import Tuple, Union +import threading import torch import torch.nn as nn import comfy.ops ops = comfy.ops.disable_weight_init - class CausalConv3d(nn.Module): def __init__( self, @@ -42,23 +42,34 @@ class CausalConv3d(nn.Module): padding_mode=spatial_padding_mode, groups=groups, ) + self.temporal_cache_state={} def forward(self, x, causal: bool = True): - if causal: - first_frame_pad = x[:, :, :1, :, :].repeat( - (1, 1, self.time_kernel_size - 1, 1, 1) - ) - x = torch.concatenate((first_frame_pad, x), dim=2) - else: - first_frame_pad = x[:, :, :1, :, :].repeat( - (1, 1, (self.time_kernel_size - 1) // 2, 1, 1) - ) - last_frame_pad = x[:, :, -1:, :, :].repeat( - (1, 1, (self.time_kernel_size - 1) // 2, 1, 1) - ) - x = torch.concatenate((first_frame_pad, x, last_frame_pad), dim=2) - x = self.conv(x) - return x + tid = threading.get_ident() + + cached, is_end = self.temporal_cache_state.get(tid, (None, False)) + if cached is None: + padding_length = self.time_kernel_size - 1 + if not causal: + padding_length = padding_length // 2 + if x.shape[2] == 0: + return x + cached = x[:, :, :1, :, :].repeat((1, 1, padding_length, 1, 1)) + pieces = [ cached, x ] + if is_end and not causal: + pieces.append(x[:, :, -1:, :, :].repeat((1, 1, (self.time_kernel_size - 1) // 2, 1, 1))) + + needs_caching = not is_end + if needs_caching and x.shape[2] >= self.time_kernel_size - 1: + needs_caching = False + self.temporal_cache_state[tid] = (x[:, :, -(self.time_kernel_size - 1):, :, :], False) + + x = torch.cat(pieces, dim=2) + + if needs_caching: + self.temporal_cache_state[tid] = (x[:, :, -(self.time_kernel_size - 1):, :, :], False) + + return self.conv(x) if x.shape[2] >= self.time_kernel_size else x[:, :, :0, :, :] @property def weight(self): diff --git a/comfy/ldm/lightricks/vae/causal_video_autoencoder.py b/comfy/ldm/lightricks/vae/causal_video_autoencoder.py index 75ed069ad..cbfdf412d 100644 --- a/comfy/ldm/lightricks/vae/causal_video_autoencoder.py +++ b/comfy/ldm/lightricks/vae/causal_video_autoencoder.py @@ -1,4 +1,5 @@ from __future__ import annotations +import threading import torch from torch import nn from functools import partial @@ -6,12 +7,35 @@ import math from einops import rearrange from typing import List, Optional, Tuple, Union from .conv_nd_factory import make_conv_nd, make_linear_nd +from .causal_conv3d import CausalConv3d from .pixel_norm import PixelNorm from ..model import PixArtAlphaCombinedTimestepSizeEmbeddings import comfy.ops +from comfy.ldm.modules.diffusionmodules.model import torch_cat_if_needed ops = comfy.ops.disable_weight_init +def mark_conv3d_ended(module): + tid = threading.get_ident() + for _, m in module.named_modules(): + if isinstance(m, CausalConv3d): + current = m.temporal_cache_state.get(tid, (None, False)) + m.temporal_cache_state[tid] = (current[0], True) + +def split2(tensor, split_point, dim=2): + return torch.split(tensor, [split_point, tensor.shape[dim] - split_point], dim=dim) + +def add_exchange_cache(dest, cache_in, new_input, dim=2): + if dest is not None: + if cache_in is not None: + cache_to_dest = min(dest.shape[dim], cache_in.shape[dim]) + lead_in_dest, dest = split2(dest, cache_to_dest, dim=dim) + lead_in_source, cache_in = split2(cache_in, cache_to_dest, dim=dim) + lead_in_dest.add_(lead_in_source) + body, new_input = split2(new_input, dest.shape[dim], dim) + dest.add_(body) + return torch_cat_if_needed([cache_in, new_input], dim=dim) + class Encoder(nn.Module): r""" The `Encoder` layer of a variational autoencoder that encodes its input into a latent representation. @@ -205,7 +229,7 @@ class Encoder(nn.Module): self.gradient_checkpointing = False - def forward(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def forward_orig(self, sample: torch.FloatTensor) -> torch.FloatTensor: r"""The forward method of the `Encoder` class.""" sample = patchify(sample, patch_size_hw=self.patch_size, patch_size_t=1) @@ -254,6 +278,22 @@ class Encoder(nn.Module): return sample + def forward(self, *args, **kwargs): + #No encoder support so just flag the end so it doesnt use the cache. + mark_conv3d_ended(self) + try: + return self.forward_orig(*args, **kwargs) + finally: + tid = threading.get_ident() + for _, module in self.named_modules(): + # ComfyUI doesn't thread this kind of stuff today, but just in case + # we key on the thread to make it thread safe. + tid = threading.get_ident() + if hasattr(module, "temporal_cache_state"): + module.temporal_cache_state.pop(tid, None) + + +MAX_CHUNK_SIZE=(128 * 1024 ** 2) class Decoder(nn.Module): r""" @@ -341,18 +381,6 @@ class Decoder(nn.Module): timestep_conditioning=timestep_conditioning, spatial_padding_mode=spatial_padding_mode, ) - elif block_name == "attn_res_x": - block = UNetMidBlock3D( - dims=dims, - in_channels=input_channel, - num_layers=block_params["num_layers"], - resnet_groups=norm_num_groups, - norm_layer=norm_layer, - inject_noise=block_params.get("inject_noise", False), - timestep_conditioning=timestep_conditioning, - attention_head_dim=block_params["attention_head_dim"], - spatial_padding_mode=spatial_padding_mode, - ) elif block_name == "res_x_y": output_channel = output_channel // block_params.get("multiplier", 2) block = ResnetBlock3D( @@ -428,8 +456,9 @@ class Decoder(nn.Module): ) self.last_scale_shift_table = nn.Parameter(torch.empty(2, output_channel)) + # def forward(self, sample: torch.FloatTensor, target_shape) -> torch.FloatTensor: - def forward( + def forward_orig( self, sample: torch.FloatTensor, timestep: Optional[torch.Tensor] = None, @@ -437,6 +466,7 @@ class Decoder(nn.Module): r"""The forward method of the `Decoder` class.""" batch_size = sample.shape[0] + mark_conv3d_ended(self.conv_in) sample = self.conv_in(sample, causal=self.causal) checkpoint_fn = ( @@ -445,24 +475,12 @@ class Decoder(nn.Module): else lambda x: x ) - scaled_timestep = None + timestep_shift_scale = None if self.timestep_conditioning: assert ( timestep is not None ), "should pass timestep with timestep_conditioning=True" scaled_timestep = timestep * self.timestep_scale_multiplier.to(dtype=sample.dtype, device=sample.device) - - for up_block in self.up_blocks: - if self.timestep_conditioning and isinstance(up_block, UNetMidBlock3D): - sample = checkpoint_fn(up_block)( - sample, causal=self.causal, timestep=scaled_timestep - ) - else: - sample = checkpoint_fn(up_block)(sample, causal=self.causal) - - sample = self.conv_norm_out(sample) - - if self.timestep_conditioning: embedded_timestep = self.last_time_embedder( timestep=scaled_timestep.flatten(), resolution=None, @@ -483,16 +501,62 @@ class Decoder(nn.Module): embedded_timestep.shape[-2], embedded_timestep.shape[-1], ) - shift, scale = ada_values.unbind(dim=1) - sample = sample * (1 + scale) + shift + timestep_shift_scale = ada_values.unbind(dim=1) - sample = self.conv_act(sample) - sample = self.conv_out(sample, causal=self.causal) + output = [] + + def run_up(idx, sample, ended): + if idx >= len(self.up_blocks): + sample = self.conv_norm_out(sample) + if timestep_shift_scale is not None: + shift, scale = timestep_shift_scale + sample = sample * (1 + scale) + shift + sample = self.conv_act(sample) + if ended: + mark_conv3d_ended(self.conv_out) + sample = self.conv_out(sample, causal=self.causal) + if sample is not None and sample.shape[2] > 0: + output.append(sample) + return + + up_block = self.up_blocks[idx] + if (ended): + mark_conv3d_ended(up_block) + if self.timestep_conditioning and isinstance(up_block, UNetMidBlock3D): + sample = checkpoint_fn(up_block)( + sample, causal=self.causal, timestep=scaled_timestep + ) + else: + sample = checkpoint_fn(up_block)(sample, causal=self.causal) + + if sample is None or sample.shape[2] == 0: + return + + total_bytes = sample.numel() * sample.element_size() + num_chunks = (total_bytes + MAX_CHUNK_SIZE - 1) // MAX_CHUNK_SIZE + samples = torch.chunk(sample, chunks=num_chunks, dim=2) + + for chunk_idx, sample1 in enumerate(samples): + run_up(idx + 1, sample1, ended and chunk_idx == len(samples) - 1) + + run_up(0, sample, True) + sample = torch.cat(output, dim=2) sample = unpatchify(sample, patch_size_hw=self.patch_size, patch_size_t=1) return sample + def forward(self, *args, **kwargs): + try: + return self.forward_orig(*args, **kwargs) + finally: + for _, module in self.named_modules(): + #ComfyUI doesn't thread this kind of stuff today, but just incase + #we key on the thread to make it thread safe. + tid = threading.get_ident() + if hasattr(module, "temporal_cache_state"): + module.temporal_cache_state.pop(tid, None) + class UNetMidBlock3D(nn.Module): """ @@ -663,8 +727,22 @@ class DepthToSpaceUpsample(nn.Module): ) self.residual = residual self.out_channels_reduction_factor = out_channels_reduction_factor + self.temporal_cache_state = {} def forward(self, x, causal: bool = True, timestep: Optional[torch.Tensor] = None): + tid = threading.get_ident() + cached, drop_first_conv, drop_first_res = self.temporal_cache_state.get(tid, (None, True, True)) + y = self.conv(x, causal=causal) + y = rearrange( + y, + "b (c p1 p2 p3) d h w -> b c (d p1) (h p2) (w p3)", + p1=self.stride[0], + p2=self.stride[1], + p3=self.stride[2], + ) + if self.stride[0] == 2 and y.shape[2] > 0 and drop_first_conv: + y = y[:, :, 1:, :, :] + drop_first_conv = False if self.residual: # Reshape and duplicate the input to match the output shape x_in = rearrange( @@ -676,21 +754,20 @@ class DepthToSpaceUpsample(nn.Module): ) num_repeat = math.prod(self.stride) // self.out_channels_reduction_factor x_in = x_in.repeat(1, num_repeat, 1, 1, 1) - if self.stride[0] == 2: + if self.stride[0] == 2 and x_in.shape[2] > 0 and drop_first_res: x_in = x_in[:, :, 1:, :, :] - x = self.conv(x, causal=causal) - x = rearrange( - x, - "b (c p1 p2 p3) d h w -> b c (d p1) (h p2) (w p3)", - p1=self.stride[0], - p2=self.stride[1], - p3=self.stride[2], - ) - if self.stride[0] == 2: - x = x[:, :, 1:, :, :] - if self.residual: - x = x + x_in - return x + drop_first_res = False + + if y.shape[2] == 0: + y = None + + cached = add_exchange_cache(y, cached, x_in, dim=2) + self.temporal_cache_state[tid] = (cached, drop_first_conv, drop_first_res) + + else: + self.temporal_cache_state[tid] = (None, drop_first_conv, False) + + return y class LayerNorm(nn.Module): def __init__(self, dim, eps, elementwise_affine=True) -> None: @@ -807,6 +884,8 @@ class ResnetBlock3D(nn.Module): torch.randn(4, in_channels) / in_channels**0.5 ) + self.temporal_cache_state={} + def _feed_spatial_noise( self, hidden_states: torch.FloatTensor, per_channel_scale: torch.FloatTensor ) -> torch.FloatTensor: @@ -880,9 +959,12 @@ class ResnetBlock3D(nn.Module): input_tensor = self.conv_shortcut(input_tensor) - output_tensor = input_tensor + hidden_states + tid = threading.get_ident() + cached = self.temporal_cache_state.get(tid, None) + cached = add_exchange_cache(hidden_states, cached, input_tensor, dim=2) + self.temporal_cache_state[tid] = cached - return output_tensor + return hidden_states def patchify(x, patch_size_hw, patch_size_t=1): diff --git a/comfy/ldm/lumina/model.py b/comfy/ldm/lumina/model.py index b114d9e31..77d1abc97 100644 --- a/comfy/ldm/lumina/model.py +++ b/comfy/ldm/lumina/model.py @@ -451,6 +451,7 @@ class NextDiT(nn.Module): device=None, dtype=None, operations=None, + **kwargs, ) -> None: super().__init__() self.dtype = dtype diff --git a/comfy/ldm/modules/diffusionmodules/model.py b/comfy/ldm/modules/diffusionmodules/model.py index 1ae3ef034..5a22ef030 100644 --- a/comfy/ldm/modules/diffusionmodules/model.py +++ b/comfy/ldm/modules/diffusionmodules/model.py @@ -14,10 +14,13 @@ if model_management.xformers_enabled_vae(): import xformers.ops def torch_cat_if_needed(xl, dim): + xl = [x for x in xl if x is not None and x.shape[dim] > 0] if len(xl) > 1: return torch.cat(xl, dim) - else: + elif len(xl) == 1: return xl[0] + else: + return None def get_timestep_embedding(timesteps, embedding_dim): """ diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index 00c597535..6eb744286 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -170,8 +170,14 @@ class Attention(nn.Module): joint_query = apply_rope1(joint_query, image_rotary_emb) joint_key = apply_rope1(joint_key, image_rotary_emb) + if encoder_hidden_states_mask is not None: + attn_mask = torch.zeros((batch_size, 1, seq_txt + seq_img), dtype=hidden_states.dtype, device=hidden_states.device) + attn_mask[:, 0, :seq_txt] = encoder_hidden_states_mask + else: + attn_mask = None + joint_hidden_states = optimized_attention_masked(joint_query, joint_key, joint_value, self.heads, - attention_mask, transformer_options=transformer_options, + attn_mask, transformer_options=transformer_options, skip_reshape=True) txt_attn_output = joint_hidden_states[:, :seq_txt, :] @@ -430,6 +436,9 @@ class QwenImageTransformer2DModel(nn.Module): encoder_hidden_states = context encoder_hidden_states_mask = attention_mask + if encoder_hidden_states_mask is not None and not torch.is_floating_point(encoder_hidden_states_mask): + encoder_hidden_states_mask = (encoder_hidden_states_mask - 1).to(x.dtype) * torch.finfo(x.dtype).max + hidden_states, img_ids, orig_shape = self.process_img(x) num_embeds = hidden_states.shape[1] diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index 4216ce831..ea123acb4 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -62,6 +62,8 @@ class WanSelfAttention(nn.Module): x(Tensor): Shape [B, L, num_heads, C / num_heads] freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2] """ + patches = transformer_options.get("patches", {}) + b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim def qkv_fn_q(x): @@ -86,6 +88,10 @@ class WanSelfAttention(nn.Module): transformer_options=transformer_options, ) + if "attn1_patch" in patches: + for p in patches["attn1_patch"]: + x = p({"x": x, "q": q, "k": k, "transformer_options": transformer_options}) + x = self.o(x) return x @@ -225,6 +231,8 @@ class WanAttentionBlock(nn.Module): """ # assert e.dtype == torch.float32 + patches = transformer_options.get("patches", {}) + if e.ndim < 4: e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device) + e).chunk(6, dim=1) else: @@ -242,6 +250,11 @@ class WanAttentionBlock(nn.Module): # cross-attention & ffn x = x + self.cross_attn(self.norm3(x), context, context_img_len=context_img_len, transformer_options=transformer_options) + + if "attn2_patch" in patches: + for p in patches["attn2_patch"]: + x = p({"x": x, "transformer_options": transformer_options}) + y = self.ffn(torch.addcmul(repeat_e(e[3], x), self.norm2(x), 1 + repeat_e(e[4], x))) x = torch.addcmul(x, y, repeat_e(e[5], x)) return x @@ -488,7 +501,7 @@ class WanModel(torch.nn.Module): self.blocks = nn.ModuleList([ wan_attn_block_class(cross_attn_type, dim, ffn_dim, num_heads, window_size, qk_norm, cross_attn_norm, eps, operation_settings=operation_settings) - for _ in range(num_layers) + for i in range(num_layers) ]) # head @@ -541,6 +554,7 @@ class WanModel(torch.nn.Module): # embeddings x = self.patch_embedding(x.float()).to(x.dtype) grid_sizes = x.shape[2:] + transformer_options["grid_sizes"] = grid_sizes x = x.flatten(2).transpose(1, 2) # time embeddings @@ -738,6 +752,7 @@ class VaceWanModel(WanModel): # embeddings x = self.patch_embedding(x.float()).to(x.dtype) grid_sizes = x.shape[2:] + transformer_options["grid_sizes"] = grid_sizes x = x.flatten(2).transpose(1, 2) # time embeddings diff --git a/comfy/ldm/wan/model_multitalk.py b/comfy/ldm/wan/model_multitalk.py new file mode 100644 index 000000000..c9dd98c4d --- /dev/null +++ b/comfy/ldm/wan/model_multitalk.py @@ -0,0 +1,500 @@ +import torch +from einops import rearrange, repeat +import comfy +from comfy.ldm.modules.attention import optimized_attention + + +def calculate_x_ref_attn_map(visual_q, ref_k, ref_target_masks, split_num=8): + scale = 1.0 / visual_q.shape[-1] ** 0.5 + visual_q = visual_q.transpose(1, 2) * scale + + B, H, x_seqlens, K = visual_q.shape + + x_ref_attn_maps = [] + for class_idx, ref_target_mask in enumerate(ref_target_masks): + ref_target_mask = ref_target_mask.view(1, 1, 1, -1) + + x_ref_attnmap = torch.zeros(B, H, x_seqlens, device=visual_q.device, dtype=visual_q.dtype) + chunk_size = min(max(x_seqlens // split_num, 1), x_seqlens) + + for i in range(0, x_seqlens, chunk_size): + end_i = min(i + chunk_size, x_seqlens) + + attn_chunk = visual_q[:, :, i:end_i] @ ref_k.permute(0, 2, 3, 1) # B, H, chunk, ref_seqlens + + # Apply softmax + attn_max = attn_chunk.max(dim=-1, keepdim=True).values + attn_chunk = (attn_chunk - attn_max).exp() + attn_sum = attn_chunk.sum(dim=-1, keepdim=True) + attn_chunk = attn_chunk / (attn_sum + 1e-8) + + # Apply mask and sum + masked_attn = attn_chunk * ref_target_mask + x_ref_attnmap[:, :, i:end_i] = masked_attn.sum(-1) / (ref_target_mask.sum() + 1e-8) + + del attn_chunk, masked_attn + + # Average across heads + x_ref_attnmap = x_ref_attnmap.mean(dim=1) # B, x_seqlens + x_ref_attn_maps.append(x_ref_attnmap) + + del visual_q, ref_k + + return torch.cat(x_ref_attn_maps, dim=0) + +def get_attn_map_with_target(visual_q, ref_k, shape, ref_target_masks=None, split_num=2): + """Args: + query (torch.tensor): B M H K + key (torch.tensor): B M H K + shape (tuple): (N_t, N_h, N_w) + ref_target_masks: [B, N_h * N_w] + """ + + N_t, N_h, N_w = shape + + x_seqlens = N_h * N_w + ref_k = ref_k[:, :x_seqlens] + _, seq_lens, heads, _ = visual_q.shape + class_num, _ = ref_target_masks.shape + x_ref_attn_maps = torch.zeros(class_num, seq_lens).to(visual_q) + + split_chunk = heads // split_num + + for i in range(split_num): + x_ref_attn_maps_perhead = calculate_x_ref_attn_map( + visual_q[:, :, i*split_chunk:(i+1)*split_chunk, :], + ref_k[:, :, i*split_chunk:(i+1)*split_chunk, :], + ref_target_masks + ) + x_ref_attn_maps += x_ref_attn_maps_perhead + + return x_ref_attn_maps / split_num + + +def normalize_and_scale(column, source_range, target_range, epsilon=1e-8): + source_min, source_max = source_range + new_min, new_max = target_range + normalized = (column - source_min) / (source_max - source_min + epsilon) + scaled = normalized * (new_max - new_min) + new_min + return scaled + + +def rotate_half(x): + x = rearrange(x, "... (d r) -> ... d r", r=2) + x1, x2 = x.unbind(dim=-1) + x = torch.stack((-x2, x1), dim=-1) + return rearrange(x, "... d r -> ... (d r)") + + +def get_audio_embeds(encoded_audio, audio_start, audio_end): + audio_embs = [] + human_num = len(encoded_audio) + audio_frames = encoded_audio[0].shape[0] + + indices = (torch.arange(4 + 1) - 2) * 1 + + for human_idx in range(human_num): + if audio_end > audio_frames: # in case of not enough audio for current window, pad with first audio frame as that's most likely silence + pad_len = audio_end - audio_frames + pad_shape = list(encoded_audio[human_idx].shape) + pad_shape[0] = pad_len + pad_tensor = encoded_audio[human_idx][:1].repeat(pad_len, *([1] * (encoded_audio[human_idx].dim() - 1))) + encoded_audio_in = torch.cat([encoded_audio[human_idx], pad_tensor], dim=0) + else: + encoded_audio_in = encoded_audio[human_idx] + center_indices = torch.arange(audio_start, audio_end, 1).unsqueeze(1) + indices.unsqueeze(0) + center_indices = torch.clamp(center_indices, min=0, max=encoded_audio_in.shape[0] - 1) + audio_emb = encoded_audio_in[center_indices].unsqueeze(0) + audio_embs.append(audio_emb) + + return torch.cat(audio_embs, dim=0) + + +def project_audio_features(audio_proj, encoded_audio, audio_start, audio_end): + audio_embs = get_audio_embeds(encoded_audio, audio_start, audio_end) + + first_frame_audio_emb_s = audio_embs[:, :1, ...] + latter_frame_audio_emb = audio_embs[:, 1:, ...] + latter_frame_audio_emb = rearrange(latter_frame_audio_emb, "b (n_t n) w s c -> b n_t n w s c", n=4) + + middle_index = audio_proj.seq_len // 2 + + latter_first_frame_audio_emb = latter_frame_audio_emb[:, :, :1, :middle_index+1, ...] + latter_first_frame_audio_emb = rearrange(latter_first_frame_audio_emb, "b n_t n w s c -> b n_t (n w) s c") + latter_last_frame_audio_emb = latter_frame_audio_emb[:, :, -1:, middle_index:, ...] + latter_last_frame_audio_emb = rearrange(latter_last_frame_audio_emb, "b n_t n w s c -> b n_t (n w) s c") + latter_middle_frame_audio_emb = latter_frame_audio_emb[:, :, 1:-1, middle_index:middle_index+1, ...] + latter_middle_frame_audio_emb = rearrange(latter_middle_frame_audio_emb, "b n_t n w s c -> b n_t (n w) s c") + latter_frame_audio_emb_s = torch.cat([latter_first_frame_audio_emb, latter_middle_frame_audio_emb, latter_last_frame_audio_emb], dim=2) + + audio_emb = audio_proj(first_frame_audio_emb_s, latter_frame_audio_emb_s) + audio_emb = torch.cat(audio_emb.split(1), dim=2) + + return audio_emb + + +class RotaryPositionalEmbedding1D(torch.nn.Module): + def __init__(self, + head_dim, + ): + super().__init__() + self.head_dim = head_dim + self.base = 10000 + + def precompute_freqs_cis_1d(self, pos_indices): + freqs = 1.0 / (self.base ** (torch.arange(0, self.head_dim, 2)[: (self.head_dim // 2)].float() / self.head_dim)) + freqs = freqs.to(pos_indices.device) + freqs = torch.einsum("..., f -> ... f", pos_indices.float(), freqs) + freqs = repeat(freqs, "... n -> ... (n r)", r=2) + return freqs + + def forward(self, x, pos_indices): + freqs_cis = self.precompute_freqs_cis_1d(pos_indices) + + x_ = x.float() + + freqs_cis = freqs_cis.float().to(x.device) + cos, sin = freqs_cis.cos(), freqs_cis.sin() + cos, sin = rearrange(cos, 'n d -> 1 1 n d'), rearrange(sin, 'n d -> 1 1 n d') + x_ = (x_ * cos) + (rotate_half(x_) * sin) + + return x_.type_as(x) + +class SingleStreamAttention(torch.nn.Module): + def __init__( + self, + dim: int, + encoder_hidden_states_dim: int, + num_heads: int, + qkv_bias: bool, + device=None, dtype=None, operations=None + ) -> None: + super().__init__() + self.dim = dim + self.encoder_hidden_states_dim = encoder_hidden_states_dim + self.num_heads = num_heads + self.head_dim = dim // num_heads + + self.q_linear = operations.Linear(dim, dim, bias=qkv_bias, device=device, dtype=dtype) + self.proj = operations.Linear(dim, dim, device=device, dtype=dtype) + self.kv_linear = operations.Linear(encoder_hidden_states_dim, dim * 2, bias=qkv_bias, device=device, dtype=dtype) + + def forward(self, x: torch.Tensor, encoder_hidden_states: torch.Tensor, shape=None) -> torch.Tensor: + N_t, N_h, N_w = shape + + expected_tokens = N_t * N_h * N_w + actual_tokens = x.shape[1] + x_extra = None + + if actual_tokens != expected_tokens: + x_extra = x[:, -N_h * N_w:, :] + x = x[:, :-N_h * N_w, :] + N_t = N_t - 1 + + B = x.shape[0] + S = N_h * N_w + x = x.view(B * N_t, S, self.dim) + + # get q for hidden_state + q = self.q_linear(x).view(B * N_t, S, self.num_heads, self.head_dim) + + # get kv from encoder_hidden_states # shape: (B, N, num_heads, head_dim) + kv = self.kv_linear(encoder_hidden_states) + encoder_k, encoder_v = kv.view(B * N_t, encoder_hidden_states.shape[1], 2, self.num_heads, self.head_dim).unbind(2) + + #print("q.shape", q.shape) #torch.Size([21, 1024, 40, 128]) + x = optimized_attention( + q.transpose(1, 2), + encoder_k.transpose(1, 2), + encoder_v.transpose(1, 2), + heads=self.num_heads, skip_reshape=True, skip_output_reshape=True).transpose(1, 2) + + # linear transform + x = self.proj(x.reshape(B * N_t, S, self.dim)) + x = x.view(B, N_t * S, self.dim) + + if x_extra is not None: + x = torch.cat([x, torch.zeros_like(x_extra)], dim=1) + + return x + +class SingleStreamMultiAttention(SingleStreamAttention): + def __init__( + self, + dim: int, + encoder_hidden_states_dim: int, + num_heads: int, + qkv_bias: bool, + class_range: int = 24, + class_interval: int = 4, + device=None, dtype=None, operations=None + ) -> None: + super().__init__( + dim=dim, + encoder_hidden_states_dim=encoder_hidden_states_dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + device=device, + dtype=dtype, + operations=operations + ) + + # Rotary-embedding layout parameters + self.class_interval = class_interval + self.class_range = class_range + self.max_humans = self.class_range // self.class_interval + + # Constant bucket used for background tokens + self.rope_bak = int(self.class_range // 2) + + self.rope_1d = RotaryPositionalEmbedding1D(self.head_dim) + + def forward( + self, + x: torch.Tensor, + encoder_hidden_states: torch.Tensor, + shape=None, + x_ref_attn_map=None + ) -> torch.Tensor: + encoder_hidden_states = encoder_hidden_states.squeeze(0).to(x.device) + human_num = x_ref_attn_map.shape[0] if x_ref_attn_map is not None else 1 + # Single-speaker fall-through + if human_num <= 1: + return super().forward(x, encoder_hidden_states, shape) + + N_t, N_h, N_w = shape + + x_extra = None + if x.shape[0] * N_t != encoder_hidden_states.shape[0]: + x_extra = x[:, -N_h * N_w:, :] + x = x[:, :-N_h * N_w, :] + N_t = N_t - 1 + x = rearrange(x, "B (N_t S) C -> (B N_t) S C", N_t=N_t) + + # Query projection + B, N, C = x.shape + q = self.q_linear(x) + q = q.view(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3) + + # Use `class_range` logic for 2 speakers + rope_h1 = (0, self.class_interval) + rope_h2 = (self.class_range - self.class_interval, self.class_range) + rope_bak = int(self.class_range // 2) + + # Normalize and scale attention maps for each speaker + max_values = x_ref_attn_map.max(1).values[:, None, None] + min_values = x_ref_attn_map.min(1).values[:, None, None] + max_min_values = torch.cat([max_values, min_values], dim=2) + + human1_max_value, human1_min_value = max_min_values[0, :, 0].max(), max_min_values[0, :, 1].min() + human2_max_value, human2_min_value = max_min_values[1, :, 0].max(), max_min_values[1, :, 1].min() + + human1 = normalize_and_scale(x_ref_attn_map[0], (human1_min_value, human1_max_value), rope_h1) + human2 = normalize_and_scale(x_ref_attn_map[1], (human2_min_value, human2_max_value), rope_h2) + back = torch.full((x_ref_attn_map.size(1),), rope_bak, dtype=human1.dtype, device=human1.device) + + # Token-wise speaker dominance + max_indices = x_ref_attn_map.argmax(dim=0) + normalized_map = torch.stack([human1, human2, back], dim=1) + normalized_pos = normalized_map[torch.arange(x_ref_attn_map.size(1)), max_indices] + + # Apply rotary to Q + q = rearrange(q, "(B N_t) H S C -> B H (N_t S) C", N_t=N_t) + q = self.rope_1d(q, normalized_pos) + q = rearrange(q, "B H (N_t S) C -> (B N_t) H S C", N_t=N_t) + + # Keys / Values + _, N_a, _ = encoder_hidden_states.shape + encoder_kv = self.kv_linear(encoder_hidden_states) + encoder_kv = encoder_kv.view(B, N_a, 2, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4) + encoder_k, encoder_v = encoder_kv.unbind(0) + + # Rotary for keys – assign centre of each speaker bucket to its context tokens + per_frame = torch.zeros(N_a, dtype=encoder_k.dtype, device=encoder_k.device) + per_frame[: per_frame.size(0) // 2] = (rope_h1[0] + rope_h1[1]) / 2 + per_frame[per_frame.size(0) // 2 :] = (rope_h2[0] + rope_h2[1]) / 2 + encoder_pos = torch.cat([per_frame] * N_t, dim=0) + + encoder_k = rearrange(encoder_k, "(B N_t) H S C -> B H (N_t S) C", N_t=N_t) + encoder_k = self.rope_1d(encoder_k, encoder_pos) + encoder_k = rearrange(encoder_k, "B H (N_t S) C -> (B N_t) H S C", N_t=N_t) + + # Final attention + q = rearrange(q, "B H M K -> B M H K") + encoder_k = rearrange(encoder_k, "B H M K -> B M H K") + encoder_v = rearrange(encoder_v, "B H M K -> B M H K") + + x = optimized_attention( + q.transpose(1, 2), + encoder_k.transpose(1, 2), + encoder_v.transpose(1, 2), + heads=self.num_heads, skip_reshape=True, skip_output_reshape=True).transpose(1, 2) + + # Linear projection + x = x.reshape(B, N, C) + x = self.proj(x) + + # Restore original layout + x = rearrange(x, "(B N_t) S C -> B (N_t S) C", N_t=N_t) + if x_extra is not None: + x = torch.cat([x, torch.zeros_like(x_extra)], dim=1) + + return x + + +class MultiTalkAudioProjModel(torch.nn.Module): + def __init__( + self, + seq_len: int = 5, + seq_len_vf: int = 12, + blocks: int = 12, + channels: int = 768, + intermediate_dim: int = 512, + out_dim: int = 768, + context_tokens: int = 32, + device=None, dtype=None, operations=None + ): + super().__init__() + + self.seq_len = seq_len + self.blocks = blocks + self.channels = channels + self.input_dim = seq_len * blocks * channels + self.input_dim_vf = seq_len_vf * blocks * channels + self.intermediate_dim = intermediate_dim + self.context_tokens = context_tokens + self.out_dim = out_dim + + # define multiple linear layers + self.proj1 = operations.Linear(self.input_dim, intermediate_dim, device=device, dtype=dtype) + self.proj1_vf = operations.Linear(self.input_dim_vf, intermediate_dim, device=device, dtype=dtype) + self.proj2 = operations.Linear(intermediate_dim, intermediate_dim, device=device, dtype=dtype) + self.proj3 = operations.Linear(intermediate_dim, context_tokens * out_dim, device=device, dtype=dtype) + self.norm = operations.LayerNorm(out_dim, device=device, dtype=dtype) + + def forward(self, audio_embeds, audio_embeds_vf): + video_length = audio_embeds.shape[1] + audio_embeds_vf.shape[1] + B, _, _, S, C = audio_embeds.shape + + # process audio of first frame + audio_embeds = rearrange(audio_embeds, "bz f w b c -> (bz f) w b c") + batch_size, window_size, blocks, channels = audio_embeds.shape + audio_embeds = audio_embeds.view(batch_size, window_size * blocks * channels) + + # process audio of latter frame + audio_embeds_vf = rearrange(audio_embeds_vf, "bz f w b c -> (bz f) w b c") + batch_size_vf, window_size_vf, blocks_vf, channels_vf = audio_embeds_vf.shape + audio_embeds_vf = audio_embeds_vf.view(batch_size_vf, window_size_vf * blocks_vf * channels_vf) + + # first projection + audio_embeds = torch.relu(self.proj1(audio_embeds)) + audio_embeds_vf = torch.relu(self.proj1_vf(audio_embeds_vf)) + audio_embeds = rearrange(audio_embeds, "(bz f) c -> bz f c", bz=B) + audio_embeds_vf = rearrange(audio_embeds_vf, "(bz f) c -> bz f c", bz=B) + audio_embeds_c = torch.concat([audio_embeds, audio_embeds_vf], dim=1) + batch_size_c, N_t, C_a = audio_embeds_c.shape + audio_embeds_c = audio_embeds_c.view(batch_size_c*N_t, C_a) + + # second projection + audio_embeds_c = torch.relu(self.proj2(audio_embeds_c)) + + context_tokens = self.proj3(audio_embeds_c).reshape(batch_size_c*N_t, self.context_tokens, self.out_dim) + + # normalization and reshape + context_tokens = self.norm(context_tokens) + context_tokens = rearrange(context_tokens, "(bz f) m c -> bz f m c", f=video_length) + + return context_tokens + + +class WanMultiTalkAttentionBlock(torch.nn.Module): + def __init__(self, in_dim=5120, out_dim=768, device=None, dtype=None, operations=None): + super().__init__() + self.audio_cross_attn = SingleStreamMultiAttention(in_dim, out_dim, num_heads=40, qkv_bias=True, device=device, dtype=dtype, operations=operations) + self.norm_x = operations.LayerNorm(in_dim, device=device, dtype=dtype, elementwise_affine=True) + + +class MultiTalkGetAttnMapPatch: + def __init__(self, ref_target_masks=None): + self.ref_target_masks = ref_target_masks + + def __call__(self, kwargs): + transformer_options = kwargs.get("transformer_options", {}) + x = kwargs["x"] + + if self.ref_target_masks is not None: + x_ref_attn_map = get_attn_map_with_target(kwargs["q"], kwargs["k"], transformer_options["grid_sizes"], ref_target_masks=self.ref_target_masks.to(x.device)) + transformer_options["x_ref_attn_map"] = x_ref_attn_map + return x + + +class MultiTalkCrossAttnPatch: + def __init__(self, model_patch, audio_scale=1.0, ref_target_masks=None): + self.model_patch = model_patch + self.audio_scale = audio_scale + self.ref_target_masks = ref_target_masks + + def __call__(self, kwargs): + transformer_options = kwargs.get("transformer_options", {}) + block_idx = transformer_options.get("block_index", None) + x = kwargs["x"] + if block_idx is None: + return torch.zeros_like(x) + + audio_embeds = transformer_options.get("audio_embeds") + x_ref_attn_map = transformer_options.pop("x_ref_attn_map", None) + + norm_x = self.model_patch.model.blocks[block_idx].norm_x(x) + x_audio = self.model_patch.model.blocks[block_idx].audio_cross_attn( + norm_x, audio_embeds.to(x.dtype), + shape=transformer_options["grid_sizes"], + x_ref_attn_map=x_ref_attn_map + ) + x = x + x_audio * self.audio_scale + return x + + def models(self): + return [self.model_patch] + +class MultiTalkApplyModelWrapper: + def __init__(self, init_latents): + self.init_latents = init_latents + + def __call__(self, executor, x, *args, **kwargs): + x[:, :, :self.init_latents.shape[2]] = self.init_latents.to(x) + samples = executor(x, *args, **kwargs) + return samples + + +class InfiniteTalkOuterSampleWrapper: + def __init__(self, motion_frames_latent, model_patch, is_extend=False): + self.motion_frames_latent = motion_frames_latent + self.model_patch = model_patch + self.is_extend = is_extend + + def __call__(self, executor, *args, **kwargs): + model_patcher = executor.class_obj.model_patcher + model_options = executor.class_obj.model_options + process_latent_in = model_patcher.model.process_latent_in + + # for InfiniteTalk, model input first latent(s) need to always be replaced on every step + if self.motion_frames_latent is not None: + wrappers = model_options["transformer_options"]["wrappers"] + w = wrappers.setdefault(comfy.patcher_extension.WrappersMP.APPLY_MODEL, {}) + w["MultiTalk_apply_model"] = [MultiTalkApplyModelWrapper(process_latent_in(self.motion_frames_latent))] + + # run the sampling process + result = executor(*args, **kwargs) + + # insert motion frames before decoding + if self.is_extend: + overlap = self.motion_frames_latent.shape[2] + result = torch.cat([self.motion_frames_latent.to(result), result[:, :, overlap:]], dim=2) + + return result + + def to(self, device_or_dtype): + if isinstance(device_or_dtype, torch.device): + if self.motion_frames_latent is not None: + self.motion_frames_latent = self.motion_frames_latent.to(device_or_dtype) + return self diff --git a/comfy/ldm/wan/vae.py b/comfy/ldm/wan/vae.py index 08315f1a8..fd125ceed 100644 --- a/comfy/ldm/wan/vae.py +++ b/comfy/ldm/wan/vae.py @@ -5,7 +5,7 @@ import torch import torch.nn as nn import torch.nn.functional as F from einops import rearrange -from comfy.ldm.modules.diffusionmodules.model import vae_attention +from comfy.ldm.modules.diffusionmodules.model import vae_attention, torch_cat_if_needed import comfy.ops ops = comfy.ops.disable_weight_init @@ -20,22 +20,29 @@ class CausalConv3d(ops.Conv3d): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self._padding = (self.padding[2], self.padding[2], self.padding[1], - self.padding[1], 2 * self.padding[0], 0) - self.padding = (0, 0, 0) + self._padding = 2 * self.padding[0] + self.padding = (0, self.padding[1], self.padding[2]) def forward(self, x, cache_x=None, cache_list=None, cache_idx=None): if cache_list is not None: cache_x = cache_list[cache_idx] cache_list[cache_idx] = None - padding = list(self._padding) - if cache_x is not None and self._padding[4] > 0: - cache_x = cache_x.to(x.device) - x = torch.cat([cache_x, x], dim=2) - padding[4] -= cache_x.shape[2] + if cache_x is None and x.shape[2] == 1: + #Fast path - the op will pad for use by truncating the weight + #and save math on a pile of zeros. + return super().forward(x, autopad="causal_zero") + + if self._padding > 0: + padding_needed = self._padding + if cache_x is not None: + cache_x = cache_x.to(x.device) + padding_needed = max(0, padding_needed - cache_x.shape[2]) + padding_shape = list(x.shape) + padding_shape[2] = padding_needed + padding = torch.zeros(padding_shape, device=x.device, dtype=x.dtype) + x = torch_cat_if_needed([padding, cache_x, x], dim=2) del cache_x - x = F.pad(x, padding) return super().forward(x) @@ -472,10 +479,12 @@ class WanVAE(nn.Module): def encode(self, x): conv_idx = [0] - feat_map = [None] * count_conv3d(self.decoder) ## cache t = x.shape[2] iter_ = 1 + (t - 1) // 4 + feat_map = None + if iter_ > 1: + feat_map = [None] * count_conv3d(self.decoder) ## 对encode输入的x,按时间拆分为1、4、4、4.... for i in range(iter_): conv_idx = [0] @@ -495,10 +504,11 @@ class WanVAE(nn.Module): def decode(self, z): conv_idx = [0] - feat_map = [None] * count_conv3d(self.decoder) # z: [b,c,t,h,w] - iter_ = z.shape[2] + feat_map = None + if iter_ > 1: + feat_map = [None] * count_conv3d(self.decoder) x = self.conv2(z) for i in range(iter_): conv_idx = [0] diff --git a/comfy/lora.py b/comfy/lora.py index e8246bd66..7b31d055c 100644 --- a/comfy/lora.py +++ b/comfy/lora.py @@ -260,6 +260,7 @@ def model_lora_keys_unet(model, key_map={}): key_map["transformer.{}".format(k[:-len(".weight")])] = to #simpletrainer and probably regular diffusers flux lora format key_map["lycoris_{}".format(k[:-len(".weight")].replace(".", "_"))] = to #simpletrainer lycoris key_map["lora_transformer_{}".format(k[:-len(".weight")].replace(".", "_"))] = to #onetrainer + key_map[k[:-len(".weight")]] = to #DiffSynth lora format for k in sdk: hidden_size = model.model_config.unet_config.get("hidden_size", 0) if k.endswith(".weight") and ".linear1." in k: diff --git a/comfy/model_base.py b/comfy/model_base.py index 1d57562cc..66e52864d 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1578,6 +1578,9 @@ class QwenImage(BaseModel): def extra_conds(self, **kwargs): out = super().extra_conds(**kwargs) + attention_mask = kwargs.get("attention_mask", None) + if attention_mask is not None: + out['attention_mask'] = comfy.conds.CONDRegular(attention_mask) cross_attn = kwargs.get("cross_attn", None) if cross_attn is not None: out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index b29a033cc..8cea16e50 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -444,6 +444,10 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["ffn_dim_multiplier"] = (8.0 / 3.0) dit_config["z_image_modulation"] = True dit_config["time_scale"] = 1000.0 + try: + dit_config["allow_fp16"] = torch.std(state_dict['{}layers.{}.ffn_norm1.weight'.format(key_prefix, dit_config["n_layers"] - 2)], unbiased=False).item() < 0.42 + except Exception: + pass if '{}cap_pad_token'.format(key_prefix) in state_dict_keys: dit_config["pad_tokens_multiple"] = 32 sig_weight = state_dict.get('{}siglip_embedder.0.weight'.format(key_prefix), None) diff --git a/comfy/ops.py b/comfy/ops.py index 415c39e92..e406ba7ed 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -203,7 +203,9 @@ class disable_weight_init: def reset_parameters(self): return None - def _conv_forward(self, input, weight, bias, *args, **kwargs): + def _conv_forward(self, input, weight, bias, autopad=None, *args, **kwargs): + if autopad == "causal_zero": + weight = weight[:, :, -input.shape[2]:, :, :] if NVIDIA_MEMORY_CONV_BUG_WORKAROUND and weight.dtype in (torch.float16, torch.bfloat16): out = torch.cudnn_convolution(input, weight, self.padding, self.stride, self.dilation, self.groups, benchmark=False, deterministic=False, allow_tf32=True) if bias is not None: @@ -212,15 +214,15 @@ class disable_weight_init: else: return super()._conv_forward(input, weight, bias, *args, **kwargs) - def forward_comfy_cast_weights(self, input): + def forward_comfy_cast_weights(self, input, autopad=None): weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) - x = self._conv_forward(input, weight, bias) + x = self._conv_forward(input, weight, bias, autopad=autopad) uncast_bias_weight(self, weight, bias, offload_stream) return x def forward(self, *args, **kwargs): run_every_op() - if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0: + if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0 or "autopad" in kwargs: return self.forward_comfy_cast_weights(*args, **kwargs) else: return super().forward(*args, **kwargs) diff --git a/comfy/sample.py b/comfy/sample.py index 2f8f3a51c..a2a39b527 100644 --- a/comfy/sample.py +++ b/comfy/sample.py @@ -37,12 +37,18 @@ def prepare_noise(latent_image, seed, noise_inds=None): return noises -def fix_empty_latent_channels(model, latent_image): +def fix_empty_latent_channels(model, latent_image, downscale_ratio_spacial=None): if latent_image.is_nested: return latent_image latent_format = model.get_model_object("latent_format") #Resize the empty latent image so it has the right number of channels - if latent_format.latent_channels != latent_image.shape[1] and torch.count_nonzero(latent_image) == 0: - latent_image = comfy.utils.repeat_to_batch_size(latent_image, latent_format.latent_channels, dim=1) + if torch.count_nonzero(latent_image) == 0: + if latent_format.latent_channels != latent_image.shape[1]: + latent_image = comfy.utils.repeat_to_batch_size(latent_image, latent_format.latent_channels, dim=1) + if downscale_ratio_spacial is not None: + if downscale_ratio_spacial != latent_format.spacial_downscale_ratio: + ratio = downscale_ratio_spacial / latent_format.spacial_downscale_ratio + latent_image = comfy.utils.common_upscale(latent_image, round(latent_image.shape[-1] * ratio), round(latent_image.shape[-2] * ratio), "nearest-exact", crop="disabled") + if latent_format.latent_dimensions == 3 and latent_image.ndim == 4: latent_image = latent_image.unsqueeze(2) return latent_image diff --git a/comfy/sd.py b/comfy/sd.py index f7f6a44a0..f627f7d55 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -20,6 +20,7 @@ import comfy.ldm.ace.vae.music_dcae_pipeline import comfy.ldm.hunyuan_video.vae import comfy.ldm.mmaudio.vae.autoencoder import comfy.pixel_space_convert +import comfy.weight_adapter import yaml import math import os @@ -101,6 +102,105 @@ def load_lora_for_models(model, clip, lora, strength_model, strength_clip): return (new_modelpatcher, new_clip) +def load_bypass_lora_for_models(model, clip, lora, strength_model, strength_clip): + """ + Load LoRA in bypass mode without modifying base model weights. + + Instead of patching weights, this injects the LoRA computation into the + forward pass: output = base_forward(x) + lora_path(x) + + Non-adapter patches (bias diff, weight diff, etc.) are applied as regular patches. + + This is useful for training and when model weights are offloaded. + """ + key_map = {} + if model is not None: + key_map = comfy.lora.model_lora_keys_unet(model.model, key_map) + if clip is not None: + key_map = comfy.lora.model_lora_keys_clip(clip.cond_stage_model, key_map) + + logging.debug(f"[BypassLoRA] key_map has {len(key_map)} entries") + + lora = comfy.lora_convert.convert_lora(lora) + loaded = comfy.lora.load_lora(lora, key_map) + + logging.debug(f"[BypassLoRA] loaded has {len(loaded)} entries") + + # Separate adapters (for bypass) from other patches (for regular patching) + bypass_patches = {} # WeightAdapterBase instances -> bypass mode + regular_patches = {} # diff, set, bias patches -> regular weight patching + + for key, patch_data in loaded.items(): + if isinstance(patch_data, comfy.weight_adapter.WeightAdapterBase): + bypass_patches[key] = patch_data + else: + regular_patches[key] = patch_data + + logging.debug(f"[BypassLoRA] {len(bypass_patches)} bypass adapters, {len(regular_patches)} regular patches") + + k = set() + k1 = set() + + if model is not None: + new_modelpatcher = model.clone() + + # Apply regular patches (bias diff, weight diff, etc.) via normal patching + if regular_patches: + patched_keys = new_modelpatcher.add_patches(regular_patches, strength_model) + k.update(patched_keys) + + # Apply adapter patches via bypass injection + manager = comfy.weight_adapter.BypassInjectionManager() + model_sd_keys = set(new_modelpatcher.model.state_dict().keys()) + + for key, adapter in bypass_patches.items(): + if key in model_sd_keys: + manager.add_adapter(key, adapter, strength=strength_model) + k.add(key) + else: + logging.warning(f"[BypassLoRA] Adapter key not in model state_dict: {key}") + + injections = manager.create_injections(new_modelpatcher.model) + + if manager.get_hook_count() > 0: + new_modelpatcher.set_injections("bypass_lora", injections) + else: + new_modelpatcher = None + + if clip is not None: + new_clip = clip.clone() + + # Apply regular patches to clip + if regular_patches: + patched_keys = new_clip.add_patches(regular_patches, strength_clip) + k1.update(patched_keys) + + # Apply adapter patches via bypass injection + clip_manager = comfy.weight_adapter.BypassInjectionManager() + clip_sd_keys = set(new_clip.cond_stage_model.state_dict().keys()) + + for key, adapter in bypass_patches.items(): + if key in clip_sd_keys: + clip_manager.add_adapter(key, adapter, strength=strength_clip) + k1.add(key) + + clip_injections = clip_manager.create_injections(new_clip.cond_stage_model) + if clip_manager.get_hook_count() > 0: + new_clip.patcher.set_injections("bypass_lora", clip_injections) + else: + new_clip = None + + for x in loaded: + if (x not in k) and (x not in k1): + patch_data = loaded[x] + patch_type = type(patch_data).__name__ + if isinstance(patch_data, tuple): + patch_type = f"tuple({patch_data[0]})" + logging.warning(f"NOT LOADED: {x} (type={patch_type})") + + return (new_modelpatcher, new_clip) + + class CLIP: def __init__(self, target=None, embedding_directory=None, no_init=False, tokenizer_data={}, parameters=0, state_dict=[], model_options={}): if no_init: @@ -636,14 +736,13 @@ class VAE: self.upscale_index_formula = (4, 16, 16) self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 16, 16) self.downscale_index_formula = (4, 16, 16) - if self.latent_channels == 48: # Wan 2.2 + if self.latent_channels in [48, 128]: # Wan 2.2 and LTX2 self.first_stage_model = comfy.taesd.taehv.TAEHV(latent_channels=self.latent_channels, latent_format=None) # taehv doesn't need scaling - self.process_input = lambda image: (_ for _ in ()).throw(NotImplementedError("This light tae doesn't support encoding currently")) + self.process_input = self.process_output = lambda image: image self.process_output = lambda image: image self.memory_used_decode = lambda shape, dtype: (1800 * (max(1, (shape[-3] ** 0.7 * 0.1)) * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype)) elif self.latent_channels == 32 and sd["decoder.22.bias"].shape[0] == 12: # lighttae_hv15 self.first_stage_model = comfy.taesd.taehv.TAEHV(latent_channels=self.latent_channels, latent_format=comfy.latent_formats.HunyuanVideo15) - self.process_input = lambda image: (_ for _ in ()).throw(NotImplementedError("This light tae doesn't support encoding currently")) self.memory_used_decode = lambda shape, dtype: (1200 * (max(1, (shape[-3] ** 0.7 * 0.05)) * shape[-2] * shape[-1] * 32 * 32) * model_management.dtype_size(dtype)) else: if sd["decoder.1.weight"].dtype == torch.float16: # taehv currently only available in float16, so assume it's not lighttaew2_1 as otherwise state dicts are identical diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py index c512ca5d0..d4f22120b 100644 --- a/comfy/sd1_clip.py +++ b/comfy/sd1_clip.py @@ -466,7 +466,7 @@ def load_embed(embedding_name, embedding_directory, embedding_size, embed_key=No return embed_out class SDTokenizer: - def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=CLIPTokenizer, has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, pad_left=False, disable_weights=False, tokenizer_data={}, tokenizer_args={}): + def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=CLIPTokenizer, has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, start_token=None, min_padding=None, pad_left=False, disable_weights=False, tokenizer_data={}, tokenizer_args={}): if tokenizer_path is None: tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "sd1_tokenizer") self.tokenizer = tokenizer_class.from_pretrained(tokenizer_path, **tokenizer_args) @@ -479,8 +479,15 @@ class SDTokenizer: empty = self.tokenizer('')["input_ids"] self.tokenizer_adds_end_token = has_end_token if has_start_token: - self.tokens_start = 1 - self.start_token = empty[0] + if len(empty) > 0: + self.tokens_start = 1 + self.start_token = empty[0] + else: + self.tokens_start = 0 + self.start_token = start_token + if start_token is None: + logging.warning("WARNING: There's something wrong with your tokenizers.'") + if end_token is not None: self.end_token = end_token else: @@ -488,7 +495,7 @@ class SDTokenizer: self.end_token = empty[1] else: self.tokens_start = 0 - self.start_token = None + self.start_token = start_token if end_token is not None: self.end_token = end_token else: diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 70abebf46..d25271d6e 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -771,10 +771,24 @@ class Flux2(Flux): return out def clip_target(self, state_dict={}): - return None # TODO pref = self.text_encoder_key_prefix[0] - t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref)) - return supported_models_base.ClipTarget(comfy.text_encoders.flux.FluxTokenizer, comfy.text_encoders.flux.flux_clip(**t5_detect)) + detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen3_4b.transformer.".format(pref)) + if len(detect) > 0: + detect["model_type"] = "qwen3_4b" + return supported_models_base.ClipTarget(comfy.text_encoders.flux.KleinTokenizer, comfy.text_encoders.flux.klein_te(**detect)) + + detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen3_8b.transformer.".format(pref)) + if len(detect) > 0: + detect["model_type"] = "qwen3_8b" + return supported_models_base.ClipTarget(comfy.text_encoders.flux.KleinTokenizer8B, comfy.text_encoders.flux.klein_te(**detect)) + + detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}mistral3_24b.transformer.".format(pref)) + if len(detect) > 0: + if "{}mistral3_24b.transformer.model.layers.39.post_attention_layernorm.weight".format(pref) not in state_dict: + detect["pruned"] = True + return supported_models_base.ClipTarget(comfy.text_encoders.flux.Flux2Tokenizer, comfy.text_encoders.flux.flux2_te(**detect)) + + return None class GenmoMochi(supported_models_base.BASE): unet_config = { @@ -1079,7 +1093,7 @@ class ZImage(Lumina2): def __init__(self, unet_config): super().__init__(unet_config) - if comfy.model_management.extended_fp16_support(): + if comfy.model_management.extended_fp16_support() and unet_config.get("allow_fp16", False): self.supported_inference_dtypes = self.supported_inference_dtypes.copy() self.supported_inference_dtypes.insert(1, torch.float16) diff --git a/comfy/taesd/taehv.py b/comfy/taesd/taehv.py index 0e5f9a378..6c06ce19d 100644 --- a/comfy/taesd/taehv.py +++ b/comfy/taesd/taehv.py @@ -112,7 +112,8 @@ def apply_model_with_memblocks(model, x, parallel, show_progress_bar): class TAEHV(nn.Module): - def __init__(self, latent_channels, parallel=False, decoder_time_upscale=(True, True), decoder_space_upscale=(True, True, True), latent_format=None, show_progress_bar=True): + def __init__(self, latent_channels, parallel=False, encoder_time_downscale=(True, True, False), decoder_time_upscale=(False, True, True), decoder_space_upscale=(True, True, True), + latent_format=None, show_progress_bar=False): super().__init__() self.image_channels = 3 self.patch_size = 1 @@ -124,6 +125,9 @@ class TAEHV(nn.Module): self.process_out = latent_format().process_out if latent_format is not None else (lambda x: x) if self.latent_channels in [48, 32]: # Wan 2.2 and HunyuanVideo1.5 self.patch_size = 2 + elif self.latent_channels == 128: # LTX2 + self.patch_size, self.latent_channels, encoder_time_downscale, decoder_time_upscale = 4, 128, (True, True, True), (True, True, True) + if self.latent_channels == 32: # HunyuanVideo1.5 act_func = nn.LeakyReLU(0.2, inplace=True) else: # HunyuanVideo, Wan 2.1 @@ -131,41 +135,52 @@ class TAEHV(nn.Module): self.encoder = nn.Sequential( conv(self.image_channels*self.patch_size**2, 64), act_func, - TPool(64, 2), conv(64, 64, stride=2, bias=False), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), - TPool(64, 2), conv(64, 64, stride=2, bias=False), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), - TPool(64, 1), conv(64, 64, stride=2, bias=False), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), + TPool(64, 2 if encoder_time_downscale[0] else 1), conv(64, 64, stride=2, bias=False), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), + TPool(64, 2 if encoder_time_downscale[1] else 1), conv(64, 64, stride=2, bias=False), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), + TPool(64, 2 if encoder_time_downscale[2] else 1), conv(64, 64, stride=2, bias=False), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), conv(64, self.latent_channels), ) n_f = [256, 128, 64, 64] - self.frames_to_trim = 2**sum(decoder_time_upscale) - 1 + self.decoder = nn.Sequential( Clamp(), conv(self.latent_channels, n_f[0]), act_func, - MemBlock(n_f[0], n_f[0], act_func), MemBlock(n_f[0], n_f[0], act_func), MemBlock(n_f[0], n_f[0], act_func), nn.Upsample(scale_factor=2 if decoder_space_upscale[0] else 1), TGrow(n_f[0], 1), conv(n_f[0], n_f[1], bias=False), - MemBlock(n_f[1], n_f[1], act_func), MemBlock(n_f[1], n_f[1], act_func), MemBlock(n_f[1], n_f[1], act_func), nn.Upsample(scale_factor=2 if decoder_space_upscale[1] else 1), TGrow(n_f[1], 2 if decoder_time_upscale[0] else 1), conv(n_f[1], n_f[2], bias=False), - MemBlock(n_f[2], n_f[2], act_func), MemBlock(n_f[2], n_f[2], act_func), MemBlock(n_f[2], n_f[2], act_func), nn.Upsample(scale_factor=2 if decoder_space_upscale[2] else 1), TGrow(n_f[2], 2 if decoder_time_upscale[1] else 1), conv(n_f[2], n_f[3], bias=False), + MemBlock(n_f[0], n_f[0], act_func), MemBlock(n_f[0], n_f[0], act_func), MemBlock(n_f[0], n_f[0], act_func), nn.Upsample(scale_factor=2 if decoder_space_upscale[0] else 1), TGrow(n_f[0], 2 if decoder_time_upscale[0] else 1), conv(n_f[0], n_f[1], bias=False), + MemBlock(n_f[1], n_f[1], act_func), MemBlock(n_f[1], n_f[1], act_func), MemBlock(n_f[1], n_f[1], act_func), nn.Upsample(scale_factor=2 if decoder_space_upscale[1] else 1), TGrow(n_f[1], 2 if decoder_time_upscale[1] else 1), conv(n_f[1], n_f[2], bias=False), + MemBlock(n_f[2], n_f[2], act_func), MemBlock(n_f[2], n_f[2], act_func), MemBlock(n_f[2], n_f[2], act_func), nn.Upsample(scale_factor=2 if decoder_space_upscale[2] else 1), TGrow(n_f[2], 2 if decoder_time_upscale[2] else 1), conv(n_f[2], n_f[3], bias=False), act_func, conv(n_f[3], self.image_channels*self.patch_size**2), ) - @property - def show_progress_bar(self): - return self._show_progress_bar - @show_progress_bar.setter - def show_progress_bar(self, value): - self._show_progress_bar = value + self.t_downscale = 2**sum(t.stride == 2 for t in self.encoder if isinstance(t, TPool)) + self.t_upscale = 2**sum(t.stride == 2 for t in self.decoder if isinstance(t, TGrow)) + self.frames_to_trim = self.t_upscale - 1 + self._show_progress_bar = show_progress_bar + + @property + def show_progress_bar(self): + return self._show_progress_bar + + @show_progress_bar.setter + def show_progress_bar(self, value): + self._show_progress_bar = value def encode(self, x, **kwargs): - if self.patch_size > 1: - x = F.pixel_unshuffle(x, self.patch_size) x = x.movedim(2, 1) # [B, C, T, H, W] -> [B, T, C, H, W] - if x.shape[1] % 4 != 0: - # pad at end to multiple of 4 - n_pad = 4 - x.shape[1] % 4 + if self.patch_size > 1: + B, T, C, H, W = x.shape + x = x.reshape(B * T, C, H, W) + x = F.pixel_unshuffle(x, self.patch_size) + x = x.reshape(B, T, C * self.patch_size ** 2, H // self.patch_size, W // self.patch_size) + if x.shape[1] % self.t_downscale != 0: + # pad at end to multiple of t_downscale + n_pad = self.t_downscale - x.shape[1] % self.t_downscale padding = x[:, -1:].repeat_interleave(n_pad, dim=1) x = torch.cat([x, padding], 1) x = apply_model_with_memblocks(self.encoder, x, self.parallel, self.show_progress_bar).movedim(2, 1) return self.process_out(x) def decode(self, x, **kwargs): + x = x.unsqueeze(0) if x.ndim == 4 else x # [T, C, H, W] -> [1, T, C, H, W] + x = x.movedim(1, 2) if x.shape[1] != self.latent_channels else x # [B, T, C, H, W] or [B, C, T, H, W] x = self.process_in(x).movedim(2, 1) # [B, C, T, H, W] -> [B, T, C, H, W] x = apply_model_with_memblocks(self.decoder, x, self.parallel, self.show_progress_bar) if self.patch_size > 1: diff --git a/comfy/text_encoders/flux.py b/comfy/text_encoders/flux.py index 4075afca4..f67a5f805 100644 --- a/comfy/text_encoders/flux.py +++ b/comfy/text_encoders/flux.py @@ -118,7 +118,7 @@ class MistralTokenizerClass: class Mistral3Tokenizer(sd1_clip.SDTokenizer): def __init__(self, embedding_directory=None, tokenizer_data={}): self.tekken_data = tokenizer_data.get("tekken_model", None) - super().__init__("", pad_with_end=False, embedding_size=5120, embedding_key='mistral3_24b', tokenizer_class=MistralTokenizerClass, has_end_token=False, pad_to_max_length=False, pad_token=11, max_length=99999999, min_length=1, pad_left=True, tokenizer_args=load_mistral_tokenizer(self.tekken_data), tokenizer_data=tokenizer_data) + super().__init__("", pad_with_end=False, embedding_size=5120, embedding_key='mistral3_24b', tokenizer_class=MistralTokenizerClass, has_end_token=False, pad_to_max_length=False, pad_token=11, start_token=1, max_length=99999999, min_length=1, pad_left=True, tokenizer_args=load_mistral_tokenizer(self.tekken_data), tokenizer_data=tokenizer_data) def state_dict(self): return {"tekken_model": self.tekken_data} diff --git a/comfy/text_encoders/hunyuan_video.py b/comfy/text_encoders/hunyuan_video.py index a9a6c525e..2ddb4da60 100644 --- a/comfy/text_encoders/hunyuan_video.py +++ b/comfy/text_encoders/hunyuan_video.py @@ -10,9 +10,11 @@ import comfy.utils def llama_detect(state_dict, prefix=""): out = {} - t5_key = "{}model.norm.weight".format(prefix) - if t5_key in state_dict: - out["dtype_llama"] = state_dict[t5_key].dtype + norm_keys = ["{}model.norm.weight".format(prefix), "{}model.layers.0.input_layernorm.weight".format(prefix)] + for norm_key in norm_keys: + if norm_key in state_dict: + out["dtype_llama"] = state_dict[norm_key].dtype + break quant = comfy.utils.detect_layer_quantization(state_dict, prefix) if quant is not None: diff --git a/comfy/text_encoders/llama.py b/comfy/text_encoders/llama.py index 3080a3e09..68ac1e804 100644 --- a/comfy/text_encoders/llama.py +++ b/comfy/text_encoders/llama.py @@ -1,7 +1,7 @@ import torch import torch.nn as nn from dataclasses import dataclass -from typing import Optional, Any +from typing import Optional, Any, Tuple import math from comfy.ldm.modules.attention import optimized_attention_for_device @@ -32,6 +32,7 @@ class Llama2Config: k_norm = None rope_scale = None final_norm: bool = True + lm_head: bool = False @dataclass class Mistral3Small24BConfig: @@ -54,6 +55,7 @@ class Mistral3Small24BConfig: k_norm = None rope_scale = None final_norm: bool = True + lm_head: bool = False @dataclass class Qwen25_3BConfig: @@ -76,6 +78,7 @@ class Qwen25_3BConfig: k_norm = None rope_scale = None final_norm: bool = True + lm_head: bool = False @dataclass class Qwen3_06BConfig: @@ -98,6 +101,7 @@ class Qwen3_06BConfig: k_norm = "gemma3" rope_scale = None final_norm: bool = True + lm_head: bool = False @dataclass class Qwen3_4BConfig: @@ -120,6 +124,7 @@ class Qwen3_4BConfig: k_norm = "gemma3" rope_scale = None final_norm: bool = True + lm_head: bool = False @dataclass class Qwen3_8BConfig: @@ -142,6 +147,7 @@ class Qwen3_8BConfig: k_norm = "gemma3" rope_scale = None final_norm: bool = True + lm_head: bool = False @dataclass class Ovis25_2BConfig: @@ -164,6 +170,7 @@ class Ovis25_2BConfig: k_norm = "gemma3" rope_scale = None final_norm: bool = True + lm_head: bool = False @dataclass class Qwen25_7BVLI_Config: @@ -186,6 +193,7 @@ class Qwen25_7BVLI_Config: k_norm = None rope_scale = None final_norm: bool = True + lm_head: bool = False @dataclass class Gemma2_2B_Config: @@ -209,6 +217,7 @@ class Gemma2_2B_Config: sliding_attention = None rope_scale = None final_norm: bool = True + lm_head: bool = False @dataclass class Gemma3_4B_Config: @@ -232,6 +241,7 @@ class Gemma3_4B_Config: sliding_attention = [1024, 1024, 1024, 1024, 1024, False] rope_scale = [8.0, 1.0] final_norm: bool = True + lm_head: bool = False @dataclass class Gemma3_12B_Config: @@ -255,6 +265,7 @@ class Gemma3_12B_Config: sliding_attention = [1024, 1024, 1024, 1024, 1024, False] rope_scale = [8.0, 1.0] final_norm: bool = True + lm_head: bool = False vision_config = {"num_channels": 3, "hidden_act": "gelu_pytorch_tanh", "hidden_size": 1152, "image_size": 896, "intermediate_size": 4304, "model_type": "siglip_vision_model", "num_attention_heads": 16, "num_hidden_layers": 27, "patch_size": 14} mm_tokens_per_image = 256 @@ -356,6 +367,7 @@ class Attention(nn.Module): attention_mask: Optional[torch.Tensor] = None, freqs_cis: Optional[torch.Tensor] = None, optimized_attention=None, + past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, ): batch_size, seq_length, _ = hidden_states.shape xq = self.q_proj(hidden_states) @@ -373,11 +385,30 @@ class Attention(nn.Module): xq, xk = apply_rope(xq, xk, freqs_cis=freqs_cis) + present_key_value = None + if past_key_value is not None: + index = 0 + num_tokens = xk.shape[2] + if len(past_key_value) > 0: + past_key, past_value, index = past_key_value + if past_key.shape[2] >= (index + num_tokens): + past_key[:, :, index:index + xk.shape[2]] = xk + past_value[:, :, index:index + xv.shape[2]] = xv + xk = past_key[:, :, :index + xk.shape[2]] + xv = past_value[:, :, :index + xv.shape[2]] + present_key_value = (past_key, past_value, index + num_tokens) + else: + xk = torch.cat((past_key[:, :, :index], xk), dim=2) + xv = torch.cat((past_value[:, :, :index], xv), dim=2) + present_key_value = (xk, xv, index + num_tokens) + else: + present_key_value = (xk, xv, index + num_tokens) + xk = xk.repeat_interleave(self.num_heads // self.num_kv_heads, dim=1) xv = xv.repeat_interleave(self.num_heads // self.num_kv_heads, dim=1) output = optimized_attention(xq, xk, xv, self.num_heads, mask=attention_mask, skip_reshape=True) - return self.o_proj(output) + return self.o_proj(output), present_key_value class MLP(nn.Module): def __init__(self, config: Llama2Config, device=None, dtype=None, ops: Any = None): @@ -408,15 +439,17 @@ class TransformerBlock(nn.Module): attention_mask: Optional[torch.Tensor] = None, freqs_cis: Optional[torch.Tensor] = None, optimized_attention=None, + past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, ): # Self Attention residual = x x = self.input_layernorm(x) - x = self.self_attn( + x, present_key_value = self.self_attn( hidden_states=x, attention_mask=attention_mask, freqs_cis=freqs_cis, optimized_attention=optimized_attention, + past_key_value=past_key_value, ) x = residual + x @@ -426,7 +459,7 @@ class TransformerBlock(nn.Module): x = self.mlp(x) x = residual + x - return x + return x, present_key_value class TransformerBlockGemma2(nn.Module): def __init__(self, config: Llama2Config, index, device=None, dtype=None, ops: Any = None): @@ -451,6 +484,7 @@ class TransformerBlockGemma2(nn.Module): attention_mask: Optional[torch.Tensor] = None, freqs_cis: Optional[torch.Tensor] = None, optimized_attention=None, + past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, ): if self.transformer_type == 'gemma3': if self.sliding_attention: @@ -468,11 +502,12 @@ class TransformerBlockGemma2(nn.Module): # Self Attention residual = x x = self.input_layernorm(x) - x = self.self_attn( + x, present_key_value = self.self_attn( hidden_states=x, attention_mask=attention_mask, freqs_cis=freqs_cis, optimized_attention=optimized_attention, + past_key_value=past_key_value, ) x = self.post_attention_layernorm(x) @@ -485,7 +520,7 @@ class TransformerBlockGemma2(nn.Module): x = self.post_feedforward_layernorm(x) x = residual + x - return x + return x, present_key_value class Llama2_(nn.Module): def __init__(self, config, device=None, dtype=None, ops=None): @@ -516,9 +551,10 @@ class Llama2_(nn.Module): else: self.norm = None - # self.lm_head = ops.Linear(config.hidden_size, config.vocab_size, bias=False, device=device, dtype=dtype) + if config.lm_head: + self.lm_head = ops.Linear(config.hidden_size, config.vocab_size, bias=False, device=device, dtype=dtype) - def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, position_ids=None, embeds_info=[]): + def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, position_ids=None, embeds_info=[], past_key_values=None): if embeds is not None: x = embeds else: @@ -527,8 +563,13 @@ class Llama2_(nn.Module): if self.normalize_in: x *= self.config.hidden_size ** 0.5 + seq_len = x.shape[1] + past_len = 0 + if past_key_values is not None and len(past_key_values) > 0: + past_len = past_key_values[0][2] + if position_ids is None: - position_ids = torch.arange(0, x.shape[1], device=x.device).unsqueeze(0) + position_ids = torch.arange(past_len, past_len + seq_len, device=x.device).unsqueeze(0) freqs_cis = precompute_freqs_cis(self.config.head_dim, position_ids, @@ -539,14 +580,16 @@ class Llama2_(nn.Module): mask = None if attention_mask is not None: - mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]) + mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, seq_len, attention_mask.shape[-1]) mask = mask.masked_fill(mask.to(torch.bool), float("-inf")) - causal_mask = torch.empty(x.shape[1], x.shape[1], dtype=x.dtype, device=x.device).fill_(float("-inf")).triu_(1) - if mask is not None: - mask += causal_mask - else: - mask = causal_mask + if seq_len > 1: + causal_mask = torch.empty(past_len + seq_len, past_len + seq_len, dtype=x.dtype, device=x.device).fill_(float("-inf")).triu_(1) + if mask is not None: + mask += causal_mask + else: + mask = causal_mask + optimized_attention = optimized_attention_for_device(x.device, mask=mask is not None, small_input=True) intermediate = None @@ -562,16 +605,27 @@ class Llama2_(nn.Module): elif intermediate_output < 0: intermediate_output = len(self.layers) + intermediate_output + next_key_values = [] for i, layer in enumerate(self.layers): if all_intermediate is not None: if only_layers is None or (i in only_layers): all_intermediate.append(x.unsqueeze(1).clone()) - x = layer( + + past_kv = None + if past_key_values is not None: + past_kv = past_key_values[i] if len(past_key_values) > 0 else [] + + x, current_kv = layer( x=x, attention_mask=mask, freqs_cis=freqs_cis, optimized_attention=optimized_attention, + past_key_value=past_kv, ) + + if current_kv is not None: + next_key_values.append(current_kv) + if i == intermediate_output: intermediate = x.clone() @@ -588,7 +642,10 @@ class Llama2_(nn.Module): if intermediate is not None and final_layer_norm_intermediate and self.norm is not None: intermediate = self.norm(intermediate) - return x, intermediate + if len(next_key_values) > 0: + return x, intermediate, next_key_values + else: + return x, intermediate class Gemma3MultiModalProjector(torch.nn.Module): diff --git a/comfy/text_encoders/lt.py b/comfy/text_encoders/lt.py index c33c77db7..e49161964 100644 --- a/comfy/text_encoders/lt.py +++ b/comfy/text_encoders/lt.py @@ -118,9 +118,18 @@ class LTXAVTEModel(torch.nn.Module): sdo = comfy.utils.state_dict_prefix_replace(sd, {"text_embedding_projection.aggregate_embed.weight": "text_embedding_projection.weight", "model.diffusion_model.video_embeddings_connector.": "video_embeddings_connector.", "model.diffusion_model.audio_embeddings_connector.": "audio_embeddings_connector."}, filter_keys=True) if len(sdo) == 0: sdo = sd - missing, unexpected = self.load_state_dict(sdo, strict=False) - missing = [k for k in missing if not k.startswith("gemma3_12b.")] # filter out keys that belong to the main gemma model - return (missing, unexpected) + + missing_all = [] + unexpected_all = [] + + for prefix, component in [("text_embedding_projection.", self.text_embedding_projection), ("video_embeddings_connector.", self.video_embeddings_connector), ("audio_embeddings_connector.", self.audio_embeddings_connector)]: + component_sd = {k.replace(prefix, ""): v for k, v in sdo.items() if k.startswith(prefix)} + if component_sd: + missing, unexpected = component.load_state_dict(component_sd, strict=False) + missing_all.extend([f"{prefix}{k}" for k in missing]) + unexpected_all.extend([f"{prefix}{k}" for k in unexpected]) + + return (missing_all, unexpected_all) def memory_estimation_function(self, token_weight_pairs, device=None): constant = 6.0 diff --git a/comfy/weight_adapter/__init__.py b/comfy/weight_adapter/__init__.py index b40f920e4..b9fa8d5cf 100644 --- a/comfy/weight_adapter/__init__.py +++ b/comfy/weight_adapter/__init__.py @@ -5,6 +5,11 @@ from .lokr import LoKrAdapter from .glora import GLoRAAdapter from .oft import OFTAdapter from .boft import BOFTAdapter +from .bypass import ( + BypassInjectionManager, + BypassForwardHook, + create_bypass_injections_from_patches, +) adapters: list[type[WeightAdapterBase]] = [ @@ -31,4 +36,7 @@ __all__ = [ "WeightAdapterTrainBase", "adapters", "adapter_maps", + "BypassInjectionManager", + "BypassForwardHook", + "create_bypass_injections_from_patches", ] + [a.__name__ for a in adapters] diff --git a/comfy/weight_adapter/base.py b/comfy/weight_adapter/base.py index 43644b106..bce89a0e2 100644 --- a/comfy/weight_adapter/base.py +++ b/comfy/weight_adapter/base.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Callable, Optional import torch import torch.nn as nn @@ -7,12 +7,35 @@ import comfy.model_management class WeightAdapterBase: + """ + Base class for weight adapters (LoRA, LoHa, LoKr, OFT, etc.) + + Bypass Mode: + All adapters follow the pattern: bypass(f)(x) = g(f(x) + h(x)) + + - h(x): Additive component (LoRA path). Returns delta to add to base output. + - g(y): Output transformation. Applied after base + h(x). + + For LoRA/LoHa/LoKr: g = identity, h = adapter(x) + For OFT/BOFT: g = transform, h = 0 + """ + name: str loaded_keys: set[str] weights: list[torch.Tensor] + # Attributes set by bypass system + multiplier: float = 1.0 + shape: tuple = None # (out_features, in_features) or (out_ch, in_ch, *kernel) + @classmethod - def load(cls, x: str, lora: dict[str, torch.Tensor], alpha: float, dora_scale: torch.Tensor) -> Optional["WeightAdapterBase"]: + def load( + cls, + x: str, + lora: dict[str, torch.Tensor], + alpha: float, + dora_scale: torch.Tensor, + ) -> Optional["WeightAdapterBase"]: raise NotImplementedError def to_train(self) -> "WeightAdapterTrainBase": @@ -39,18 +62,202 @@ class WeightAdapterBase: ): raise NotImplementedError + # ===== Bypass Mode Methods ===== + # + # IMPORTANT: Bypass mode is designed for quantized models where original weights + # may not be accessible in a usable format. Therefore, h() and bypass_forward() + # do NOT take org_weight as a parameter. All necessary information (out_channels, + # in_channels, conv params, etc.) is provided via attributes set by BypassForwardHook. + + def h(self, x: torch.Tensor, base_out: torch.Tensor) -> torch.Tensor: + """ + Additive bypass component: h(x, base_out) + + Computes the adapter's contribution to be added to base forward output. + For adapters that only transform output (OFT/BOFT), returns zeros. + + Note: + This method does NOT access original model weights. Bypass mode is + designed for quantized models where weights may not be in a usable format. + All shape info comes from module attributes set by BypassForwardHook. + + Args: + x: Input tensor + base_out: Output from base forward f(x), can be used for shape reference + + Returns: + Delta tensor to add to base output. Shape matches base output. + + Reference: LyCORIS LoConModule.bypass_forward_diff + """ + # Default: no additive component (for OFT/BOFT) + # Simply return zeros matching base_out shape + return torch.zeros_like(base_out) + + def g(self, y: torch.Tensor) -> torch.Tensor: + """ + Output transformation: g(y) + + Applied after base forward + h(x). For most adapters this is identity. + OFT/BOFT override this to apply orthogonal transformation. + + Args: + y: Combined output (base + h(x)) + + Returns: + Transformed output + + Reference: LyCORIS OFTModule applies orthogonal transform here + """ + # Default: identity (for LoRA/LoHa/LoKr) + return y + + def bypass_forward( + self, + org_forward: Callable, + x: torch.Tensor, + *args, + **kwargs, + ) -> torch.Tensor: + """ + Full bypass forward: g(f(x) + h(x, f(x))) + + Note: + This method does NOT take org_weight/org_bias parameters. Bypass mode + is designed for quantized models where weights may not be accessible. + The original forward function handles weight access internally. + + Args: + org_forward: Original module forward function + x: Input tensor + *args, **kwargs: Additional arguments for org_forward + + Returns: + Output with adapter applied in bypass mode + + Reference: LyCORIS LoConModule.bypass_forward + """ + # Base forward: f(x) + base_out = org_forward(x, *args, **kwargs) + + # Additive component: h(x, base_out) - base_out provided for shape reference + h_out = self.h(x, base_out) + + # Output transformation: g(base + h) + return self.g(base_out + h_out) + class WeightAdapterTrainBase(nn.Module): - # We follow the scheme of PR #7032 + """ + Base class for trainable weight adapters (LoRA, LoHa, LoKr, OFT, etc.) + + Bypass Mode: + All adapters follow the pattern: bypass(f)(x) = g(f(x) + h(x)) + + - h(x): Additive component (LoRA path). Returns delta to add to base output. + - g(y): Output transformation. Applied after base + h(x). + + For LoRA/LoHa/LoKr: g = identity, h = adapter(x) + For OFT: g = transform, h = 0 + + Note: + Unlike WeightAdapterBase, TrainBase classes have simplified weight formats + with fewer branches (e.g., LoKr only has w1/w2, not w1_a/w1_b decomposition). + + We follow the scheme of PR #7032 + """ + + # Attributes set by bypass system (BypassForwardHook) + # These are set before h()/g()/bypass_forward() are called + multiplier: float = 1.0 + is_conv: bool = False + conv_dim: int = 0 # 0=linear, 1=conv1d, 2=conv2d, 3=conv3d + kw_dict: dict = {} # Conv kwargs: stride, padding, dilation, groups + kernel_size: tuple = () + in_channels: int = None + out_channels: int = None + def __init__(self): super().__init__() def __call__(self, w): """ - w: The original weight tensor to be modified. + Weight modification mode: returns modified weight. + + Args: + w: The original weight tensor to be modified. + + Returns: + Modified weight tensor. """ raise NotImplementedError + # ===== Bypass Mode Methods ===== + + def h(self, x: torch.Tensor, base_out: torch.Tensor) -> torch.Tensor: + """ + Additive bypass component: h(x, base_out) + + Computes the adapter's contribution to be added to base forward output. + For adapters that only transform output (OFT), returns zeros. + + Args: + x: Input tensor + base_out: Output from base forward f(x), can be used for shape reference + + Returns: + Delta tensor to add to base output. Shape matches base output. + + Subclasses should override this method. + """ + raise NotImplementedError( + f"{self.__class__.__name__}.h() not implemented. " + "Subclasses must implement h() for bypass mode." + ) + + def g(self, y: torch.Tensor) -> torch.Tensor: + """ + Output transformation: g(y) + + Applied after base forward + h(x). For most adapters this is identity. + OFT overrides this to apply orthogonal transformation. + + Args: + y: Combined output (base + h(x)) + + Returns: + Transformed output + """ + # Default: identity (for LoRA/LoHa/LoKr) + return y + + def bypass_forward( + self, + org_forward: Callable, + x: torch.Tensor, + *args, + **kwargs, + ) -> torch.Tensor: + """ + Full bypass forward: g(f(x) + h(x, f(x))) + + Args: + org_forward: Original module forward function + x: Input tensor + *args, **kwargs: Additional arguments for org_forward + + Returns: + Output with adapter applied in bypass mode + """ + # Base forward: f(x) + base_out = org_forward(x, *args, **kwargs) + + # Additive component: h(x, base_out) - base_out provided for shape reference + h_out = self.h(x, base_out) + + # Output transformation: g(base + h) + return self.g(base_out + h_out) + def passive_memory_usage(self): raise NotImplementedError("passive_memory_usage is not implemented") @@ -59,8 +266,12 @@ class WeightAdapterTrainBase(nn.Module): return self.passive_memory_usage() -def weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function): - dora_scale = comfy.model_management.cast_to_device(dora_scale, weight.device, intermediate_dtype) +def weight_decompose( + dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function +): + dora_scale = comfy.model_management.cast_to_device( + dora_scale, weight.device, intermediate_dtype + ) lora_diff *= alpha weight_calc = weight + function(lora_diff).type(weight.dtype) @@ -106,10 +317,14 @@ def pad_tensor_to_shape(tensor: torch.Tensor, new_shape: list[int]) -> torch.Ten the original tensor will be truncated in that dimension. """ if any([new_shape[i] < tensor.shape[i] for i in range(len(new_shape))]): - raise ValueError("The new shape must be larger than the original tensor in all dimensions") + raise ValueError( + "The new shape must be larger than the original tensor in all dimensions" + ) if len(new_shape) != len(tensor.shape): - raise ValueError("The new shape must have the same number of dimensions as the original tensor") + raise ValueError( + "The new shape must have the same number of dimensions as the original tensor" + ) # Create a new tensor filled with zeros padded_tensor = torch.zeros(new_shape, dtype=tensor.dtype, device=tensor.device) diff --git a/comfy/weight_adapter/boft.py b/comfy/weight_adapter/boft.py index b2a2f1bd4..02a8dc130 100644 --- a/comfy/weight_adapter/boft.py +++ b/comfy/weight_adapter/boft.py @@ -62,9 +62,13 @@ class BOFTAdapter(WeightAdapterBase): alpha = v[2] dora_scale = v[3] - blocks = comfy.model_management.cast_to_device(blocks, weight.device, intermediate_dtype) + blocks = comfy.model_management.cast_to_device( + blocks, weight.device, intermediate_dtype + ) if rescale is not None: - rescale = comfy.model_management.cast_to_device(rescale, weight.device, intermediate_dtype) + rescale = comfy.model_management.cast_to_device( + rescale, weight.device, intermediate_dtype + ) boft_m, block_num, boft_b, *_ = blocks.shape @@ -74,7 +78,7 @@ class BOFTAdapter(WeightAdapterBase): # for Q = -Q^T q = blocks - blocks.transpose(-1, -2) normed_q = q - if alpha > 0: # alpha in boft/bboft is for constraint + if alpha > 0: # alpha in boft/bboft is for constraint q_norm = torch.norm(q) + 1e-8 if q_norm > alpha: normed_q = q * alpha / q_norm @@ -83,13 +87,13 @@ class BOFTAdapter(WeightAdapterBase): r = r.to(weight) inp = org = weight - r_b = boft_b//2 + r_b = boft_b // 2 for i in range(boft_m): bi = r[i] g = 2 k = 2**i * r_b if strength != 1: - bi = bi * strength + (1-strength) * I + bi = bi * strength + (1 - strength) * I inp = ( inp.unflatten(0, (-1, g, k)) .transpose(1, 2) @@ -98,18 +102,117 @@ class BOFTAdapter(WeightAdapterBase): ) inp = torch.einsum("b i j, b j ...-> b i ...", bi, inp) inp = ( - inp.flatten(0, 1).unflatten(0, (-1, k, g)).transpose(1, 2).flatten(0, 2) + inp.flatten(0, 1) + .unflatten(0, (-1, k, g)) + .transpose(1, 2) + .flatten(0, 2) ) if rescale is not None: inp = inp * rescale lora_diff = inp - org - lora_diff = comfy.model_management.cast_to_device(lora_diff, weight.device, intermediate_dtype) + lora_diff = comfy.model_management.cast_to_device( + lora_diff, weight.device, intermediate_dtype + ) if dora_scale is not None: - weight = weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function) + weight = weight_decompose( + dora_scale, + weight, + lora_diff, + alpha, + strength, + intermediate_dtype, + function, + ) else: weight += function((strength * lora_diff).type(weight.dtype)) except Exception as e: logging.error("ERROR {} {} {}".format(self.name, key, e)) return weight + + def _get_orthogonal_matrices(self, device, dtype): + """Compute the orthogonal rotation matrices R from BOFT blocks.""" + v = self.weights + blocks = v[0].to(device=device, dtype=dtype) + alpha = v[2] + if alpha is None: + alpha = 0 + + boft_m, block_num, boft_b, _ = blocks.shape + I = torch.eye(boft_b, device=device, dtype=dtype) + + # Q = blocks - blocks^T (skew-symmetric) + q = blocks - blocks.transpose(-1, -2) + normed_q = q + + # Apply constraint if alpha > 0 + if alpha > 0: + q_norm = torch.norm(q) + 1e-8 + if q_norm > alpha: + normed_q = q * alpha / q_norm + + # Cayley transform: R = (I + Q)(I - Q)^-1 + r = (I + normed_q) @ (I - normed_q).float().inverse() + return r, boft_m, boft_b + + def g(self, y: torch.Tensor) -> torch.Tensor: + """ + Output transformation for BOFT: applies butterfly orthogonal transform. + + BOFT uses multiple stages of butterfly-structured orthogonal transforms. + + Reference: LyCORIS ButterflyOFTModule._bypass_forward + """ + v = self.weights + rescale = v[1] + + r, boft_m, boft_b = self._get_orthogonal_matrices(y.device, y.dtype) + r_b = boft_b // 2 + + # Apply multiplier + multiplier = getattr(self, "multiplier", 1.0) + I = torch.eye(boft_b, device=y.device, dtype=y.dtype) + + # Use module info from bypass injection to determine conv vs linear + is_conv = getattr(self, "is_conv", y.dim() > 2) + + if is_conv: + # Conv output: (N, C, H, W, ...) -> transpose to (N, H, W, ..., C) + y = y.transpose(1, -1) + + # Apply butterfly transform stages + inp = y + for i in range(boft_m): + bi = r[i] # (block_num, boft_b, boft_b) + g = 2 + k = 2**i * r_b + + # Interpolate with identity based on multiplier + if multiplier != 1: + bi = bi * multiplier + (1 - multiplier) * I + + # Reshape for butterfly: unflatten last dim, transpose, flatten, unflatten + inp = ( + inp.unflatten(-1, (-1, g, k)) + .transpose(-2, -1) + .flatten(-3) + .unflatten(-1, (-1, boft_b)) + ) + # Apply block-diagonal orthogonal transform + inp = torch.einsum("b i j, ... b j -> ... b i", bi, inp) + # Reshape back + inp = ( + inp.flatten(-2).unflatten(-1, (-1, k, g)).transpose(-2, -1).flatten(-3) + ) + + # Apply rescale if present + if rescale is not None: + rescale = rescale.to(device=y.device, dtype=y.dtype) + inp = inp * rescale.transpose(0, -1) + + if is_conv: + # Transpose back: (N, H, W, ..., C) -> (N, C, H, W, ...) + inp = inp.transpose(1, -1) + + return inp diff --git a/comfy/weight_adapter/bypass.py b/comfy/weight_adapter/bypass.py new file mode 100644 index 000000000..d4aaf98ca --- /dev/null +++ b/comfy/weight_adapter/bypass.py @@ -0,0 +1,437 @@ +""" +Bypass mode implementation for weight adapters (LoRA, LoKr, LoHa, etc.) + +Bypass mode applies adapters during forward pass without modifying base weights: + bypass(f)(x) = g(f(x) + h(x)) + +Where: + - f(x): Original layer forward + - h(x): Additive component from adapter (LoRA path) + - g(y): Output transformation (identity for most adapters) + +This is useful for: + - Training with gradient checkpointing + - Avoiding weight modifications when weights are offloaded + - Supporting multiple adapters with different strengths dynamically +""" + +import logging +from typing import Optional, Union + +import torch +import torch.nn as nn + +from .base import WeightAdapterBase, WeightAdapterTrainBase +from comfy.patcher_extension import PatcherInjection + +# Type alias for adapters that support bypass mode +BypassAdapter = Union[WeightAdapterBase, WeightAdapterTrainBase] + + +def get_module_type_info(module: nn.Module) -> dict: + """ + Determine module type and extract conv parameters from module class. + + This is more reliable than checking weight.ndim, especially for quantized layers + where weight shape might be different. + + Returns: + dict with keys: is_conv, conv_dim, stride, padding, dilation, groups + """ + info = { + "is_conv": False, + "conv_dim": 0, + "stride": (1,), + "padding": (0,), + "dilation": (1,), + "groups": 1, + "kernel_size": (1,), + "in_channels": None, + "out_channels": None, + } + + # Determine conv type + if isinstance(module, nn.Conv1d): + info["is_conv"] = True + info["conv_dim"] = 1 + elif isinstance(module, nn.Conv2d): + info["is_conv"] = True + info["conv_dim"] = 2 + elif isinstance(module, nn.Conv3d): + info["is_conv"] = True + info["conv_dim"] = 3 + elif isinstance(module, nn.Linear): + info["is_conv"] = False + info["conv_dim"] = 0 + else: + # Try to infer from class name for custom/quantized layers + class_name = type(module).__name__.lower() + if "conv3d" in class_name: + info["is_conv"] = True + info["conv_dim"] = 3 + elif "conv2d" in class_name: + info["is_conv"] = True + info["conv_dim"] = 2 + elif "conv1d" in class_name: + info["is_conv"] = True + info["conv_dim"] = 1 + elif "conv" in class_name: + info["is_conv"] = True + info["conv_dim"] = 2 + + # Extract conv parameters if it's a conv layer + if info["is_conv"]: + # Try to get stride, padding, dilation, groups, kernel_size from module + info["stride"] = getattr(module, "stride", (1,) * info["conv_dim"]) + info["padding"] = getattr(module, "padding", (0,) * info["conv_dim"]) + info["dilation"] = getattr(module, "dilation", (1,) * info["conv_dim"]) + info["groups"] = getattr(module, "groups", 1) + info["kernel_size"] = getattr(module, "kernel_size", (1,) * info["conv_dim"]) + info["in_channels"] = getattr(module, "in_channels", None) + info["out_channels"] = getattr(module, "out_channels", None) + + # Ensure they're tuples + if isinstance(info["stride"], int): + info["stride"] = (info["stride"],) * info["conv_dim"] + if isinstance(info["padding"], int): + info["padding"] = (info["padding"],) * info["conv_dim"] + if isinstance(info["dilation"], int): + info["dilation"] = (info["dilation"],) * info["conv_dim"] + if isinstance(info["kernel_size"], int): + info["kernel_size"] = (info["kernel_size"],) * info["conv_dim"] + + return info + + +class BypassForwardHook: + """ + Hook that wraps a layer's forward to apply adapter in bypass mode. + + Stores the original forward and replaces it with bypass version. + + Supports both: + - WeightAdapterBase: Inference adapters (uses self.weights tuple) + - WeightAdapterTrainBase: Training adapters (nn.Module with parameters) + """ + + def __init__( + self, + module: nn.Module, + adapter: BypassAdapter, + multiplier: float = 1.0, + ): + self.module = module + self.adapter = adapter + self.multiplier = multiplier + self.original_forward = None + + # Determine layer type and conv params from module class (works for quantized layers) + module_info = get_module_type_info(module) + + # Set multiplier and layer type info on adapter for use in h() + adapter.multiplier = multiplier + adapter.is_conv = module_info["is_conv"] + adapter.conv_dim = module_info["conv_dim"] + adapter.kernel_size = module_info["kernel_size"] + adapter.in_channels = module_info["in_channels"] + adapter.out_channels = module_info["out_channels"] + # Store kw_dict for conv operations (like LyCORIS extra_args) + if module_info["is_conv"]: + adapter.kw_dict = { + "stride": module_info["stride"], + "padding": module_info["padding"], + "dilation": module_info["dilation"], + "groups": module_info["groups"], + } + else: + adapter.kw_dict = {} + + def _bypass_forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: + """Bypass forward: uses adapter's bypass_forward or default g(f(x) + h(x)) + + Note: + Bypass mode does NOT access original model weights (org_weight). + This is intentional - bypass mode is designed for quantized models + where weights may not be in a usable format. All necessary shape + information is provided via adapter attributes set during inject(). + """ + # Check if adapter has custom bypass_forward (e.g., GLoRA) + adapter_bypass = getattr(self.adapter, "bypass_forward", None) + if adapter_bypass is not None: + # Check if it's overridden (not the base class default) + # Need to check both base classes since adapter could be either type + adapter_type = type(self.adapter) + is_default_bypass = ( + adapter_type.bypass_forward is WeightAdapterBase.bypass_forward + or adapter_type.bypass_forward is WeightAdapterTrainBase.bypass_forward + ) + if not is_default_bypass: + return adapter_bypass(self.original_forward, x, *args, **kwargs) + + # Default bypass: g(f(x) + h(x, f(x))) + base_out = self.original_forward(x, *args, **kwargs) + h_out = self.adapter.h(x, base_out) + return self.adapter.g(base_out + h_out) + + def inject(self): + """Replace module forward with bypass version.""" + if self.original_forward is not None: + logging.debug( + f"[BypassHook] Already injected for {type(self.module).__name__}" + ) + return # Already injected + + # Move adapter weights to module's device to avoid CPU-GPU transfer on every forward + device = None + dtype = None + if hasattr(self.module, "weight") and self.module.weight is not None: + device = self.module.weight.device + dtype = self.module.weight.dtype + elif hasattr(self.module, "W_q"): # Quantized layers might use different attr + device = self.module.W_q.device + dtype = self.module.W_q.dtype + + if device is not None: + self._move_adapter_weights_to_device(device, dtype) + + self.original_forward = self.module.forward + self.module.forward = self._bypass_forward + logging.debug( + f"[BypassHook] Injected bypass forward for {type(self.module).__name__} (adapter={type(self.adapter).__name__})" + ) + + def _move_adapter_weights_to_device(self, device, dtype=None): + """Move adapter weights to specified device to avoid per-forward transfers. + + Handles both: + - WeightAdapterBase: has self.weights tuple of tensors + - WeightAdapterTrainBase: nn.Module with parameters, uses .to() method + """ + adapter = self.adapter + + # Check if adapter is an nn.Module (WeightAdapterTrainBase) + if isinstance(adapter, nn.Module): + # In training mode we don't touch dtype as trainer will handle it + adapter.to(device=device) + logging.debug( + f"[BypassHook] Moved training adapter (nn.Module) to {device}" + ) + return + + # WeightAdapterBase: handle self.weights tuple + if not hasattr(adapter, "weights") or adapter.weights is None: + return + + weights = adapter.weights + if isinstance(weights, (list, tuple)): + new_weights = [] + for w in weights: + if isinstance(w, torch.Tensor): + if dtype is not None: + new_weights.append(w.to(device=device, dtype=dtype)) + else: + new_weights.append(w.to(device=device)) + else: + new_weights.append(w) + adapter.weights = ( + tuple(new_weights) if isinstance(weights, tuple) else new_weights + ) + elif isinstance(weights, torch.Tensor): + if dtype is not None: + adapter.weights = weights.to(device=device, dtype=dtype) + else: + adapter.weights = weights.to(device=device) + + logging.debug(f"[BypassHook] Moved adapter weights to {device}") + + def eject(self): + """Restore original module forward.""" + if self.original_forward is None: + logging.debug(f"[BypassHook] Not injected for {type(self.module).__name__}") + return # Not injected + + self.module.forward = self.original_forward + self.original_forward = None + logging.debug( + f"[BypassHook] Ejected bypass forward for {type(self.module).__name__}" + ) + + +class BypassInjectionManager: + """ + Manages bypass mode injection for a collection of adapters. + + Creates PatcherInjection objects that can be used with ModelPatcher. + + Supports both inference adapters (WeightAdapterBase) and training adapters + (WeightAdapterTrainBase). + + Usage: + manager = BypassInjectionManager() + manager.add_adapter("model.layers.0.self_attn.q_proj", lora_adapter, strength=0.8) + manager.add_adapter("model.layers.0.self_attn.k_proj", lora_adapter, strength=0.8) + + injections = manager.create_injections(model) + model_patcher.set_injections("bypass_lora", injections) + """ + + def __init__(self): + self.adapters: dict[str, tuple[BypassAdapter, float]] = {} + self.hooks: list[BypassForwardHook] = [] + + def add_adapter( + self, + key: str, + adapter: BypassAdapter, + strength: float = 1.0, + ): + """ + Add an adapter for a specific weight key. + + Args: + key: Weight key (e.g., "model.layers.0.self_attn.q_proj.weight") + adapter: The weight adapter (LoRAAdapter, LoKrAdapter, etc.) + strength: Multiplier for adapter effect + """ + # Remove .weight suffix if present for module lookup + module_key = key + if module_key.endswith(".weight"): + module_key = module_key[:-7] + logging.debug( + f"[BypassManager] Stripped .weight suffix: {key} -> {module_key}" + ) + + self.adapters[module_key] = (adapter, strength) + logging.debug( + f"[BypassManager] Added adapter: {module_key} (type={type(adapter).__name__}, strength={strength})" + ) + + def clear_adapters(self): + """Remove all adapters.""" + self.adapters.clear() + + def _get_module_by_key(self, model: nn.Module, key: str) -> Optional[nn.Module]: + """Get a submodule by dot-separated key.""" + parts = key.split(".") + module = model + try: + for i, part in enumerate(parts): + if part.isdigit(): + module = module[int(part)] + else: + module = getattr(module, part) + logging.debug( + f"[BypassManager] Found module for key {key}: {type(module).__name__}" + ) + return module + except (AttributeError, IndexError, KeyError) as e: + logging.error(f"[BypassManager] Failed to find module for key {key}: {e}") + logging.error( + f"[BypassManager] Failed at part index {i}, part={part}, current module type={type(module).__name__}" + ) + return None + + def create_injections(self, model: nn.Module) -> list[PatcherInjection]: + """ + Create PatcherInjection objects for all registered adapters. + + Args: + model: The model to inject into (e.g., model_patcher.model) + + Returns: + List of PatcherInjection objects to use with model_patcher.set_injections() + """ + self.hooks.clear() + + logging.debug( + f"[BypassManager] create_injections called with {len(self.adapters)} adapters" + ) + logging.debug(f"[BypassManager] Model type: {type(model).__name__}") + + for key, (adapter, strength) in self.adapters.items(): + logging.debug(f"[BypassManager] Looking for module: {key}") + module = self._get_module_by_key(model, key) + + if module is None: + logging.warning(f"[BypassManager] Module not found for key {key}") + continue + + if not hasattr(module, "weight"): + logging.warning( + f"[BypassManager] Module {key} has no weight attribute (type={type(module).__name__})" + ) + continue + + logging.debug( + f"[BypassManager] Creating hook for {key} (module type={type(module).__name__}, weight shape={module.weight.shape})" + ) + hook = BypassForwardHook(module, adapter, multiplier=strength) + self.hooks.append(hook) + + logging.debug(f"[BypassManager] Created {len(self.hooks)} hooks") + + # Create single injection that manages all hooks + def inject_all(model_patcher): + logging.debug( + f"[BypassManager] inject_all called, injecting {len(self.hooks)} hooks" + ) + for hook in self.hooks: + hook.inject() + logging.debug( + f"[BypassManager] Injected hook for {type(hook.module).__name__}" + ) + + def eject_all(model_patcher): + logging.debug( + f"[BypassManager] eject_all called, ejecting {len(self.hooks)} hooks" + ) + for hook in self.hooks: + hook.eject() + + return [PatcherInjection(inject=inject_all, eject=eject_all)] + + def get_hook_count(self) -> int: + """Return number of hooks that will be/are injected.""" + return len(self.hooks) + + +def create_bypass_injections_from_patches( + model: nn.Module, + patches: dict, + strength: float = 1.0, +) -> list[PatcherInjection]: + """ + Convenience function to create bypass injections from a patches dict. + + This is useful when you have patches in the format used by model_patcher.add_patches() + and want to apply them in bypass mode instead. + + Args: + model: The model to inject into + patches: Dict mapping weight keys to adapter data + strength: Global strength multiplier + + Returns: + List of PatcherInjection objects + """ + manager = BypassInjectionManager() + + for key, patch_list in patches.items(): + if not patch_list: + continue + + # patches format: list of (strength_patch, patch_data, strength_model, offset, function) + for patch in patch_list: + patch_strength, patch_data, strength_model, offset, function = patch + + # patch_data should be a WeightAdapterBase/WeightAdapterTrainBase or tuple + if isinstance(patch_data, (WeightAdapterBase, WeightAdapterTrainBase)): + adapter = patch_data + else: + # Skip non-adapter patches + continue + + combined_strength = strength * patch_strength + manager.add_adapter(key, adapter, strength=combined_strength) + + return manager.create_injections(model) diff --git a/comfy/weight_adapter/glora.py b/comfy/weight_adapter/glora.py index 939abbba5..d6b97a23b 100644 --- a/comfy/weight_adapter/glora.py +++ b/comfy/weight_adapter/glora.py @@ -1,7 +1,8 @@ import logging -from typing import Optional +from typing import Callable, Optional import torch +import torch.nn.functional as F import comfy.model_management from .base import WeightAdapterBase, weight_decompose @@ -29,7 +30,14 @@ class GLoRAAdapter(WeightAdapterBase): b1_name = "{}.b1.weight".format(x) b2_name = "{}.b2.weight".format(x) if a1_name in lora: - weights = (lora[a1_name], lora[a2_name], lora[b1_name], lora[b2_name], alpha, dora_scale) + weights = ( + lora[a1_name], + lora[a2_name], + lora[b1_name], + lora[b2_name], + alpha, + dora_scale, + ) loaded_keys.add(a1_name) loaded_keys.add(a2_name) loaded_keys.add(b1_name) @@ -58,16 +66,28 @@ class GLoRAAdapter(WeightAdapterBase): old_glora = True if v[3].shape[0] == v[2].shape[1] == v[0].shape[1] == v[1].shape[0]: - if old_glora and v[1].shape[0] == weight.shape[0] and weight.shape[0] == weight.shape[1]: + if ( + old_glora + and v[1].shape[0] == weight.shape[0] + and weight.shape[0] == weight.shape[1] + ): pass else: old_glora = False rank = v[1].shape[0] - a1 = comfy.model_management.cast_to_device(v[0].flatten(start_dim=1), weight.device, intermediate_dtype) - a2 = comfy.model_management.cast_to_device(v[1].flatten(start_dim=1), weight.device, intermediate_dtype) - b1 = comfy.model_management.cast_to_device(v[2].flatten(start_dim=1), weight.device, intermediate_dtype) - b2 = comfy.model_management.cast_to_device(v[3].flatten(start_dim=1), weight.device, intermediate_dtype) + a1 = comfy.model_management.cast_to_device( + v[0].flatten(start_dim=1), weight.device, intermediate_dtype + ) + a2 = comfy.model_management.cast_to_device( + v[1].flatten(start_dim=1), weight.device, intermediate_dtype + ) + b1 = comfy.model_management.cast_to_device( + v[2].flatten(start_dim=1), weight.device, intermediate_dtype + ) + b2 = comfy.model_management.cast_to_device( + v[3].flatten(start_dim=1), weight.device, intermediate_dtype + ) if v[4] is not None: alpha = v[4] / rank @@ -76,18 +96,195 @@ class GLoRAAdapter(WeightAdapterBase): try: if old_glora: - lora_diff = (torch.mm(b2, b1) + torch.mm(torch.mm(weight.flatten(start_dim=1).to(dtype=intermediate_dtype), a2), a1)).reshape(weight.shape) #old lycoris glora + lora_diff = ( + torch.mm(b2, b1) + + torch.mm( + torch.mm( + weight.flatten(start_dim=1).to(dtype=intermediate_dtype), a2 + ), + a1, + ) + ).reshape( + weight.shape + ) # old lycoris glora else: if weight.dim() > 2: - lora_diff = torch.einsum("o i ..., i j -> o j ...", torch.einsum("o i ..., i j -> o j ...", weight.to(dtype=intermediate_dtype), a1), a2).reshape(weight.shape) + lora_diff = torch.einsum( + "o i ..., i j -> o j ...", + torch.einsum( + "o i ..., i j -> o j ...", + weight.to(dtype=intermediate_dtype), + a1, + ), + a2, + ).reshape(weight.shape) else: - lora_diff = torch.mm(torch.mm(weight.to(dtype=intermediate_dtype), a1), a2).reshape(weight.shape) + lora_diff = torch.mm( + torch.mm(weight.to(dtype=intermediate_dtype), a1), a2 + ).reshape(weight.shape) lora_diff += torch.mm(b1, b2).reshape(weight.shape) if dora_scale is not None: - weight = weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function) + weight = weight_decompose( + dora_scale, + weight, + lora_diff, + alpha, + strength, + intermediate_dtype, + function, + ) else: weight += function(((strength * alpha) * lora_diff).type(weight.dtype)) except Exception as e: logging.error("ERROR {} {} {}".format(self.name, key, e)) return weight + + def _compute_paths(self, x: torch.Tensor): + """ + Compute A path and B path outputs for GLoRA bypass. + + GLoRA: f(x) = Wx + WAx + Bx + - A path: a1(a2(x)) - modifies input to base forward + - B path: b1(b2(x)) - additive component + + Note: + Does not access original model weights - bypass mode is designed + for quantized models where weights may not be accessible. + + Returns: (a_out, b_out) + """ + v = self.weights + # v = (a1, a2, b1, b2, alpha, dora_scale) + a1 = v[0] + a2 = v[1] + b1 = v[2] + b2 = v[3] + alpha = v[4] + + dtype = x.dtype + + # Cast dtype (weights should already be on correct device from inject()) + a1 = a1.to(dtype=dtype) + a2 = a2.to(dtype=dtype) + b1 = b1.to(dtype=dtype) + b2 = b2.to(dtype=dtype) + + # Determine rank and scale + # Check for old vs new glora format + old_glora = False + if b2.shape[1] == b1.shape[0] == a1.shape[0] == a2.shape[1]: + rank = a1.shape[0] + old_glora = True + + if b2.shape[0] == b1.shape[1] == a1.shape[1] == a2.shape[0]: + if old_glora and a2.shape[0] == x.shape[-1] and x.shape[-1] == x.shape[-1]: + pass + else: + old_glora = False + rank = a2.shape[0] + + if alpha is not None: + scale = alpha / rank + else: + scale = 1.0 + + # Apply multiplier + multiplier = getattr(self, "multiplier", 1.0) + scale = scale * multiplier + + # Use module info from bypass injection, not input tensor shape + is_conv = getattr(self, "is_conv", False) + conv_dim = getattr(self, "conv_dim", 0) + kw_dict = getattr(self, "kw_dict", {}) + + if is_conv: + # Conv case - conv_dim is 1/2/3 for conv1d/2d/3d + conv_fn = (F.conv1d, F.conv2d, F.conv3d)[conv_dim - 1] + + # Get module's stride/padding for spatial dimension handling + module_stride = kw_dict.get("stride", (1,) * conv_dim) + module_padding = kw_dict.get("padding", (0,) * conv_dim) + kernel_size = getattr(self, "kernel_size", (1,) * conv_dim) + in_channels = getattr(self, "in_channels", None) + + # Ensure weights are in conv shape + # a1, a2, b1 are always 1x1 kernels + if a1.ndim == 2: + a1 = a1.view(*a1.shape, *([1] * conv_dim)) + if a2.ndim == 2: + a2 = a2.view(*a2.shape, *([1] * conv_dim)) + if b1.ndim == 2: + b1 = b1.view(*b1.shape, *([1] * conv_dim)) + # b2 has actual kernel_size (like LoRA down) + if b2.ndim == 2: + if in_channels is not None: + b2 = b2.view(b2.shape[0], in_channels, *kernel_size) + else: + b2 = b2.view(*b2.shape, *([1] * conv_dim)) + + # A path: a2(x) -> a1(...) - 1x1 convs, no stride/padding needed, a_out is added to x + a2_out = conv_fn(x, a2) + a_out = conv_fn(a2_out, a1) * scale + + # B path: b2(x) with kernel/stride/padding -> b1(...) 1x1 + b2_out = conv_fn(x, b2, stride=module_stride, padding=module_padding) + b_out = conv_fn(b2_out, b1) * scale + else: + # Linear case + if old_glora: + # Old format: a1 @ a2 @ x, b2 @ b1 + a_out = F.linear(F.linear(x, a2), a1) * scale + b_out = F.linear(F.linear(x, b1), b2) * scale + else: + # New format: x @ a1 @ a2, b1 @ b2 + a_out = F.linear(F.linear(x, a1), a2) * scale + b_out = F.linear(F.linear(x, b2), b1) * scale + + return a_out, b_out + + def bypass_forward( + self, + org_forward: Callable, + x: torch.Tensor, + *args, + **kwargs, + ) -> torch.Tensor: + """ + GLoRA bypass forward: f(x + a(x)) + b(x) + + Unlike standard adapters, GLoRA modifies the input to the base forward + AND adds the B path output. + + Note: + Does not access original model weights - bypass mode is designed + for quantized models where weights may not be accessible. + + Reference: LyCORIS GLoRAModule._bypass_forward + """ + a_out, b_out = self._compute_paths(x) + + # Call base forward with modified input + base_out = org_forward(x + a_out, *args, **kwargs) + + # Add B path + return base_out + b_out + + def h(self, x: torch.Tensor, base_out: torch.Tensor) -> torch.Tensor: + """ + For GLoRA, h() returns the B path output. + + Note: + GLoRA's full bypass requires overriding bypass_forward() since + it also modifies the input to org_forward. This h() is provided for + compatibility but bypass_forward() should be used for correct behavior. + + Does not access original model weights - bypass mode is designed + for quantized models where weights may not be accessible. + + Args: + x: Input tensor + base_out: Output from base forward (unused, for API consistency) + """ + _, b_out = self._compute_paths(x) + return b_out diff --git a/comfy/weight_adapter/loha.py b/comfy/weight_adapter/loha.py index 0abb2d403..8007b7b44 100644 --- a/comfy/weight_adapter/loha.py +++ b/comfy/weight_adapter/loha.py @@ -1,11 +1,22 @@ import logging +from functools import cache from typing import Optional import torch +import torch.nn.functional as F import comfy.model_management from .base import WeightAdapterBase, WeightAdapterTrainBase, weight_decompose +@cache +def _warn_loha_bypass_inefficient(): + """One-time warning about LoHa bypass inefficiency.""" + logging.warning( + "LoHa bypass mode is inefficient: full weight diff is computed each forward pass. " + "Consider using LoRA or LoKr for training with bypass mode." + ) + + class HadaWeight(torch.autograd.Function): @staticmethod def forward(ctx, w1u, w1d, w2u, w2d, scale=torch.tensor(1)): @@ -105,9 +116,19 @@ class LohaDiff(WeightAdapterTrainBase): scale = self.alpha / self.rank if self.use_tucker: - diff_weight = HadaWeightTucker.apply(self.hada_t1, self.hada_w1_a, self.hada_w1_b, self.hada_t2, self.hada_w2_a, self.hada_w2_b, scale) + diff_weight = HadaWeightTucker.apply( + self.hada_t1, + self.hada_w1_a, + self.hada_w1_b, + self.hada_t2, + self.hada_w2_a, + self.hada_w2_b, + scale, + ) else: - diff_weight = HadaWeight.apply(self.hada_w1_a, self.hada_w1_b, self.hada_w2_a, self.hada_w2_b, scale) + diff_weight = HadaWeight.apply( + self.hada_w1_a, self.hada_w1_b, self.hada_w2_a, self.hada_w2_b, scale + ) # Add the scaled difference to the original weight weight = w.to(diff_weight) + diff_weight.reshape(w.shape) @@ -138,9 +159,7 @@ class LoHaAdapter(WeightAdapterBase): mat4 = torch.empty(rank, in_dim, device=weight.device, dtype=torch.float32) torch.nn.init.normal_(mat3, 0.1) torch.nn.init.normal_(mat4, 0.01) - return LohaDiff( - (mat1, mat2, alpha, mat3, mat4, None, None, None) - ) + return LohaDiff((mat1, mat2, alpha, mat3, mat4, None, None, None)) def to_train(self): return LohaDiff(self.weights) @@ -172,7 +191,16 @@ class LoHaAdapter(WeightAdapterBase): loaded_keys.add(hada_t1_name) loaded_keys.add(hada_t2_name) - weights = (lora[hada_w1_a_name], lora[hada_w1_b_name], alpha, lora[hada_w2_a_name], lora[hada_w2_b_name], hada_t1, hada_t2, dora_scale) + weights = ( + lora[hada_w1_a_name], + lora[hada_w1_b_name], + alpha, + lora[hada_w2_a_name], + lora[hada_w2_b_name], + hada_t1, + hada_t2, + dora_scale, + ) loaded_keys.add(hada_w1_a_name) loaded_keys.add(hada_w1_b_name) loaded_keys.add(hada_w2_a_name) @@ -203,30 +231,148 @@ class LoHaAdapter(WeightAdapterBase): w2a = v[3] w2b = v[4] dora_scale = v[7] - if v[5] is not None: #cp decomposition + if v[5] is not None: # cp decomposition t1 = v[5] t2 = v[6] - m1 = torch.einsum('i j k l, j r, i p -> p r k l', - comfy.model_management.cast_to_device(t1, weight.device, intermediate_dtype), - comfy.model_management.cast_to_device(w1b, weight.device, intermediate_dtype), - comfy.model_management.cast_to_device(w1a, weight.device, intermediate_dtype)) + m1 = torch.einsum( + "i j k l, j r, i p -> p r k l", + comfy.model_management.cast_to_device( + t1, weight.device, intermediate_dtype + ), + comfy.model_management.cast_to_device( + w1b, weight.device, intermediate_dtype + ), + comfy.model_management.cast_to_device( + w1a, weight.device, intermediate_dtype + ), + ) - m2 = torch.einsum('i j k l, j r, i p -> p r k l', - comfy.model_management.cast_to_device(t2, weight.device, intermediate_dtype), - comfy.model_management.cast_to_device(w2b, weight.device, intermediate_dtype), - comfy.model_management.cast_to_device(w2a, weight.device, intermediate_dtype)) + m2 = torch.einsum( + "i j k l, j r, i p -> p r k l", + comfy.model_management.cast_to_device( + t2, weight.device, intermediate_dtype + ), + comfy.model_management.cast_to_device( + w2b, weight.device, intermediate_dtype + ), + comfy.model_management.cast_to_device( + w2a, weight.device, intermediate_dtype + ), + ) else: - m1 = torch.mm(comfy.model_management.cast_to_device(w1a, weight.device, intermediate_dtype), - comfy.model_management.cast_to_device(w1b, weight.device, intermediate_dtype)) - m2 = torch.mm(comfy.model_management.cast_to_device(w2a, weight.device, intermediate_dtype), - comfy.model_management.cast_to_device(w2b, weight.device, intermediate_dtype)) + m1 = torch.mm( + comfy.model_management.cast_to_device( + w1a, weight.device, intermediate_dtype + ), + comfy.model_management.cast_to_device( + w1b, weight.device, intermediate_dtype + ), + ) + m2 = torch.mm( + comfy.model_management.cast_to_device( + w2a, weight.device, intermediate_dtype + ), + comfy.model_management.cast_to_device( + w2b, weight.device, intermediate_dtype + ), + ) try: lora_diff = (m1 * m2).reshape(weight.shape) if dora_scale is not None: - weight = weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function) + weight = weight_decompose( + dora_scale, + weight, + lora_diff, + alpha, + strength, + intermediate_dtype, + function, + ) else: weight += function(((strength * alpha) * lora_diff).type(weight.dtype)) except Exception as e: logging.error("ERROR {} {} {}".format(self.name, key, e)) return weight + + def h(self, x: torch.Tensor, base_out: torch.Tensor) -> torch.Tensor: + """ + Additive bypass component for LoHa: h(x) = diff_weight @ x + + WARNING: Inefficient - computes full Hadamard product each forward. + + Note: + Does not access original model weights - bypass mode is designed + for quantized models where weights may not be accessible. + + Args: + x: Input tensor + base_out: Output from base forward (unused, for API consistency) + + Reference: LyCORIS functional/loha.py bypass_forward_diff + """ + _warn_loha_bypass_inefficient() + + # FUNC_LIST: [None, None, F.linear, F.conv1d, F.conv2d, F.conv3d] + FUNC_LIST = [None, None, F.linear, F.conv1d, F.conv2d, F.conv3d] + + v = self.weights + # v[0]=w1a, v[1]=w1b, v[2]=alpha, v[3]=w2a, v[4]=w2b, v[5]=t1, v[6]=t2, v[7]=dora + w1a = v[0] + w1b = v[1] + alpha = v[2] + w2a = v[3] + w2b = v[4] + t1 = v[5] + t2 = v[6] + + # Compute scale + rank = w1b.shape[0] + scale = (alpha / rank if alpha is not None else 1.0) * getattr( + self, "multiplier", 1.0 + ) + + # Cast dtype + w1a = w1a.to(dtype=x.dtype) + w1b = w1b.to(dtype=x.dtype) + w2a = w2a.to(dtype=x.dtype) + w2b = w2b.to(dtype=x.dtype) + + # Use module info from bypass injection, not weight dimension + is_conv = getattr(self, "is_conv", False) + conv_dim = getattr(self, "conv_dim", 0) + kw_dict = getattr(self, "kw_dict", {}) + + # Compute diff weight using Hadamard product + if t1 is not None and t2 is not None: + t1 = t1.to(dtype=x.dtype) + t2 = t2.to(dtype=x.dtype) + m1 = torch.einsum("i j k l, j r, i p -> p r k l", t1, w1b, w1a) + m2 = torch.einsum("i j k l, j r, i p -> p r k l", t2, w2b, w2a) + diff_weight = (m1 * m2) * scale + else: + m1 = w1a @ w1b + m2 = w2a @ w2b + diff_weight = (m1 * m2) * scale + + if is_conv: + op = FUNC_LIST[conv_dim + 2] + kernel_size = getattr(self, "kernel_size", (1,) * conv_dim) + in_channels = getattr(self, "in_channels", None) + + # Reshape 2D diff_weight to conv format using kernel_size + # diff_weight: [out_channels, in_channels * prod(kernel_size)] -> [out_channels, in_channels, *kernel_size] + if diff_weight.dim() == 2: + if in_channels is not None: + diff_weight = diff_weight.view( + diff_weight.shape[0], in_channels, *kernel_size + ) + else: + diff_weight = diff_weight.view( + *diff_weight.shape, *([1] * conv_dim) + ) + else: + op = F.linear + kw_dict = {} + + return op(x, diff_weight, **kw_dict) diff --git a/comfy/weight_adapter/lokr.py b/comfy/weight_adapter/lokr.py index 9b2aff2d7..b83750012 100644 --- a/comfy/weight_adapter/lokr.py +++ b/comfy/weight_adapter/lokr.py @@ -2,6 +2,7 @@ import logging from typing import Optional import torch +import torch.nn.functional as F import comfy.model_management from .base import ( WeightAdapterBase, @@ -14,7 +15,17 @@ from .base import ( class LokrDiff(WeightAdapterTrainBase): def __init__(self, weights): super().__init__() - (lokr_w1, lokr_w2, alpha, lokr_w1_a, lokr_w1_b, lokr_w2_a, lokr_w2_b, lokr_t2, dora_scale) = weights + ( + lokr_w1, + lokr_w2, + alpha, + lokr_w1_a, + lokr_w1_b, + lokr_w2_a, + lokr_w2_b, + lokr_t2, + dora_scale, + ) = weights self.use_tucker = False if lokr_w1_a is not None: _, rank_a = lokr_w1_a.shape[0], lokr_w1_a.shape[1] @@ -57,10 +68,10 @@ class LokrDiff(WeightAdapterTrainBase): if self.w2_rebuild: if self.use_tucker: w2 = torch.einsum( - 'i j k l, j r, i p -> p r k l', + "i j k l, j r, i p -> p r k l", self.lokr_t2, self.lokr_w2_b, - self.lokr_w2_a + self.lokr_w2_a, ) else: w2 = self.lokr_w2_a @ self.lokr_w2_b @@ -69,9 +80,89 @@ class LokrDiff(WeightAdapterTrainBase): return self.lokr_w2 def __call__(self, w): - diff = torch.kron(self.w1, self.w2) + w1 = self.w1 + w2 = self.w2 + # Unsqueeze w1 to match w2 dims for proper kron product (like LyCORIS make_kron) + for _ in range(w2.dim() - w1.dim()): + w1 = w1.unsqueeze(-1) + diff = torch.kron(w1, w2) return w + diff.reshape(w.shape).to(w) + def h(self, x: torch.Tensor, base_out: torch.Tensor) -> torch.Tensor: + """ + Additive bypass component for LoKr training: efficient Kronecker product. + + Uses w1/w2 properties which handle both direct and decomposed cases. + For create_train (direct w1/w2), no alpha scaling in properties. + For to_train (decomposed), alpha/rank scaling is in properties. + + Args: + x: Input tensor + base_out: Output from base forward (unused, for API consistency) + """ + # Get w1, w2 from properties (handles rebuild vs direct) + w1 = self.w1 + w2 = self.w2 + + # Multiplier from bypass injection + multiplier = getattr(self, "multiplier", 1.0) + + # Get module info from bypass injection + is_conv = getattr(self, "is_conv", False) + conv_dim = getattr(self, "conv_dim", 0) + kw_dict = getattr(self, "kw_dict", {}) + + # Efficient Kronecker application without materializing full weight + # kron(w1, w2) @ x can be computed as nested operations + # w1: [out_l, in_m], w2: [out_k, in_n, *k_size] + # Full weight would be [out_l*out_k, in_m*in_n, *k_size] + + uq = w1.size(1) # in_m - inner grouping dimension + + if is_conv: + conv_fn = (F.conv1d, F.conv2d, F.conv3d)[conv_dim - 1] + + B, C_in, *spatial = x.shape + # Reshape input for grouped application: [B * uq, C_in // uq, *spatial] + h_in_group = x.reshape(B * uq, -1, *spatial) + + # Ensure w2 has conv dims + if w2.dim() == 2: + w2 = w2.view(*w2.shape, *([1] * conv_dim)) + + # Apply w2 path with stride/padding + hb = conv_fn(h_in_group, w2, **kw_dict) + + # Reshape for cross-group operation + hb = hb.view(B, -1, *hb.shape[1:]) + h_cross = hb.transpose(1, -1) + + # Apply w1 (always 2D, applied as linear on channel dim) + hc = F.linear(h_cross, w1) + hc = hc.transpose(1, -1) + + # Reshape to output + out = hc.reshape(B, -1, *hc.shape[3:]) + else: + # Linear case + # Reshape input: [..., in_m * in_n] -> [..., uq (in_m), in_n] + h_in_group = x.reshape(*x.shape[:-1], uq, -1) + + # Apply w2: [..., uq, in_n] @ [out_k, in_n].T -> [..., uq, out_k] + hb = F.linear(h_in_group, w2) + + # Transpose for w1: [..., uq, out_k] -> [..., out_k, uq] + h_cross = hb.transpose(-1, -2) + + # Apply w1: [..., out_k, uq] @ [out_l, uq].T -> [..., out_k, out_l] + hc = F.linear(h_cross, w1) + + # Transpose back and flatten: [..., out_k, out_l] -> [..., out_l * out_k] + hc = hc.transpose(-1, -2) + out = hc.reshape(*hc.shape[:-2], -1) + + return out * multiplier + def passive_memory_usage(self): return sum(param.numel() * param.element_size() for param in self.parameters()) @@ -86,16 +177,22 @@ class LoKrAdapter(WeightAdapterBase): @classmethod def create_train(cls, weight, rank=1, alpha=1.0): out_dim = weight.shape[0] - in_dim = weight.shape[1:].numel() - out1, out2 = factorization(out_dim, rank) - in1, in2 = factorization(in_dim, rank) - mat1 = torch.empty(out1, in1, device=weight.device, dtype=torch.float32) - mat2 = torch.empty(out2, in2, device=weight.device, dtype=torch.float32) + in_dim = weight.shape[1] # Just in_channels, not flattened with kernel + k_size = weight.shape[2:] if weight.dim() > 2 else () + + out_l, out_k = factorization(out_dim, rank) + in_m, in_n = factorization(in_dim, rank) + + # w1: [out_l, in_m] + mat1 = torch.empty(out_l, in_m, device=weight.device, dtype=torch.float32) + # w2: [out_k, in_n, *k_size] for conv, [out_k, in_n] for linear + mat2 = torch.empty( + out_k, in_n, *k_size, device=weight.device, dtype=torch.float32 + ) + torch.nn.init.kaiming_uniform_(mat2, a=5**0.5) torch.nn.init.constant_(mat1, 0.0) - return LokrDiff( - (mat1, mat2, alpha, None, None, None, None, None, None) - ) + return LokrDiff((mat1, mat2, alpha, None, None, None, None, None, None)) def to_train(self): return LokrDiff(self.weights) @@ -154,8 +251,23 @@ class LoKrAdapter(WeightAdapterBase): lokr_t2 = lora[lokr_t2_name] loaded_keys.add(lokr_t2_name) - if (lokr_w1 is not None) or (lokr_w2 is not None) or (lokr_w1_a is not None) or (lokr_w2_a is not None): - weights = (lokr_w1, lokr_w2, alpha, lokr_w1_a, lokr_w1_b, lokr_w2_a, lokr_w2_b, lokr_t2, dora_scale) + if ( + (lokr_w1 is not None) + or (lokr_w2 is not None) + or (lokr_w1_a is not None) + or (lokr_w2_a is not None) + ): + weights = ( + lokr_w1, + lokr_w2, + alpha, + lokr_w1_a, + lokr_w1_b, + lokr_w2_a, + lokr_w2_b, + lokr_t2, + dora_scale, + ) return cls(loaded_keys, weights) else: return None @@ -184,23 +296,47 @@ class LoKrAdapter(WeightAdapterBase): if w1 is None: dim = w1_b.shape[0] - w1 = torch.mm(comfy.model_management.cast_to_device(w1_a, weight.device, intermediate_dtype), - comfy.model_management.cast_to_device(w1_b, weight.device, intermediate_dtype)) + w1 = torch.mm( + comfy.model_management.cast_to_device( + w1_a, weight.device, intermediate_dtype + ), + comfy.model_management.cast_to_device( + w1_b, weight.device, intermediate_dtype + ), + ) else: - w1 = comfy.model_management.cast_to_device(w1, weight.device, intermediate_dtype) + w1 = comfy.model_management.cast_to_device( + w1, weight.device, intermediate_dtype + ) if w2 is None: dim = w2_b.shape[0] if t2 is None: - w2 = torch.mm(comfy.model_management.cast_to_device(w2_a, weight.device, intermediate_dtype), - comfy.model_management.cast_to_device(w2_b, weight.device, intermediate_dtype)) + w2 = torch.mm( + comfy.model_management.cast_to_device( + w2_a, weight.device, intermediate_dtype + ), + comfy.model_management.cast_to_device( + w2_b, weight.device, intermediate_dtype + ), + ) else: - w2 = torch.einsum('i j k l, j r, i p -> p r k l', - comfy.model_management.cast_to_device(t2, weight.device, intermediate_dtype), - comfy.model_management.cast_to_device(w2_b, weight.device, intermediate_dtype), - comfy.model_management.cast_to_device(w2_a, weight.device, intermediate_dtype)) + w2 = torch.einsum( + "i j k l, j r, i p -> p r k l", + comfy.model_management.cast_to_device( + t2, weight.device, intermediate_dtype + ), + comfy.model_management.cast_to_device( + w2_b, weight.device, intermediate_dtype + ), + comfy.model_management.cast_to_device( + w2_a, weight.device, intermediate_dtype + ), + ) else: - w2 = comfy.model_management.cast_to_device(w2, weight.device, intermediate_dtype) + w2 = comfy.model_management.cast_to_device( + w2, weight.device, intermediate_dtype + ) if len(w2.shape) == 4: w1 = w1.unsqueeze(2).unsqueeze(2) @@ -212,9 +348,134 @@ class LoKrAdapter(WeightAdapterBase): try: lora_diff = torch.kron(w1, w2).reshape(weight.shape) if dora_scale is not None: - weight = weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function) + weight = weight_decompose( + dora_scale, + weight, + lora_diff, + alpha, + strength, + intermediate_dtype, + function, + ) else: weight += function(((strength * alpha) * lora_diff).type(weight.dtype)) except Exception as e: logging.error("ERROR {} {} {}".format(self.name, key, e)) return weight + + def h(self, x: torch.Tensor, base_out: torch.Tensor) -> torch.Tensor: + """ + Additive bypass component for LoKr: efficient Kronecker product application. + + Note: + Does not access original model weights - bypass mode is designed + for quantized models where weights may not be accessible. + + Args: + x: Input tensor + base_out: Output from base forward (unused, for API consistency) + + Reference: LyCORIS functional/lokr.py bypass_forward_diff + """ + # FUNC_LIST: [None, None, F.linear, F.conv1d, F.conv2d, F.conv3d] + FUNC_LIST = [None, None, F.linear, F.conv1d, F.conv2d, F.conv3d] + + v = self.weights + # v[0]=w1, v[1]=w2, v[2]=alpha, v[3]=w1_a, v[4]=w1_b, v[5]=w2_a, v[6]=w2_b, v[7]=t2, v[8]=dora + w1 = v[0] + w2 = v[1] + alpha = v[2] + w1_a = v[3] + w1_b = v[4] + w2_a = v[5] + w2_b = v[6] + t2 = v[7] + + use_w1 = w1 is not None + use_w2 = w2 is not None + tucker = t2 is not None + + # Use module info from bypass injection, not weight dimension + is_conv = getattr(self, "is_conv", False) + conv_dim = getattr(self, "conv_dim", 0) + kw_dict = getattr(self, "kw_dict", {}) if is_conv else {} + + if is_conv: + op = FUNC_LIST[conv_dim + 2] + else: + op = F.linear + + # Determine rank and scale + rank = w1_b.size(0) if not use_w1 else w2_b.size(0) if not use_w2 else alpha + scale = (alpha / rank if alpha is not None else 1.0) * getattr( + self, "multiplier", 1.0 + ) + + # Build c (w1) + if use_w1: + c = w1.to(dtype=x.dtype) + else: + c = w1_a.to(dtype=x.dtype) @ w1_b.to(dtype=x.dtype) + uq = c.size(1) + + # Build w2 components + if use_w2: + ba = w2.to(dtype=x.dtype) + else: + a = w2_b.to(dtype=x.dtype) + b = w2_a.to(dtype=x.dtype) + if is_conv: + if tucker: + # Tucker: a, b get 1s appended (kernel is in t2) + if a.dim() == 2: + a = a.view(*a.shape, *([1] * conv_dim)) + if b.dim() == 2: + b = b.view(*b.shape, *([1] * conv_dim)) + else: + # Non-tucker conv: b may need 1s appended + if b.dim() == 2: + b = b.view(*b.shape, *([1] * conv_dim)) + + # Reshape input by uq groups + if is_conv: + B, _, *rest = x.shape + h_in_group = x.reshape(B * uq, -1, *rest) + else: + h_in_group = x.reshape(*x.shape[:-1], uq, -1) + + # Apply w2 path + if use_w2: + hb = op(h_in_group, ba, **kw_dict) + else: + if is_conv: + if tucker: + t = t2.to(dtype=x.dtype) + if t.dim() == 2: + t = t.view(*t.shape, *([1] * conv_dim)) + ha = op(h_in_group, a) + ht = op(ha, t, **kw_dict) + hb = op(ht, b) + else: + ha = op(h_in_group, a, **kw_dict) + hb = op(ha, b) + else: + ha = op(h_in_group, a) + hb = op(ha, b) + + # Reshape and apply c (w1) + if is_conv: + hb = hb.view(B, -1, *hb.shape[1:]) + h_cross_group = hb.transpose(1, -1) + else: + h_cross_group = hb.transpose(-1, -2) + + hc = F.linear(h_cross_group, c) + + if is_conv: + hc = hc.transpose(1, -1) + out = hc.reshape(B, -1, *hc.shape[3:]) + else: + hc = hc.transpose(-1, -2) + out = hc.reshape(*hc.shape[:-2], -1) + + return out * scale diff --git a/comfy/weight_adapter/lora.py b/comfy/weight_adapter/lora.py index 3cc60bb1b..bc4260a8f 100644 --- a/comfy/weight_adapter/lora.py +++ b/comfy/weight_adapter/lora.py @@ -2,6 +2,7 @@ import logging from typing import Optional import torch +import torch.nn.functional as F import comfy.model_management from .base import ( WeightAdapterBase, @@ -20,11 +21,7 @@ class LoraDiff(WeightAdapterTrainBase): rank, in_dim = mat2.shape[0], mat2.shape[1] if mid is not None: convdim = mid.ndim - 2 - layer = ( - torch.nn.Conv1d, - torch.nn.Conv2d, - torch.nn.Conv3d - )[convdim] + layer = (torch.nn.Conv1d, torch.nn.Conv2d, torch.nn.Conv3d)[convdim] else: layer = torch.nn.Linear self.lora_up = layer(rank, out_dim, bias=False) @@ -51,6 +48,78 @@ class LoraDiff(WeightAdapterTrainBase): weight = w + scale * diff.reshape(w.shape) return weight.to(org_dtype) + def h(self, x: torch.Tensor, base_out: torch.Tensor) -> torch.Tensor: + """ + Additive bypass component for LoRA training: h(x) = up(down(x)) * scale + + Simple implementation using the nn.Module weights directly. + No mid/dora/reshape branches (create_train doesn't create them). + + Args: + x: Input tensor + base_out: Output from base forward (unused, for API consistency) + """ + # Compute scale = alpha / rank * multiplier + scale = (self.alpha / self.rank) * getattr(self, "multiplier", 1.0) + + # Get module info from bypass injection + is_conv = getattr(self, "is_conv", False) + conv_dim = getattr(self, "conv_dim", 0) + kw_dict = getattr(self, "kw_dict", {}) + + # Get weights (keep in original dtype for numerical stability) + down_weight = self.lora_down.weight + up_weight = self.lora_up.weight + + if is_conv: + # Conv path: use functional conv + # conv_dim: 1=conv1d, 2=conv2d, 3=conv3d + conv_fn = (F.conv1d, F.conv2d, F.conv3d)[conv_dim - 1] + + # Reshape 2D weights to conv format if needed + # down: [rank, in_features] -> [rank, in_channels, *kernel_size] + # up: [out_features, rank] -> [out_features, rank, 1, 1, ...] + if down_weight.dim() == 2: + kernel_size = getattr(self, "kernel_size", (1,) * conv_dim) + in_channels = getattr(self, "in_channels", None) + if in_channels is not None: + down_weight = down_weight.view( + down_weight.shape[0], in_channels, *kernel_size + ) + else: + # Fallback: assume 1x1 kernel + down_weight = down_weight.view( + *down_weight.shape, *([1] * conv_dim) + ) + if up_weight.dim() == 2: + # up always uses 1x1 kernel + up_weight = up_weight.view(*up_weight.shape, *([1] * conv_dim)) + + # down conv uses stride/padding from module, up is 1x1 + hidden = conv_fn(x, down_weight, **kw_dict) + + # mid layer if exists (tucker decomposition) + if self.lora_mid is not None: + mid_weight = self.lora_mid.weight + if mid_weight.dim() == 2: + mid_weight = mid_weight.view(*mid_weight.shape, *([1] * conv_dim)) + hidden = conv_fn(hidden, mid_weight) + + # up conv is always 1x1 (no stride/padding) + out = conv_fn(hidden, up_weight) + else: + # Linear path: simple matmul chain + hidden = F.linear(x, down_weight) + + # mid layer if exists + if self.lora_mid is not None: + mid_weight = self.lora_mid.weight + hidden = F.linear(hidden, mid_weight) + + out = F.linear(hidden, up_weight) + + return out * scale + def passive_memory_usage(self): return sum(param.numel() * param.element_size() for param in self.parameters()) @@ -70,9 +139,7 @@ class LoRAAdapter(WeightAdapterBase): mat2 = torch.empty(rank, in_dim, device=weight.device, dtype=torch.float32) torch.nn.init.kaiming_uniform_(mat1, a=5**0.5) torch.nn.init.constant_(mat2, 0.0) - return LoraDiff( - (mat1, mat2, alpha, None, None, None) - ) + return LoraDiff((mat1, mat2, alpha, None, None, None)) def to_train(self): return LoraDiff(self.weights) @@ -210,3 +277,85 @@ class LoRAAdapter(WeightAdapterBase): except Exception as e: logging.error("ERROR {} {} {}".format(self.name, key, e)) return weight + + def h(self, x: torch.Tensor, base_out: torch.Tensor) -> torch.Tensor: + """ + Additive bypass component for LoRA: h(x) = up(down(x)) * scale + + Note: + Does not access original model weights - bypass mode is designed + for quantized models where weights may not be accessible. + + Args: + x: Input tensor + base_out: Output from base forward (unused, for API consistency) + + Reference: LyCORIS functional/locon.py bypass_forward_diff + """ + # FUNC_LIST: [None, None, F.linear, F.conv1d, F.conv2d, F.conv3d] + FUNC_LIST = [None, None, F.linear, F.conv1d, F.conv2d, F.conv3d] + + v = self.weights + # v[0]=up, v[1]=down, v[2]=alpha, v[3]=mid, v[4]=dora_scale, v[5]=reshape + up = v[0] + down = v[1] + alpha = v[2] + mid = v[3] + + # Compute scale = alpha / rank + rank = down.shape[0] + if alpha is not None: + scale = alpha / rank + else: + scale = 1.0 + scale = scale * getattr(self, "multiplier", 1.0) + + # Cast dtype + up = up.to(dtype=x.dtype) + down = down.to(dtype=x.dtype) + + # Use module info from bypass injection, not weight dimension + is_conv = getattr(self, "is_conv", False) + conv_dim = getattr(self, "conv_dim", 0) + kw_dict = getattr(self, "kw_dict", {}) + + if is_conv: + op = FUNC_LIST[ + conv_dim + 2 + ] # conv_dim 1->conv1d(3), 2->conv2d(4), 3->conv3d(5) + kernel_size = getattr(self, "kernel_size", (1,) * conv_dim) + in_channels = getattr(self, "in_channels", None) + + # Reshape 2D weights to conv format using kernel_size + # down: [rank, in_channels * prod(kernel_size)] -> [rank, in_channels, *kernel_size] + # up: [out_channels, rank] -> [out_channels, rank, 1, 1, ...] (1x1 kernel) + if down.dim() == 2: + # down.shape[1] = in_channels * prod(kernel_size) + if in_channels is not None: + down = down.view(down.shape[0], in_channels, *kernel_size) + else: + # Fallback: assume 1x1 kernel if in_channels unknown + down = down.view(*down.shape, *([1] * conv_dim)) + if up.dim() == 2: + # up always uses 1x1 kernel + up = up.view(*up.shape, *([1] * conv_dim)) + if mid is not None: + mid = mid.to(dtype=x.dtype) + if mid.dim() == 2: + mid = mid.view(*mid.shape, *([1] * conv_dim)) + else: + op = F.linear + kw_dict = {} # linear doesn't take stride/padding + + # Simple chain: down -> mid (if tucker) -> up + if mid is not None: + if not is_conv: + mid = mid.to(dtype=x.dtype) + hidden = op(x, down) + hidden = op(hidden, mid, **kw_dict) + out = op(hidden, up) + else: + hidden = op(x, down, **kw_dict) + out = op(hidden, up) + + return out * scale diff --git a/comfy/weight_adapter/oft.py b/comfy/weight_adapter/oft.py index c0aab9635..bc83cf8e8 100644 --- a/comfy/weight_adapter/oft.py +++ b/comfy/weight_adapter/oft.py @@ -3,13 +3,18 @@ from typing import Optional import torch import comfy.model_management -from .base import WeightAdapterBase, WeightAdapterTrainBase, weight_decompose, factorization +from .base import ( + WeightAdapterBase, + WeightAdapterTrainBase, + weight_decompose, + factorization, +) class OFTDiff(WeightAdapterTrainBase): def __init__(self, weights): super().__init__() - # Unpack weights tuple from LoHaAdapter + # Unpack weights tuple from OFTAdapter blocks, rescale, alpha, _ = weights # Create trainable parameters @@ -52,6 +57,78 @@ class OFTDiff(WeightAdapterTrainBase): weight = self.rescale * weight return weight.to(org_dtype) + def _get_orthogonal_matrix(self, device, dtype): + """Compute the orthogonal rotation matrix R from OFT blocks.""" + blocks = self.oft_blocks.to(device=device, dtype=dtype) + I = torch.eye(self.block_size, device=device, dtype=dtype) + + # Q = blocks - blocks^T (skew-symmetric) + q = blocks - blocks.transpose(1, 2) + normed_q = q + + # Apply constraint if set + if self.constraint: + q_norm = torch.norm(q) + 1e-8 + if q_norm > self.constraint: + normed_q = q * self.constraint / q_norm + + # Cayley transform: R = (I + Q)(I - Q)^-1 + r = (I + normed_q) @ (I - normed_q).float().inverse() + return r.to(dtype) + + def h(self, x: torch.Tensor, base_out: torch.Tensor) -> torch.Tensor: + """ + OFT has no additive component - returns zeros matching base_out shape. + + OFT only transforms the output via g(), it doesn't add to it. + """ + return torch.zeros_like(base_out) + + def g(self, y: torch.Tensor) -> torch.Tensor: + """ + Output transformation for OFT: applies orthogonal rotation. + + OFT transforms output channels using block-diagonal orthogonal matrices. + """ + r = self._get_orthogonal_matrix(y.device, y.dtype) + + # Apply multiplier to interpolate between identity and full transform + multiplier = getattr(self, "multiplier", 1.0) + I = torch.eye(self.block_size, device=y.device, dtype=y.dtype) + r = r * multiplier + (1 - multiplier) * I + + # Use module info from bypass injection + is_conv = getattr(self, "is_conv", y.dim() > 2) + + if is_conv: + # Conv output: (N, C, H, W, ...) -> transpose to (N, H, W, ..., C) + y = y.transpose(1, -1) + + # y now has channels in last dim + *batch_shape, out_features = y.shape + + # Reshape to apply block-diagonal transform + # (*, out_features) -> (*, block_num, block_size) + y_blocked = y.reshape(*batch_shape, self.block_num, self.block_size) + + # Apply orthogonal transform: R @ y for each block + # r: (block_num, block_size, block_size), y_blocked: (*, block_num, block_size) + out_blocked = torch.einsum("k n m, ... k n -> ... k m", r, y_blocked) + + # Reshape back: (*, block_num, block_size) -> (*, out_features) + out = out_blocked.reshape(*batch_shape, out_features) + + # Apply rescale if present + if self.rescaled: + rescale = self.rescale.to(device=y.device, dtype=y.dtype) + out = out * rescale.view(-1) + + if is_conv: + # Transpose back: (N, H, W, ..., C) -> (N, C, H, W, ...) + out = out.transpose(1, -1) + + return out + def passive_memory_usage(self): """Calculates memory usage of the trainable parameters.""" return sum(param.numel() * param.element_size() for param in self.parameters()) @@ -68,10 +145,10 @@ class OFTAdapter(WeightAdapterBase): def create_train(cls, weight, rank=1, alpha=1.0): out_dim = weight.shape[0] block_size, block_num = factorization(out_dim, rank) - block = torch.zeros(block_num, block_size, block_size, device=weight.device, dtype=torch.float32) - return OFTDiff( - (block, None, alpha, None) + block = torch.zeros( + block_num, block_size, block_size, device=weight.device, dtype=torch.float32 ) + return OFTDiff((block, None, alpha, None)) def to_train(self): return OFTDiff(self.weights) @@ -127,9 +204,13 @@ class OFTAdapter(WeightAdapterBase): alpha = 0 dora_scale = v[3] - blocks = comfy.model_management.cast_to_device(blocks, weight.device, intermediate_dtype) + blocks = comfy.model_management.cast_to_device( + blocks, weight.device, intermediate_dtype + ) if rescale is not None: - rescale = comfy.model_management.cast_to_device(rescale, weight.device, intermediate_dtype) + rescale = comfy.model_management.cast_to_device( + rescale, weight.device, intermediate_dtype + ) block_num, block_size, *_ = blocks.shape @@ -139,23 +220,108 @@ class OFTAdapter(WeightAdapterBase): # for Q = -Q^T q = blocks - blocks.transpose(1, 2) normed_q = q - if alpha > 0: # alpha in oft/boft is for constraint + if alpha > 0: # alpha in oft/boft is for constraint q_norm = torch.norm(q) + 1e-8 if q_norm > alpha: normed_q = q * alpha / q_norm # use float() to prevent unsupported type in .inverse() r = (I + normed_q) @ (I - normed_q).float().inverse() r = r.to(weight) + # Create I in weight's dtype for the einsum + I_w = torch.eye(block_size, device=weight.device, dtype=weight.dtype) _, *shape = weight.shape lora_diff = torch.einsum( "k n m, k n ... -> k m ...", - (r * strength) - strength * I, + (r * strength) - strength * I_w, weight.view(block_num, block_size, *shape), ).view(-1, *shape) if dora_scale is not None: - weight = weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function) + weight = weight_decompose( + dora_scale, + weight, + lora_diff, + alpha, + strength, + intermediate_dtype, + function, + ) else: weight += function((strength * lora_diff).type(weight.dtype)) except Exception as e: logging.error("ERROR {} {} {}".format(self.name, key, e)) return weight + + def _get_orthogonal_matrix(self, device, dtype): + """Compute the orthogonal rotation matrix R from OFT blocks.""" + v = self.weights + blocks = v[0].to(device=device, dtype=dtype) + alpha = v[2] + if alpha is None: + alpha = 0 + + block_num, block_size, _ = blocks.shape + I = torch.eye(block_size, device=device, dtype=dtype) + + # Q = blocks - blocks^T (skew-symmetric) + q = blocks - blocks.transpose(1, 2) + normed_q = q + + # Apply constraint if alpha > 0 + if alpha > 0: + q_norm = torch.norm(q) + 1e-8 + if q_norm > alpha: + normed_q = q * alpha / q_norm + + # Cayley transform: R = (I + Q)(I - Q)^-1 + r = (I + normed_q) @ (I - normed_q).float().inverse() + return r, block_num, block_size + + def g(self, y: torch.Tensor) -> torch.Tensor: + """ + Output transformation for OFT: applies orthogonal rotation to output. + + OFT transforms the output channels using block-diagonal orthogonal matrices. + + Reference: LyCORIS DiagOFTModule._bypass_forward + """ + v = self.weights + rescale = v[1] + + r, block_num, block_size = self._get_orthogonal_matrix(y.device, y.dtype) + + # Apply multiplier to interpolate between identity and full transform + multiplier = getattr(self, "multiplier", 1.0) + I = torch.eye(block_size, device=y.device, dtype=y.dtype) + r = r * multiplier + (1 - multiplier) * I + + # Use module info from bypass injection to determine conv vs linear + is_conv = getattr(self, "is_conv", y.dim() > 2) + + if is_conv: + # Conv output: (N, C, H, W, ...) -> transpose to (N, H, W, ..., C) + y = y.transpose(1, -1) + + # y now has channels in last dim + *batch_shape, out_features = y.shape + + # Reshape to apply block-diagonal transform + # (*, out_features) -> (*, block_num, block_size) + y_blocked = y.view(*batch_shape, block_num, block_size) + + # Apply orthogonal transform: R @ y for each block + # r: (block_num, block_size, block_size), y_blocked: (*, block_num, block_size) + out_blocked = torch.einsum("k n m, ... k n -> ... k m", r, y_blocked) + + # Reshape back: (*, block_num, block_size) -> (*, out_features) + out = out_blocked.view(*batch_shape, out_features) + + # Apply rescale if present + if rescale is not None: + rescale = rescale.to(device=y.device, dtype=y.dtype) + out = out * rescale.view(-1) + + if is_conv: + # Transpose back: (N, H, W, ..., C) -> (N, C, H, W, ...) + out = out.transpose(1, -1) + + return out diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py index a60020ca8..eeea9781a 100644 --- a/comfy_api/latest/_io.py +++ b/comfy_api/latest/_io.py @@ -754,7 +754,7 @@ class AnyType(ComfyTypeIO): Type = Any @comfytype(io_type="MODEL_PATCH") -class MODEL_PATCH(ComfyTypeIO): +class ModelPatch(ComfyTypeIO): Type = Any @comfytype(io_type="AUDIO_ENCODER") @@ -1146,6 +1146,20 @@ class ImageCompare(ComfyTypeI): def as_dict(self): return super().as_dict() + +@comfytype(io_type="COLOR") +class Color(ComfyTypeIO): + Type = str + + class Input(WidgetInput): + def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None, + socketless: bool=True, advanced: bool=None, default: str="#ffffff"): + super().__init__(id, display_name, optional, tooltip, None, default, socketless, None, None, None, None, advanced) + self.default: str + + def as_dict(self): + return super().as_dict() + DYNAMIC_INPUT_LOOKUP: dict[str, Callable[[dict[str, Any], dict[str, Any], tuple[str, dict[str, Any]], str, list[str] | None], None]] = {} def register_dynamic_input_func(io_type: str, func: Callable[[dict[str, Any], dict[str, Any], tuple[str, dict[str, Any]], str, list[str] | None], None]): DYNAMIC_INPUT_LOOKUP[io_type] = func @@ -1234,6 +1248,7 @@ class Hidden(str, Enum): class NodeInfoV1: input: dict=None input_order: dict[str, list[str]]=None + is_input_list: bool=None output: list[str]=None output_is_list: list[bool]=None output_name: list[str]=None @@ -1247,26 +1262,11 @@ class NodeInfoV1: output_node: bool=None deprecated: bool=None experimental: bool=None + dev_only: bool=None api_node: bool=None price_badge: dict | None = None search_aliases: list[str]=None -@dataclass -class NodeInfoV3: - input: dict=None - output: dict=None - hidden: list[str]=None - name: str=None - display_name: str=None - description: str=None - python_module: Any = None - category: str=None - output_node: bool=None - deprecated: bool=None - experimental: bool=None - api_node: bool=None - price_badge: dict | None = None - @dataclass class PriceBadgeDepends: @@ -1375,6 +1375,8 @@ class Schema: """Flags a node as deprecated, indicating to users that they should find alternatives to this node.""" is_experimental: bool=False """Flags a node as experimental, informing users that it may change or not work as expected.""" + is_dev_only: bool=False + """Flags a node as dev-only, hiding it from search/menus unless dev mode is enabled.""" is_api_node: bool=False """Flags a node as an API node. See: https://docs.comfy.org/tutorials/api-nodes/overview.""" price_badge: PriceBadge | None = None @@ -1383,6 +1385,8 @@ class Schema: """Flags a node as not idempotent; when True, the node will run and not reuse the cached outputs when identical inputs are provided on a different node in the graph.""" enable_expand: bool=False """Flags a node as expandable, allowing NodeOutput to include 'expand' property.""" + accept_all_inputs: bool=False + """When True, all inputs from the prompt will be passed to the node as kwargs, even if not defined in the schema.""" def validate(self): '''Validate the schema: @@ -1471,6 +1475,7 @@ class Schema: info = NodeInfoV1( input=input, input_order={key: list(value.keys()) for (key, value) in input.items()}, + is_input_list=self.is_input_list, output=output, output_is_list=output_is_list, output_name=output_name, @@ -1483,6 +1488,7 @@ class Schema: output_node=self.is_output_node, deprecated=self.is_deprecated, experimental=self.is_experimental, + dev_only=self.is_dev_only, api_node=self.is_api_node, python_module=getattr(cls, "RELATIVE_PYTHON_MODULE", "nodes"), price_badge=self.price_badge.as_dict(self.inputs) if self.price_badge is not None else None, @@ -1490,39 +1496,6 @@ class Schema: ) return info - - def get_v3_info(self, cls) -> NodeInfoV3: - input_dict = {} - output_dict = {} - hidden_list = [] - # TODO: make sure dynamic types will be handled correctly - if self.inputs: - for input in self.inputs: - add_to_dict_v3(input, input_dict) - if self.outputs: - for output in self.outputs: - add_to_dict_v3(output, output_dict) - if self.hidden: - for hidden in self.hidden: - hidden_list.append(hidden.value) - - info = NodeInfoV3( - input=input_dict, - output=output_dict, - hidden=hidden_list, - name=self.node_id, - display_name=self.display_name, - description=self.description, - category=self.category, - output_node=self.is_output_node, - deprecated=self.is_deprecated, - experimental=self.is_experimental, - api_node=self.is_api_node, - python_module=getattr(cls, "RELATIVE_PYTHON_MODULE", "nodes"), - price_badge=self.price_badge.as_dict(self.inputs) if self.price_badge is not None else None, - ) - return info - def get_finalized_class_inputs(d: dict[str, Any], live_inputs: dict[str, Any], include_hidden=False) -> tuple[dict[str, Any], V3Data]: out_dict = { "required": {}, @@ -1577,9 +1550,6 @@ def add_to_dict_v1(i: Input, d: dict): as_dict.pop("optional", None) d.setdefault(key, {})[i.id] = (i.get_io_type(), as_dict) -def add_to_dict_v3(io: Input | Output, d: dict): - d[io.id] = (io.get_io_type(), io.as_dict()) - class DynamicPathsDefaultValue: EMPTY_DICT = "empty_dict" @@ -1740,13 +1710,6 @@ class _ComfyNodeBaseInternal(_ComfyNodeInternal): # set hidden type_clone.hidden = HiddenHolder.from_v3_data(v3_data) return type_clone - - @final - @classmethod - def GET_NODE_INFO_V3(cls) -> dict[str, Any]: - schema = cls.GET_SCHEMA() - info = schema.get_v3_info(cls) - return asdict(info) ############################################# # V1 Backwards Compatibility code #-------------------------------------------- @@ -1789,6 +1752,14 @@ class _ComfyNodeBaseInternal(_ComfyNodeInternal): cls.GET_SCHEMA() return cls._DEPRECATED + _DEV_ONLY = None + @final + @classproperty + def DEV_ONLY(cls): # noqa + if cls._DEV_ONLY is None: + cls.GET_SCHEMA() + return cls._DEV_ONLY + _API_NODE = None @final @classproperty @@ -1853,6 +1824,14 @@ class _ComfyNodeBaseInternal(_ComfyNodeInternal): cls.GET_SCHEMA() return cls._NOT_IDEMPOTENT + _ACCEPT_ALL_INPUTS = None + @final + @classproperty + def ACCEPT_ALL_INPUTS(cls): # noqa + if cls._ACCEPT_ALL_INPUTS is None: + cls.GET_SCHEMA() + return cls._ACCEPT_ALL_INPUTS + @final @classmethod def INPUT_TYPES(cls) -> dict[str, dict]: @@ -1883,6 +1862,8 @@ class _ComfyNodeBaseInternal(_ComfyNodeInternal): cls._EXPERIMENTAL = schema.is_experimental if cls._DEPRECATED is None: cls._DEPRECATED = schema.is_deprecated + if cls._DEV_ONLY is None: + cls._DEV_ONLY = schema.is_dev_only if cls._API_NODE is None: cls._API_NODE = schema.is_api_node if cls._OUTPUT_NODE is None: @@ -1891,6 +1872,8 @@ class _ComfyNodeBaseInternal(_ComfyNodeInternal): cls._INPUT_IS_LIST = schema.is_input_list if cls._NOT_IDEMPOTENT is None: cls._NOT_IDEMPOTENT = schema.not_idempotent + if cls._ACCEPT_ALL_INPUTS is None: + cls._ACCEPT_ALL_INPUTS = schema.accept_all_inputs if cls._RETURN_TYPES is None: output = [] @@ -2038,6 +2021,7 @@ __all__ = [ "ControlNet", "Vae", "Model", + "ModelPatch", "ClipVision", "ClipVisionOutput", "AudioEncoder", @@ -2070,6 +2054,7 @@ __all__ = [ "AnyType", "MultiType", "Tracks", + "Color", # Dynamic Types "MatchType", "DynamicCombo", @@ -2078,12 +2063,10 @@ __all__ = [ "HiddenHolder", "Hidden", "NodeInfoV1", - "NodeInfoV3", "Schema", "ComfyNode", "NodeOutput", "add_to_dict_v1", - "add_to_dict_v3", "V3Data", "ImageCompare", "PriceBadgeDepends", diff --git a/comfy_api_nodes/apis/bytedance.py b/comfy_api_nodes/apis/bytedance.py index 400648cca..23cbe2372 100644 --- a/comfy_api_nodes/apis/bytedance.py +++ b/comfy_api_nodes/apis/bytedance.py @@ -13,17 +13,6 @@ class Text2ImageTaskCreationRequest(BaseModel): watermark: bool | None = Field(False) -class Image2ImageTaskCreationRequest(BaseModel): - model: str = Field(...) - prompt: str = Field(...) - response_format: str | None = Field("url") - image: str = Field(..., description="Base64 encoded string or image URL") - size: str | None = Field("adaptive") - seed: int | None = Field(..., ge=0, le=2147483647) - guidance_scale: float | None = Field(..., ge=1.0, le=10.0) - watermark: bool | None = Field(False) - - class Seedream4Options(BaseModel): max_images: int = Field(15) diff --git a/comfy_api_nodes/apis/grok.py b/comfy_api_nodes/apis/grok.py new file mode 100644 index 000000000..8e3c79ab9 --- /dev/null +++ b/comfy_api_nodes/apis/grok.py @@ -0,0 +1,67 @@ +from pydantic import BaseModel, Field + + +class ImageGenerationRequest(BaseModel): + model: str = Field(...) + prompt: str = Field(...) + aspect_ratio: str = Field(...) + n: int = Field(...) + seed: int = Field(...) + response_for: str = Field("url") + + +class InputUrlObject(BaseModel): + url: str = Field(...) + + +class ImageEditRequest(BaseModel): + model: str = Field(...) + image: InputUrlObject = Field(...) + prompt: str = Field(...) + resolution: str = Field(...) + n: int = Field(...) + seed: int = Field(...) + response_for: str = Field("url") + + +class VideoGenerationRequest(BaseModel): + model: str = Field(...) + prompt: str = Field(...) + image: InputUrlObject | None = Field(...) + duration: int = Field(...) + aspect_ratio: str | None = Field(...) + resolution: str = Field(...) + seed: int = Field(...) + + +class VideoEditRequest(BaseModel): + model: str = Field(...) + prompt: str = Field(...) + video: InputUrlObject = Field(...) + seed: int = Field(...) + + +class ImageResponseObject(BaseModel): + url: str | None = Field(None) + b64_json: str | None = Field(None) + revised_prompt: str | None = Field(None) + + +class ImageGenerationResponse(BaseModel): + data: list[ImageResponseObject] = Field(...) + + +class VideoGenerationResponse(BaseModel): + request_id: str = Field(...) + + +class VideoResponseObject(BaseModel): + url: str = Field(...) + upsampled_prompt: str | None = Field(None) + duration: int = Field(...) + + +class VideoStatusResponse(BaseModel): + status: str | None = Field(None) + video: VideoResponseObject | None = Field(None) + model: str | None = Field(None) diff --git a/comfy_api_nodes/apis/hunyuan3d.py b/comfy_api_nodes/apis/hunyuan3d.py new file mode 100644 index 000000000..6421c9bd5 --- /dev/null +++ b/comfy_api_nodes/apis/hunyuan3d.py @@ -0,0 +1,66 @@ +from typing import TypedDict + +from pydantic import BaseModel, Field, model_validator + + +class InputGenerateType(TypedDict): + generate_type: str + polygon_type: str + pbr: bool + + +class Hunyuan3DViewImage(BaseModel): + ViewType: str = Field(..., description="Valid values: back, left, right.") + ViewImageUrl: str = Field(...) + + +class To3DProTaskRequest(BaseModel): + Model: str = Field(...) + Prompt: str | None = Field(None) + ImageUrl: str | None = Field(None) + MultiViewImages: list[Hunyuan3DViewImage] | None = Field(None) + EnablePBR: bool | None = Field(...) + FaceCount: int | None = Field(...) + GenerateType: str | None = Field(...) + PolygonType: str | None = Field(...) + + +class RequestError(BaseModel): + Code: str = Field("") + Message: str = Field("") + + +class To3DProTaskCreateResponse(BaseModel): + JobId: str | None = Field(None) + Error: RequestError | None = Field(None) + + @model_validator(mode="before") + @classmethod + def unwrap_data(cls, values: dict) -> dict: + if "Response" in values and isinstance(values["Response"], dict): + return values["Response"] + return values + + +class ResultFile3D(BaseModel): + Type: str = Field(...) + Url: str = Field(...) + PreviewImageUrl: str = Field("") + + +class To3DProTaskResultResponse(BaseModel): + ErrorCode: str = Field("") + ErrorMessage: str = Field("") + ResultFile3Ds: list[ResultFile3D] = Field([]) + Status: str = Field(...) + + @model_validator(mode="before") + @classmethod + def unwrap_data(cls, values: dict) -> dict: + if "Response" in values and isinstance(values["Response"], dict): + return values["Response"] + return values + + +class To3DProTaskQueryRequest(BaseModel): + JobId: str = Field(...) diff --git a/comfy_api_nodes/apis/magnific.py b/comfy_api_nodes/apis/magnific.py new file mode 100644 index 000000000..b9f148def --- /dev/null +++ b/comfy_api_nodes/apis/magnific.py @@ -0,0 +1,122 @@ +from typing import TypedDict + +from pydantic import AliasChoices, BaseModel, Field, model_validator + + +class InputPortraitMode(TypedDict): + portrait_mode: str + portrait_style: str + portrait_beautifier: str + + +class InputAdvancedSettings(TypedDict): + advanced_settings: str + whites: int + blacks: int + brightness: int + contrast: int + saturation: int + engine: str + transfer_light_a: str + transfer_light_b: str + fixed_generation: bool + + +class InputSkinEnhancerMode(TypedDict): + mode: str + skin_detail: int + optimized_for: str + + +class ImageUpscalerCreativeRequest(BaseModel): + image: str = Field(...) + scale_factor: str = Field(...) + optimized_for: str = Field(...) + prompt: str | None = Field(None) + creativity: int = Field(...) + hdr: int = Field(...) + resemblance: int = Field(...) + fractality: int = Field(...) + engine: str = Field(...) + + +class ImageUpscalerPrecisionV2Request(BaseModel): + image: str = Field(...) + sharpen: int = Field(...) + smart_grain: int = Field(...) + ultra_detail: int = Field(...) + flavor: str = Field(...) + scale_factor: int = Field(...) + + +class ImageRelightAdvancedSettingsRequest(BaseModel): + whites: int = Field(...) + blacks: int = Field(...) + brightness: int = Field(...) + contrast: int = Field(...) + saturation: int = Field(...) + engine: str = Field(...) + transfer_light_a: str = Field(...) + transfer_light_b: str = Field(...) + fixed_generation: bool = Field(...) + + +class ImageRelightRequest(BaseModel): + image: str = Field(...) + prompt: str | None = Field(None) + transfer_light_from_reference_image: str | None = Field(None) + light_transfer_strength: int = Field(...) + interpolate_from_original: bool = Field(...) + change_background: bool = Field(...) + style: str = Field(...) + preserve_details: bool = Field(...) + advanced_settings: ImageRelightAdvancedSettingsRequest | None = Field(...) + + +class ImageStyleTransferRequest(BaseModel): + image: str = Field(...) + reference_image: str = Field(...) + prompt: str | None = Field(None) + style_strength: int = Field(...) + structure_strength: int = Field(...) + is_portrait: bool = Field(...) + portrait_style: str | None = Field(...) + portrait_beautifier: str | None = Field(...) + flavor: str = Field(...) + engine: str = Field(...) + fixed_generation: bool = Field(...) + + +class ImageSkinEnhancerCreativeRequest(BaseModel): + image: str = Field(...) + sharpen: int = Field(...) + smart_grain: int = Field(...) + + +class ImageSkinEnhancerFaithfulRequest(BaseModel): + image: str = Field(...) + sharpen: int = Field(...) + smart_grain: int = Field(...) + skin_detail: int = Field(...) + + +class ImageSkinEnhancerFlexibleRequest(BaseModel): + image: str = Field(...) + sharpen: int = Field(...) + smart_grain: int = Field(...) + optimized_for: str = Field(...) + + +class TaskResponse(BaseModel): + """Unified response model that handles both wrapped and unwrapped API responses.""" + + task_id: str = Field(...) + status: str = Field(validation_alias=AliasChoices("status", "task_status")) + generated: list[str] | None = Field(None) + + @model_validator(mode="before") + @classmethod + def unwrap_data(cls, values: dict) -> dict: + if "data" in values and isinstance(values["data"], dict): + return values["data"] + return values diff --git a/comfy_api_nodes/apis/recraft.py b/comfy_api_nodes/apis/recraft.py index c36d95f24..0bd7d23b3 100644 --- a/comfy_api_nodes/apis/recraft.py +++ b/comfy_api_nodes/apis/recraft.py @@ -1,11 +1,8 @@ from __future__ import annotations - - from enum import Enum -from typing import Optional -from pydantic import BaseModel, Field, conint, confloat +from pydantic import BaseModel, Field class RecraftColor: @@ -229,24 +226,24 @@ class RecraftColorObject(BaseModel): class RecraftControlsObject(BaseModel): - colors: Optional[list[RecraftColorObject]] = Field(None, description='An array of preferable colors') - background_color: Optional[RecraftColorObject] = Field(None, description='Use given color as a desired background color') - no_text: Optional[bool] = Field(None, description='Do not embed text layouts') - artistic_level: Optional[conint(ge=0, le=5)] = Field(None, description='Defines artistic tone of your image. At a simple level, the person looks straight at the camera in a static and clean style. Dynamic and eccentric levels introduce movement and creativity. The value should be in range [0..5].') + colors: list[RecraftColorObject] | None = Field(None, description='An array of preferable colors') + background_color: RecraftColorObject | None = Field(None, description='Use given color as a desired background color') + no_text: bool | None = Field(None, description='Do not embed text layouts') + artistic_level: int | None = Field(None, description='Defines artistic tone of your image. At a simple level, the person looks straight at the camera in a static and clean style. Dynamic and eccentric levels introduce movement and creativity. The value should be in range [0..5].') class RecraftImageGenerationRequest(BaseModel): prompt: str = Field(..., description='The text prompt describing the image to generate') - size: Optional[RecraftImageSize] = Field(None, description='The size of the generated image (e.g., "1024x1024")') - n: conint(ge=1, le=6) = Field(..., description='The number of images to generate') - negative_prompt: Optional[str] = Field(None, description='A text description of undesired elements on an image') - model: Optional[RecraftModel] = Field(RecraftModel.recraftv3, description='The model to use for generation (e.g., "recraftv3")') - style: Optional[str] = Field(None, description='The style to apply to the generated image (e.g., "digital_illustration")') - substyle: Optional[str] = Field(None, description='The substyle to apply to the generated image, depending on the style input') - controls: Optional[RecraftControlsObject] = Field(None, description='A set of custom parameters to tweak generation process') - style_id: Optional[str] = Field(None, description='Use a previously uploaded style as a reference; UUID') - strength: Optional[confloat(ge=0.0, le=1.0)] = Field(None, description='Defines the difference with the original image, should lie in [0, 1], where 0 means almost identical, and 1 means miserable similarity') - random_seed: Optional[int] = Field(None, description="Seed for video generation") + size: RecraftImageSize | None = Field(None, description='The size of the generated image (e.g., "1024x1024")') + n: int = Field(..., description='The number of images to generate') + negative_prompt: str | None = Field(None, description='A text description of undesired elements on an image') + model: RecraftModel | None = Field(RecraftModel.recraftv3, description='The model to use for generation (e.g., "recraftv3")') + style: str | None = Field(None, description='The style to apply to the generated image (e.g., "digital_illustration")') + substyle: str | None = Field(None, description='The substyle to apply to the generated image, depending on the style input') + controls: RecraftControlsObject | None = Field(None, description='A set of custom parameters to tweak generation process') + style_id: str | None = Field(None, description='Use a previously uploaded style as a reference; UUID') + strength: float | None = Field(None, description='Defines the difference with the original image, should lie in [0, 1], where 0 means almost identical, and 1 means miserable similarity') + random_seed: int | None = Field(None, description="Seed for video generation") # text_layout @@ -258,5 +255,13 @@ class RecraftReturnedObject(BaseModel): class RecraftImageGenerationResponse(BaseModel): created: int = Field(..., description='Unix timestamp when the generation was created') credits: int = Field(..., description='Number of credits used for the generation') - data: Optional[list[RecraftReturnedObject]] = Field(None, description='Array of generated image information') - image: Optional[RecraftReturnedObject] = Field(None, description='Single generated image') + data: list[RecraftReturnedObject] | None = Field(None, description='Array of generated image information') + image: RecraftReturnedObject | None = Field(None, description='Single generated image') + + +class RecraftCreateStyleRequest(BaseModel): + style: str = Field(..., description="realistic_image, digital_illustration, vector_illustration, or icon") + + +class RecraftCreateStyleResponse(BaseModel): + id: str = Field(..., description="UUID of the created style") diff --git a/comfy_api_nodes/apis/vidu.py b/comfy_api_nodes/apis/vidu.py index a9bb6f7ce..469adcdbc 100644 --- a/comfy_api_nodes/apis/vidu.py +++ b/comfy_api_nodes/apis/vidu.py @@ -6,6 +6,30 @@ class SubjectReference(BaseModel): images: list[str] = Field(...) +class FrameSetting(BaseModel): + prompt: str = Field(...) + key_image: str = Field(...) + duration: int = Field(...) + + +class TaskMultiFrameCreationRequest(BaseModel): + model: str = Field(...) + seed: int = Field(..., ge=0, le=2147483647) + resolution: str = Field(...) + start_image: str = Field(...) + image_settings: list[FrameSetting] = Field(...) + + +class TaskExtendCreationRequest(BaseModel): + model: str = Field(...) + prompt: str = Field(..., max_length=2000) + duration: int = Field(...) + seed: int = Field(..., ge=0, le=2147483647) + resolution: str = Field(...) + images: list[str] | None = Field(None, description="Base64 encoded string or image URL") + video_url: str = Field(..., description="URL of the video to extend") + + class TaskCreationRequest(BaseModel): model: str = Field(...) prompt: str = Field(..., max_length=2000) diff --git a/comfy_api_nodes/nodes_bria.py b/comfy_api_nodes/nodes_bria.py index 72a3055a7..d3a52bc1b 100644 --- a/comfy_api_nodes/nodes_bria.py +++ b/comfy_api_nodes/nodes_bria.py @@ -24,7 +24,7 @@ class BriaImageEditNode(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="BriaImageEditNode", - display_name="Bria Image Edit", + display_name="Bria FIBO Image Edit", category="api node/image/Bria", description="Edit images using Bria latest model", inputs=[ diff --git a/comfy_api_nodes/nodes_bytedance.py b/comfy_api_nodes/nodes_bytedance.py index 486801150..0cb5e3be8 100644 --- a/comfy_api_nodes/nodes_bytedance.py +++ b/comfy_api_nodes/nodes_bytedance.py @@ -9,7 +9,6 @@ from comfy_api_nodes.apis.bytedance import ( RECOMMENDED_PRESETS, RECOMMENDED_PRESETS_SEEDREAM_4, VIDEO_TASKS_EXECUTION_TIME, - Image2ImageTaskCreationRequest, Image2VideoTaskCreationRequest, ImageTaskCreationResponse, Seedream4Options, @@ -174,99 +173,6 @@ class ByteDanceImageNode(IO.ComfyNode): return IO.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response))) -class ByteDanceImageEditNode(IO.ComfyNode): - - @classmethod - def define_schema(cls): - return IO.Schema( - node_id="ByteDanceImageEditNode", - display_name="ByteDance Image Edit", - category="api node/image/ByteDance", - description="Edit images using ByteDance models via api based on prompt", - inputs=[ - IO.Combo.Input("model", options=["seededit-3-0-i2i-250628"]), - IO.Image.Input( - "image", - tooltip="The base image to edit", - ), - IO.String.Input( - "prompt", - multiline=True, - default="", - tooltip="Instruction to edit image", - ), - IO.Int.Input( - "seed", - default=0, - min=0, - max=2147483647, - step=1, - display_mode=IO.NumberDisplay.number, - control_after_generate=True, - tooltip="Seed to use for generation", - optional=True, - ), - IO.Float.Input( - "guidance_scale", - default=5.5, - min=1.0, - max=10.0, - step=0.01, - display_mode=IO.NumberDisplay.number, - tooltip="Higher value makes the image follow the prompt more closely", - optional=True, - ), - IO.Boolean.Input( - "watermark", - default=False, - tooltip='Whether to add an "AI generated" watermark to the image', - optional=True, - ), - ], - outputs=[ - IO.Image.Output(), - ], - hidden=[ - IO.Hidden.auth_token_comfy_org, - IO.Hidden.api_key_comfy_org, - IO.Hidden.unique_id, - ], - is_api_node=True, - is_deprecated=True, - ) - - @classmethod - async def execute( - cls, - model: str, - image: Input.Image, - prompt: str, - seed: int, - guidance_scale: float, - watermark: bool, - ) -> IO.NodeOutput: - validate_string(prompt, strip_whitespace=True, min_length=1) - if get_number_of_images(image) != 1: - raise ValueError("Exactly one input image is required.") - validate_image_aspect_ratio(image, (1, 3), (3, 1)) - source_url = (await upload_images_to_comfyapi(cls, image, max_images=1, mime_type="image/png"))[0] - payload = Image2ImageTaskCreationRequest( - model=model, - prompt=prompt, - image=source_url, - seed=seed, - guidance_scale=guidance_scale, - watermark=watermark, - ) - response = await sync_op( - cls, - ApiEndpoint(path=BYTEPLUS_IMAGE_ENDPOINT, method="POST"), - data=payload, - response_model=ImageTaskCreationResponse, - ) - return IO.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response))) - - class ByteDanceSeedreamNode(IO.ComfyNode): @classmethod @@ -1101,7 +1007,6 @@ class ByteDanceExtension(ComfyExtension): async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ ByteDanceImageNode, - ByteDanceImageEditNode, ByteDanceSeedreamNode, ByteDanceTextToVideoNode, ByteDanceImageToVideoNode, diff --git a/comfy_api_nodes/nodes_grok.py b/comfy_api_nodes/nodes_grok.py new file mode 100644 index 000000000..da15e97ea --- /dev/null +++ b/comfy_api_nodes/nodes_grok.py @@ -0,0 +1,417 @@ +import torch +from typing_extensions import override + +from comfy_api.latest import IO, ComfyExtension, Input +from comfy_api_nodes.apis.grok import ( + ImageEditRequest, + ImageGenerationRequest, + ImageGenerationResponse, + InputUrlObject, + VideoEditRequest, + VideoGenerationRequest, + VideoGenerationResponse, + VideoStatusResponse, +) +from comfy_api_nodes.util import ( + ApiEndpoint, + download_url_to_image_tensor, + download_url_to_video_output, + get_fs_object_size, + get_number_of_images, + poll_op, + sync_op, + tensor_to_base64_string, + upload_video_to_comfyapi, + validate_string, + validate_video_duration, +) + + +class GrokImageNode(IO.ComfyNode): + + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="GrokImageNode", + display_name="Grok Image", + category="api node/image/Grok", + description="Generate images using Grok based on a text prompt", + inputs=[ + IO.Combo.Input("model", options=["grok-imagine-image-beta"]), + IO.String.Input( + "prompt", + multiline=True, + tooltip="The text prompt used to generate the image", + ), + IO.Combo.Input( + "aspect_ratio", + options=[ + "1:1", + "2:3", + "3:2", + "3:4", + "4:3", + "9:16", + "16:9", + "9:19.5", + "19.5:9", + "9:20", + "20:9", + "1:2", + "2:1", + ], + ), + IO.Int.Input( + "number_of_images", + default=1, + min=1, + max=10, + step=1, + tooltip="Number of images to generate", + display_mode=IO.NumberDisplay.number, + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=IO.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed to determine if node should re-run; " + "actual results are nondeterministic regardless of seed.", + ), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends(widgets=["number_of_images"]), + expr="""{"type":"usd","usd":0.033 * widgets.number_of_images}""", + ), + ) + + @classmethod + async def execute( + cls, + model: str, + prompt: str, + aspect_ratio: str, + number_of_images: int, + seed: int, + ) -> IO.NodeOutput: + validate_string(prompt, strip_whitespace=True, min_length=1) + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/xai/v1/images/generations", method="POST"), + data=ImageGenerationRequest( + model=model, + prompt=prompt, + aspect_ratio=aspect_ratio, + n=number_of_images, + seed=seed, + ), + response_model=ImageGenerationResponse, + ) + if len(response.data) == 1: + return IO.NodeOutput(await download_url_to_image_tensor(response.data[0].url)) + return IO.NodeOutput( + torch.cat( + [await download_url_to_image_tensor(i) for i in [str(d.url) for d in response.data if d.url]], + ) + ) + + +class GrokImageEditNode(IO.ComfyNode): + + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="GrokImageEditNode", + display_name="Grok Image Edit", + category="api node/image/Grok", + description="Modify an existing image based on a text prompt", + inputs=[ + IO.Combo.Input("model", options=["grok-imagine-image-beta"]), + IO.Image.Input("image"), + IO.String.Input( + "prompt", + multiline=True, + tooltip="The text prompt used to generate the image", + ), + IO.Combo.Input("resolution", options=["1K"]), + IO.Int.Input( + "number_of_images", + default=1, + min=1, + max=10, + step=1, + tooltip="Number of edited images to generate", + display_mode=IO.NumberDisplay.number, + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=IO.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed to determine if node should re-run; " + "actual results are nondeterministic regardless of seed.", + ), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends(widgets=["number_of_images"]), + expr="""{"type":"usd","usd":0.002 + 0.033 * widgets.number_of_images}""", + ), + ) + + @classmethod + async def execute( + cls, + model: str, + image: Input.Image, + prompt: str, + resolution: str, + number_of_images: int, + seed: int, + ) -> IO.NodeOutput: + validate_string(prompt, strip_whitespace=True, min_length=1) + if get_number_of_images(image) != 1: + raise ValueError("Only one input image is supported.") + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/xai/v1/images/edits", method="POST"), + data=ImageEditRequest( + model=model, + image=InputUrlObject(url=f"data:image/png;base64,{tensor_to_base64_string(image)}"), + prompt=prompt, + resolution=resolution.lower(), + n=number_of_images, + seed=seed, + ), + response_model=ImageGenerationResponse, + ) + if len(response.data) == 1: + return IO.NodeOutput(await download_url_to_image_tensor(response.data[0].url)) + return IO.NodeOutput( + torch.cat( + [await download_url_to_image_tensor(i) for i in [str(d.url) for d in response.data if d.url]], + ) + ) + + +class GrokVideoNode(IO.ComfyNode): + + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="GrokVideoNode", + display_name="Grok Video", + category="api node/video/Grok", + description="Generate video from a prompt or an image", + inputs=[ + IO.Combo.Input("model", options=["grok-imagine-video-beta"]), + IO.String.Input( + "prompt", + multiline=True, + tooltip="Text description of the desired video.", + ), + IO.Combo.Input( + "resolution", + options=["480p", "720p"], + tooltip="The resolution of the output video.", + ), + IO.Combo.Input( + "aspect_ratio", + options=["auto", "16:9", "4:3", "3:2", "1:1", "2:3", "3:4", "9:16"], + tooltip="The aspect ratio of the output video.", + ), + IO.Int.Input( + "duration", + default=6, + min=1, + max=15, + step=1, + tooltip="The duration of the output video in seconds.", + display_mode=IO.NumberDisplay.slider, + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=IO.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed to determine if node should re-run; " + "actual results are nondeterministic regardless of seed.", + ), + IO.Image.Input("image", optional=True), + ], + outputs=[ + IO.Video.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends(widgets=["duration"], inputs=["image"]), + expr=""" + ( + $base := 0.181 * widgets.duration; + {"type":"usd","usd": inputs.image.connected ? $base + 0.002 : $base} + ) + """, + ), + ) + + @classmethod + async def execute( + cls, + model: str, + prompt: str, + resolution: str, + aspect_ratio: str, + duration: int, + seed: int, + image: Input.Image | None = None, + ) -> IO.NodeOutput: + image_url = None + if image is not None: + if get_number_of_images(image) != 1: + raise ValueError("Only one input image is supported.") + image_url = InputUrlObject(url=f"data:image/png;base64,{tensor_to_base64_string(image)}") + validate_string(prompt, strip_whitespace=True, min_length=1) + initial_response = await sync_op( + cls, + ApiEndpoint(path="/proxy/xai/v1/videos/generations", method="POST"), + data=VideoGenerationRequest( + model=model, + image=image_url, + prompt=prompt, + resolution=resolution, + duration=duration, + aspect_ratio=None if aspect_ratio == "auto" else aspect_ratio, + seed=seed, + ), + response_model=VideoGenerationResponse, + ) + response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/xai/v1/videos/{initial_response.request_id}"), + status_extractor=lambda r: r.status if r.status is not None else "complete", + response_model=VideoStatusResponse, + ) + return IO.NodeOutput(await download_url_to_video_output(response.video.url)) + + +class GrokVideoEditNode(IO.ComfyNode): + + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="GrokVideoEditNode", + display_name="Grok Video Edit", + category="api node/video/Grok", + description="Edit an existing video based on a text prompt.", + inputs=[ + IO.Combo.Input("model", options=["grok-imagine-video-beta"]), + IO.String.Input( + "prompt", + multiline=True, + tooltip="Text description of the desired video.", + ), + IO.Video.Input("video", tooltip="Maximum supported duration is 8.7 seconds and 50MB file size."), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=IO.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed to determine if node should re-run; " + "actual results are nondeterministic regardless of seed.", + ), + ], + outputs=[ + IO.Video.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + expr="""{"type":"usd","usd": 0.191, "format": {"suffix": "/sec", "approximate": true}}""", + ), + ) + + @classmethod + async def execute( + cls, + model: str, + prompt: str, + video: Input.Video, + seed: int, + ) -> IO.NodeOutput: + validate_string(prompt, strip_whitespace=True, min_length=1) + validate_video_duration(video, min_duration=1, max_duration=8.7) + video_stream = video.get_stream_source() + video_size = get_fs_object_size(video_stream) + if video_size > 50 * 1024 * 1024: + raise ValueError(f"Video size ({video_size / 1024 / 1024:.1f}MB) exceeds 50MB limit.") + initial_response = await sync_op( + cls, + ApiEndpoint(path="/proxy/xai/v1/videos/edits", method="POST"), + data=VideoEditRequest( + model=model, + video=InputUrlObject(url=await upload_video_to_comfyapi(cls, video)), + prompt=prompt, + seed=seed, + ), + response_model=VideoGenerationResponse, + ) + response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/xai/v1/videos/{initial_response.request_id}"), + status_extractor=lambda r: r.status if r.status is not None else "complete", + response_model=VideoStatusResponse, + ) + return IO.NodeOutput(await download_url_to_video_output(response.video.url)) + + +class GrokExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + GrokImageNode, + GrokImageEditNode, + GrokVideoNode, + GrokVideoEditNode, + ] + + +async def comfy_entrypoint() -> GrokExtension: + return GrokExtension() diff --git a/comfy_api_nodes/nodes_hunyuan3d.py b/comfy_api_nodes/nodes_hunyuan3d.py new file mode 100644 index 000000000..b3a736643 --- /dev/null +++ b/comfy_api_nodes/nodes_hunyuan3d.py @@ -0,0 +1,297 @@ +import os + +from typing_extensions import override + +from comfy_api.latest import IO, ComfyExtension, Input +from comfy_api_nodes.apis.hunyuan3d import ( + Hunyuan3DViewImage, + InputGenerateType, + ResultFile3D, + To3DProTaskCreateResponse, + To3DProTaskQueryRequest, + To3DProTaskRequest, + To3DProTaskResultResponse, +) +from comfy_api_nodes.util import ( + ApiEndpoint, + download_url_to_bytesio, + downscale_image_tensor_by_max_side, + poll_op, + sync_op, + upload_image_to_comfyapi, + validate_image_dimensions, + validate_string, +) +from folder_paths import get_output_directory + + +def get_glb_obj_from_response(response_objs: list[ResultFile3D]) -> ResultFile3D: + for i in response_objs: + if i.Type.lower() == "glb": + return i + raise ValueError("No GLB file found in response. Please report this to the developers.") + + +class TencentTextToModelNode(IO.ComfyNode): + + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="TencentTextToModelNode", + display_name="Hunyuan3D: Text to Model (Pro)", + category="api node/3d/Tencent", + inputs=[ + IO.Combo.Input( + "model", + options=["3.0", "3.1"], + tooltip="The LowPoly option is unavailable for the `3.1` model.", + ), + IO.String.Input("prompt", multiline=True, default="", tooltip="Supports up to 1024 characters."), + IO.Int.Input("face_count", default=500000, min=40000, max=1500000), + IO.DynamicCombo.Input( + "generate_type", + options=[ + IO.DynamicCombo.Option("Normal", [IO.Boolean.Input("pbr", default=False)]), + IO.DynamicCombo.Option( + "LowPoly", + [ + IO.Combo.Input("polygon_type", options=["triangle", "quadrilateral"]), + IO.Boolean.Input("pbr", default=False), + ], + ), + IO.DynamicCombo.Option("Geometry", []), + ], + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + display_mode=IO.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed controls whether the node should re-run; " + "results are non-deterministic regardless of seed.", + ), + ], + outputs=[ + IO.String.Output(display_name="model_file"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + is_output_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends(widgets=["generate_type", "generate_type.pbr", "face_count"]), + expr=""" + ( + $base := widgets.generate_type = "normal" ? 25 : widgets.generate_type = "lowpoly" ? 30 : 15; + $pbr := $lookup(widgets, "generate_type.pbr") ? 10 : 0; + $face := widgets.face_count != 500000 ? 10 : 0; + {"type":"usd","usd": ($base + $pbr + $face) * 0.02} + ) + """, + ), + ) + + @classmethod + async def execute( + cls, + model: str, + prompt: str, + face_count: int, + generate_type: InputGenerateType, + seed: int, + ) -> IO.NodeOutput: + _ = seed + validate_string(prompt, field_name="prompt", min_length=1, max_length=1024) + if model == "3.1" and generate_type["generate_type"].lower() == "lowpoly": + raise ValueError("The LowPoly option is currently unavailable for the 3.1 model.") + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/tencent/hunyuan/3d-pro", method="POST"), + response_model=To3DProTaskCreateResponse, + data=To3DProTaskRequest( + Model=model, + Prompt=prompt, + FaceCount=face_count, + GenerateType=generate_type["generate_type"], + EnablePBR=generate_type.get("pbr", None), + PolygonType=generate_type.get("polygon_type", None), + ), + ) + if response.Error: + raise ValueError(f"Task creation failed with code {response.Error.Code}: {response.Error.Message}") + result = await poll_op( + cls, + ApiEndpoint(path="/proxy/tencent/hunyuan/3d-pro/query", method="POST"), + data=To3DProTaskQueryRequest(JobId=response.JobId), + response_model=To3DProTaskResultResponse, + status_extractor=lambda r: r.Status, + ) + model_file = f"hunyuan_model_{response.JobId}.glb" + await download_url_to_bytesio( + get_glb_obj_from_response(result.ResultFile3Ds).Url, + os.path.join(get_output_directory(), model_file), + ) + return IO.NodeOutput(model_file) + + +class TencentImageToModelNode(IO.ComfyNode): + + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="TencentImageToModelNode", + display_name="Hunyuan3D: Image(s) to Model (Pro)", + category="api node/3d/Tencent", + inputs=[ + IO.Combo.Input( + "model", + options=["3.0", "3.1"], + tooltip="The LowPoly option is unavailable for the `3.1` model.", + ), + IO.Image.Input("image"), + IO.Image.Input("image_left", optional=True), + IO.Image.Input("image_right", optional=True), + IO.Image.Input("image_back", optional=True), + IO.Int.Input("face_count", default=500000, min=40000, max=1500000), + IO.DynamicCombo.Input( + "generate_type", + options=[ + IO.DynamicCombo.Option("Normal", [IO.Boolean.Input("pbr", default=False)]), + IO.DynamicCombo.Option( + "LowPoly", + [ + IO.Combo.Input("polygon_type", options=["triangle", "quadrilateral"]), + IO.Boolean.Input("pbr", default=False), + ], + ), + IO.DynamicCombo.Option("Geometry", []), + ], + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + display_mode=IO.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed controls whether the node should re-run; " + "results are non-deterministic regardless of seed.", + ), + ], + outputs=[ + IO.String.Output(display_name="model_file"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + is_output_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends( + widgets=["generate_type", "generate_type.pbr", "face_count"], + inputs=["image_left", "image_right", "image_back"], + ), + expr=""" + ( + $base := widgets.generate_type = "normal" ? 25 : widgets.generate_type = "lowpoly" ? 30 : 15; + $multiview := ( + inputs.image_left.connected or inputs.image_right.connected or inputs.image_back.connected + ) ? 10 : 0; + $pbr := $lookup(widgets, "generate_type.pbr") ? 10 : 0; + $face := widgets.face_count != 500000 ? 10 : 0; + {"type":"usd","usd": ($base + $multiview + $pbr + $face) * 0.02} + ) + """, + ), + ) + + @classmethod + async def execute( + cls, + model: str, + image: Input.Image, + face_count: int, + generate_type: InputGenerateType, + seed: int, + image_left: Input.Image | None = None, + image_right: Input.Image | None = None, + image_back: Input.Image | None = None, + ) -> IO.NodeOutput: + _ = seed + if model == "3.1" and generate_type["generate_type"].lower() == "lowpoly": + raise ValueError("The LowPoly option is currently unavailable for the 3.1 model.") + validate_image_dimensions(image, min_width=128, min_height=128) + multiview_images = [] + for k, v in { + "left": image_left, + "right": image_right, + "back": image_back, + }.items(): + if v is None: + continue + validate_image_dimensions(v, min_width=128, min_height=128) + multiview_images.append( + Hunyuan3DViewImage( + ViewType=k, + ViewImageUrl=await upload_image_to_comfyapi( + cls, + downscale_image_tensor_by_max_side(v, max_side=4900), + mime_type="image/webp", + total_pixels=24_010_000, + ), + ) + ) + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/tencent/hunyuan/3d-pro", method="POST"), + response_model=To3DProTaskCreateResponse, + data=To3DProTaskRequest( + Model=model, + FaceCount=face_count, + GenerateType=generate_type["generate_type"], + ImageUrl=await upload_image_to_comfyapi( + cls, + downscale_image_tensor_by_max_side(image, max_side=4900), + mime_type="image/webp", + total_pixels=24_010_000, + ), + MultiViewImages=multiview_images if multiview_images else None, + EnablePBR=generate_type.get("pbr", None), + PolygonType=generate_type.get("polygon_type", None), + ), + ) + if response.Error: + raise ValueError(f"Task creation failed with code {response.Error.Code}: {response.Error.Message}") + result = await poll_op( + cls, + ApiEndpoint(path="/proxy/tencent/hunyuan/3d-pro/query", method="POST"), + data=To3DProTaskQueryRequest(JobId=response.JobId), + response_model=To3DProTaskResultResponse, + status_extractor=lambda r: r.Status, + ) + model_file = f"hunyuan_model_{response.JobId}.glb" + await download_url_to_bytesio( + get_glb_obj_from_response(result.ResultFile3Ds).Url, + os.path.join(get_output_directory(), model_file), + ) + return IO.NodeOutput(model_file) + + +class TencentHunyuan3DExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + TencentTextToModelNode, + TencentImageToModelNode, + ] + + +async def comfy_entrypoint() -> TencentHunyuan3DExtension: + return TencentHunyuan3DExtension() diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index 3ec71530b..739fe1855 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -249,7 +249,6 @@ async def finish_omni_video_task(cls: type[IO.ComfyNode], response: TaskStatusRe ApiEndpoint(path=f"/proxy/kling/v1/videos/omni-video/{response.data.task_id}"), response_model=TaskStatusResponse, status_extractor=lambda r: (r.data.task_status if r.data else None), - max_poll_attempts=160, ) return IO.NodeOutput(await download_url_to_video_output(final_response.data.task_result.videos[0].url)) diff --git a/comfy_api_nodes/nodes_magnific.py b/comfy_api_nodes/nodes_magnific.py new file mode 100644 index 000000000..013e71cc8 --- /dev/null +++ b/comfy_api_nodes/nodes_magnific.py @@ -0,0 +1,889 @@ +import math + +from typing_extensions import override + +from comfy_api.latest import IO, ComfyExtension, Input +from comfy_api_nodes.apis.magnific import ( + ImageRelightAdvancedSettingsRequest, + ImageRelightRequest, + ImageSkinEnhancerCreativeRequest, + ImageSkinEnhancerFaithfulRequest, + ImageSkinEnhancerFlexibleRequest, + ImageStyleTransferRequest, + ImageUpscalerCreativeRequest, + ImageUpscalerPrecisionV2Request, + InputAdvancedSettings, + InputPortraitMode, + InputSkinEnhancerMode, + TaskResponse, +) +from comfy_api_nodes.util import ( + ApiEndpoint, + download_url_to_image_tensor, + downscale_image_tensor, + get_image_dimensions, + get_number_of_images, + poll_op, + sync_op, + upload_images_to_comfyapi, + validate_image_aspect_ratio, + validate_image_dimensions, +) + + +class MagnificImageUpscalerCreativeNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="MagnificImageUpscalerCreativeNode", + display_name="Magnific Image Upscale (Creative)", + category="api node/image/Magnific", + description="Prompt‑guided enhancement, stylization, and 2x/4x/8x/16x upscaling. " + "Maximum output: 25.3 megapixels.", + inputs=[ + IO.Image.Input("image"), + IO.String.Input("prompt", multiline=True, default=""), + IO.Combo.Input("scale_factor", options=["2x", "4x", "8x", "16x"]), + IO.Combo.Input( + "optimized_for", + options=[ + "standard", + "soft_portraits", + "hard_portraits", + "art_n_illustration", + "videogame_assets", + "nature_n_landscapes", + "films_n_photography", + "3d_renders", + "science_fiction_n_horror", + ], + ), + IO.Int.Input("creativity", min=-10, max=10, default=0, display_mode=IO.NumberDisplay.slider), + IO.Int.Input( + "hdr", + min=-10, + max=10, + default=0, + tooltip="The level of definition and detail.", + display_mode=IO.NumberDisplay.slider, + ), + IO.Int.Input( + "resemblance", + min=-10, + max=10, + default=0, + tooltip="The level of resemblance to the original image.", + display_mode=IO.NumberDisplay.slider, + ), + IO.Int.Input( + "fractality", + min=-10, + max=10, + default=0, + tooltip="The strength of the prompt and intricacy per square pixel.", + display_mode=IO.NumberDisplay.slider, + ), + IO.Combo.Input( + "engine", + options=["automatic", "magnific_illusio", "magnific_sharpy", "magnific_sparkle"], + ), + IO.Boolean.Input( + "auto_downscale", + default=False, + tooltip="Automatically downscale input image if output would exceed maximum pixel limit.", + ), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends(widgets=["scale_factor"]), + expr=""" + ( + $max := widgets.scale_factor = "2x" ? 1.326 : 1.657; + {"type": "range_usd", "min_usd": 0.11, "max_usd": $max} + ) + """, + ), + ) + + @classmethod + async def execute( + cls, + image: Input.Image, + prompt: str, + scale_factor: str, + optimized_for: str, + creativity: int, + hdr: int, + resemblance: int, + fractality: int, + engine: str, + auto_downscale: bool, + ) -> IO.NodeOutput: + if get_number_of_images(image) != 1: + raise ValueError("Exactly one input image is required.") + validate_image_aspect_ratio(image, (1, 3), (3, 1), strict=False) + validate_image_dimensions(image, min_height=160, min_width=160) + + max_output_pixels = 25_300_000 + height, width = get_image_dimensions(image) + requested_scale = int(scale_factor.rstrip("x")) + output_pixels = height * width * requested_scale * requested_scale + + if output_pixels > max_output_pixels: + if auto_downscale: + # Find optimal scale factor that doesn't require >2x downscale. + # Server upscales in 2x steps, so aggressive downscaling degrades quality. + input_pixels = width * height + scale = 2 + max_input_pixels = max_output_pixels // 4 + for candidate in [16, 8, 4, 2]: + if candidate > requested_scale: + continue + scale_output_pixels = input_pixels * candidate * candidate + if scale_output_pixels <= max_output_pixels: + scale = candidate + max_input_pixels = None + break + downscale_ratio = math.sqrt(scale_output_pixels / max_output_pixels) + if downscale_ratio <= 2.0: + scale = candidate + max_input_pixels = max_output_pixels // (candidate * candidate) + break + + if max_input_pixels is not None: + image = downscale_image_tensor(image, total_pixels=max_input_pixels) + scale_factor = f"{scale}x" + else: + raise ValueError( + f"Output size ({width * requested_scale}x{height * requested_scale} = {output_pixels:,} pixels) " + f"exceeds maximum allowed size of {max_output_pixels:,} pixels. " + f"Use a smaller input image or lower scale factor." + ) + + initial_res = await sync_op( + cls, + ApiEndpoint(path="/proxy/freepik/v1/ai/image-upscaler", method="POST"), + response_model=TaskResponse, + data=ImageUpscalerCreativeRequest( + image=(await upload_images_to_comfyapi(cls, image, max_images=1, total_pixels=None))[0], + scale_factor=scale_factor, + optimized_for=optimized_for, + creativity=creativity, + hdr=hdr, + resemblance=resemblance, + fractality=fractality, + engine=engine, + prompt=prompt if prompt else None, + ), + ) + final_response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/freepik/v1/ai/image-upscaler/{initial_res.task_id}"), + response_model=TaskResponse, + status_extractor=lambda x: x.status, + poll_interval=10.0, + max_poll_attempts=480, + ) + return IO.NodeOutput(await download_url_to_image_tensor(final_response.generated[0])) + + +class MagnificImageUpscalerPreciseV2Node(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="MagnificImageUpscalerPreciseV2Node", + display_name="Magnific Image Upscale (Precise V2)", + category="api node/image/Magnific", + description="High-fidelity upscaling with fine control over sharpness, grain, and detail. " + "Maximum output: 10060×10060 pixels.", + inputs=[ + IO.Image.Input("image"), + IO.Combo.Input("scale_factor", options=["2x", "4x", "8x", "16x"]), + IO.Combo.Input( + "flavor", + options=["sublime", "photo", "photo_denoiser"], + tooltip="Processing style: " + "sublime for general use, photo for photographs, photo_denoiser for noisy photos.", + ), + IO.Int.Input( + "sharpen", + min=0, + max=100, + default=7, + tooltip="Image sharpness intensity. Higher values increase edge definition and clarity.", + display_mode=IO.NumberDisplay.slider, + ), + IO.Int.Input( + "smart_grain", + min=0, + max=100, + default=7, + tooltip="Intelligent grain/texture enhancement to prevent the image from " + "looking too smooth or artificial.", + display_mode=IO.NumberDisplay.slider, + ), + IO.Int.Input( + "ultra_detail", + min=0, + max=100, + default=30, + tooltip="Controls fine detail, textures, and micro-details added during upscaling.", + display_mode=IO.NumberDisplay.slider, + ), + IO.Boolean.Input( + "auto_downscale", + default=False, + tooltip="Automatically downscale input image if output would exceed maximum resolution.", + ), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends(widgets=["scale_factor"]), + expr=""" + ( + $max := widgets.scale_factor = "2x" ? 1.326 : 1.657; + {"type": "range_usd", "min_usd": 0.11, "max_usd": $max} + ) + """, + ), + ) + + @classmethod + async def execute( + cls, + image: Input.Image, + scale_factor: str, + flavor: str, + sharpen: int, + smart_grain: int, + ultra_detail: int, + auto_downscale: bool, + ) -> IO.NodeOutput: + if get_number_of_images(image) != 1: + raise ValueError("Exactly one input image is required.") + validate_image_aspect_ratio(image, (1, 3), (3, 1), strict=False) + validate_image_dimensions(image, min_height=160, min_width=160) + + max_output_dimension = 10060 + height, width = get_image_dimensions(image) + requested_scale = int(scale_factor.strip("x")) + output_width = width * requested_scale + output_height = height * requested_scale + + if output_width > max_output_dimension or output_height > max_output_dimension: + if auto_downscale: + # Find optimal scale factor that doesn't require >2x downscale. + # Server upscales in 2x steps, so aggressive downscaling degrades quality. + max_dim = max(width, height) + scale = 2 + max_input_dim = max_output_dimension // 2 + scale_ratio = max_input_dim / max_dim + max_input_pixels = int(width * height * scale_ratio * scale_ratio) + for candidate in [16, 8, 4, 2]: + if candidate > requested_scale: + continue + output_dim = max_dim * candidate + if output_dim <= max_output_dimension: + scale = candidate + max_input_pixels = None + break + downscale_ratio = output_dim / max_output_dimension + if downscale_ratio <= 2.0: + scale = candidate + max_input_dim = max_output_dimension // candidate + scale_ratio = max_input_dim / max_dim + max_input_pixels = int(width * height * scale_ratio * scale_ratio) + break + + if max_input_pixels is not None: + image = downscale_image_tensor(image, total_pixels=max_input_pixels) + requested_scale = scale + else: + raise ValueError( + f"Output dimensions ({output_width}x{output_height}) exceed maximum allowed " + f"resolution of {max_output_dimension}x{max_output_dimension} pixels. " + f"Use a smaller input image or lower scale factor." + ) + + initial_res = await sync_op( + cls, + ApiEndpoint(path="/proxy/freepik/v1/ai/image-upscaler-precision-v2", method="POST"), + response_model=TaskResponse, + data=ImageUpscalerPrecisionV2Request( + image=(await upload_images_to_comfyapi(cls, image, max_images=1, total_pixels=None))[0], + scale_factor=requested_scale, + flavor=flavor, + sharpen=sharpen, + smart_grain=smart_grain, + ultra_detail=ultra_detail, + ), + ) + final_response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/freepik/v1/ai/image-upscaler-precision-v2/{initial_res.task_id}"), + response_model=TaskResponse, + status_extractor=lambda x: x.status, + poll_interval=10.0, + max_poll_attempts=480, + ) + return IO.NodeOutput(await download_url_to_image_tensor(final_response.generated[0])) + + +class MagnificImageStyleTransferNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="MagnificImageStyleTransferNode", + display_name="Magnific Image Style Transfer", + category="api node/image/Magnific", + description="Transfer the style from a reference image to your input image.", + inputs=[ + IO.Image.Input("image", tooltip="The image to apply style transfer to."), + IO.Image.Input("reference_image", tooltip="The reference image to extract style from."), + IO.String.Input("prompt", multiline=True, default=""), + IO.Int.Input( + "style_strength", + min=0, + max=100, + default=100, + tooltip="Percentage of style strength.", + display_mode=IO.NumberDisplay.slider, + ), + IO.Int.Input( + "structure_strength", + min=0, + max=100, + default=50, + tooltip="Maintains the structure of the original image.", + display_mode=IO.NumberDisplay.slider, + ), + IO.Combo.Input( + "flavor", + options=["faithful", "gen_z", "psychedelia", "detaily", "clear", "donotstyle", "donotstyle_sharp"], + tooltip="Style transfer flavor.", + ), + IO.Combo.Input( + "engine", + options=[ + "balanced", + "definio", + "illusio", + "3d_cartoon", + "colorful_anime", + "caricature", + "real", + "super_real", + "softy", + ], + tooltip="Processing engine selection.", + ), + IO.DynamicCombo.Input( + "portrait_mode", + options=[ + IO.DynamicCombo.Option("disabled", []), + IO.DynamicCombo.Option( + "enabled", + [ + IO.Combo.Input( + "portrait_style", + options=["standard", "pop", "super_pop"], + tooltip="Visual style applied to portrait images.", + ), + IO.Combo.Input( + "portrait_beautifier", + options=["none", "beautify_face", "beautify_face_max"], + tooltip="Facial beautification intensity on portraits.", + ), + ], + ), + ], + tooltip="Enable portrait mode for facial enhancements.", + ), + IO.Boolean.Input( + "fixed_generation", + default=True, + tooltip="When disabled, expect each generation to introduce a degree of randomness, " + "leading to more diverse outcomes.", + ), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + expr="""{"type":"usd","usd":0.11}""", + ), + ) + + @classmethod + async def execute( + cls, + image: Input.Image, + reference_image: Input.Image, + prompt: str, + style_strength: int, + structure_strength: int, + flavor: str, + engine: str, + portrait_mode: InputPortraitMode, + fixed_generation: bool, + ) -> IO.NodeOutput: + if get_number_of_images(image) != 1: + raise ValueError("Exactly one input image is required.") + if get_number_of_images(reference_image) != 1: + raise ValueError("Exactly one reference image is required.") + validate_image_aspect_ratio(image, (1, 3), (3, 1), strict=False) + validate_image_aspect_ratio(reference_image, (1, 3), (3, 1), strict=False) + validate_image_dimensions(image, min_height=160, min_width=160) + validate_image_dimensions(reference_image, min_height=160, min_width=160) + + is_portrait = portrait_mode["portrait_mode"] == "enabled" + portrait_style = portrait_mode.get("portrait_style", "standard") + portrait_beautifier = portrait_mode.get("portrait_beautifier", "none") + + uploaded_urls = await upload_images_to_comfyapi(cls, [image, reference_image], max_images=2) + + initial_res = await sync_op( + cls, + ApiEndpoint(path="/proxy/freepik/v1/ai/image-style-transfer", method="POST"), + response_model=TaskResponse, + data=ImageStyleTransferRequest( + image=uploaded_urls[0], + reference_image=uploaded_urls[1], + prompt=prompt if prompt else None, + style_strength=style_strength, + structure_strength=structure_strength, + is_portrait=is_portrait, + portrait_style=portrait_style if is_portrait else None, + portrait_beautifier=portrait_beautifier if is_portrait and portrait_beautifier != "none" else None, + flavor=flavor, + engine=engine, + fixed_generation=fixed_generation, + ), + ) + final_response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/freepik/v1/ai/image-style-transfer/{initial_res.task_id}"), + response_model=TaskResponse, + status_extractor=lambda x: x.status, + poll_interval=10.0, + max_poll_attempts=480, + ) + return IO.NodeOutput(await download_url_to_image_tensor(final_response.generated[0])) + + +class MagnificImageRelightNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="MagnificImageRelightNode", + display_name="Magnific Image Relight", + category="api node/image/Magnific", + description="Relight an image with lighting adjustments and optional reference-based light transfer.", + inputs=[ + IO.Image.Input("image", tooltip="The image to relight."), + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Descriptive guidance for lighting. Supports emphasis notation (1-1.4).", + ), + IO.Int.Input( + "light_transfer_strength", + min=0, + max=100, + default=100, + tooltip="Intensity of light transfer application.", + display_mode=IO.NumberDisplay.slider, + ), + IO.Combo.Input( + "style", + options=[ + "standard", + "darker_but_realistic", + "clean", + "smooth", + "brighter", + "contrasted_n_hdr", + "just_composition", + ], + tooltip="Stylistic output preference.", + ), + IO.Boolean.Input( + "interpolate_from_original", + default=False, + tooltip="Restricts generation freedom to match original more closely.", + ), + IO.Boolean.Input( + "change_background", + default=True, + tooltip="Modifies background based on prompt/reference.", + ), + IO.Boolean.Input( + "preserve_details", + default=True, + tooltip="Maintains texture and fine details from original.", + ), + IO.DynamicCombo.Input( + "advanced_settings", + options=[ + IO.DynamicCombo.Option("disabled", []), + IO.DynamicCombo.Option( + "enabled", + [ + IO.Int.Input( + "whites", + min=0, + max=100, + default=50, + tooltip="Adjusts the brightest tones in the image.", + display_mode=IO.NumberDisplay.slider, + ), + IO.Int.Input( + "blacks", + min=0, + max=100, + default=50, + tooltip="Adjusts the darkest tones in the image.", + display_mode=IO.NumberDisplay.slider, + ), + IO.Int.Input( + "brightness", + min=0, + max=100, + default=50, + tooltip="Overall brightness adjustment.", + display_mode=IO.NumberDisplay.slider, + ), + IO.Int.Input( + "contrast", + min=0, + max=100, + default=50, + tooltip="Contrast adjustment.", + display_mode=IO.NumberDisplay.slider, + ), + IO.Int.Input( + "saturation", + min=0, + max=100, + default=50, + tooltip="Color saturation adjustment.", + display_mode=IO.NumberDisplay.slider, + ), + IO.Combo.Input( + "engine", + options=[ + "automatic", + "balanced", + "cool", + "real", + "illusio", + "fairy", + "colorful_anime", + "hard_transform", + "softy", + ], + tooltip="Processing engine selection.", + ), + IO.Combo.Input( + "transfer_light_a", + options=["automatic", "low", "medium", "normal", "high", "high_on_faces"], + tooltip="The intensity of light transfer.", + ), + IO.Combo.Input( + "transfer_light_b", + options=[ + "automatic", + "composition", + "straight", + "smooth_in", + "smooth_out", + "smooth_both", + "reverse_both", + "soft_in", + "soft_out", + "soft_mid", + # "strong_mid", # Commented out because requests fail when this is set. + "style_shift", + "strong_shift", + ], + tooltip="Also modifies light transfer intensity. " + "Can be combined with the previous control for varied effects.", + ), + IO.Boolean.Input( + "fixed_generation", + default=True, + tooltip="Ensures consistent output with the same settings.", + ), + ], + ), + ], + tooltip="Fine-tuning options for advanced lighting control.", + ), + IO.Image.Input( + "reference_image", + optional=True, + tooltip="Optional reference image to transfer lighting from.", + ), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + expr="""{"type":"usd","usd":0.11}""", + ), + ) + + @classmethod + async def execute( + cls, + image: Input.Image, + prompt: str, + light_transfer_strength: int, + style: str, + interpolate_from_original: bool, + change_background: bool, + preserve_details: bool, + advanced_settings: InputAdvancedSettings, + reference_image: Input.Image | None = None, + ) -> IO.NodeOutput: + if get_number_of_images(image) != 1: + raise ValueError("Exactly one input image is required.") + if reference_image is not None and get_number_of_images(reference_image) != 1: + raise ValueError("Exactly one reference image is required.") + validate_image_aspect_ratio(image, (1, 3), (3, 1), strict=False) + validate_image_dimensions(image, min_height=160, min_width=160) + if reference_image is not None: + validate_image_aspect_ratio(reference_image, (1, 3), (3, 1), strict=False) + validate_image_dimensions(reference_image, min_height=160, min_width=160) + + image_url = (await upload_images_to_comfyapi(cls, image, max_images=1))[0] + reference_url = None + if reference_image is not None: + reference_url = (await upload_images_to_comfyapi(cls, reference_image, max_images=1))[0] + + adv_settings = None + if advanced_settings["advanced_settings"] == "enabled": + adv_settings = ImageRelightAdvancedSettingsRequest( + whites=advanced_settings["whites"], + blacks=advanced_settings["blacks"], + brightness=advanced_settings["brightness"], + contrast=advanced_settings["contrast"], + saturation=advanced_settings["saturation"], + engine=advanced_settings["engine"], + transfer_light_a=advanced_settings["transfer_light_a"], + transfer_light_b=advanced_settings["transfer_light_b"], + fixed_generation=advanced_settings["fixed_generation"], + ) + + initial_res = await sync_op( + cls, + ApiEndpoint(path="/proxy/freepik/v1/ai/image-relight", method="POST"), + response_model=TaskResponse, + data=ImageRelightRequest( + image=image_url, + prompt=prompt if prompt else None, + transfer_light_from_reference_image=reference_url, + light_transfer_strength=light_transfer_strength, + interpolate_from_original=interpolate_from_original, + change_background=change_background, + style=style, + preserve_details=preserve_details, + advanced_settings=adv_settings, + ), + ) + final_response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/freepik/v1/ai/image-relight/{initial_res.task_id}"), + response_model=TaskResponse, + status_extractor=lambda x: x.status, + poll_interval=10.0, + max_poll_attempts=480, + ) + return IO.NodeOutput(await download_url_to_image_tensor(final_response.generated[0])) + + +class MagnificImageSkinEnhancerNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="MagnificImageSkinEnhancerNode", + display_name="Magnific Image Skin Enhancer", + category="api node/image/Magnific", + description="Skin enhancement for portraits with multiple processing modes.", + inputs=[ + IO.Image.Input("image", tooltip="The portrait image to enhance."), + IO.Int.Input( + "sharpen", + min=0, + max=100, + default=0, + tooltip="Sharpening intensity level.", + display_mode=IO.NumberDisplay.slider, + ), + IO.Int.Input( + "smart_grain", + min=0, + max=100, + default=2, + tooltip="Smart grain intensity level.", + display_mode=IO.NumberDisplay.slider, + ), + IO.DynamicCombo.Input( + "mode", + options=[ + IO.DynamicCombo.Option("creative", []), + IO.DynamicCombo.Option( + "faithful", + [ + IO.Int.Input( + "skin_detail", + min=0, + max=100, + default=80, + tooltip="Skin detail enhancement level.", + display_mode=IO.NumberDisplay.slider, + ), + ], + ), + IO.DynamicCombo.Option( + "flexible", + [ + IO.Combo.Input( + "optimized_for", + options=[ + "enhance_skin", + "improve_lighting", + "enhance_everything", + "transform_to_real", + "no_make_up", + ], + tooltip="Enhancement optimization target.", + ), + ], + ), + ], + tooltip="Processing mode: creative for artistic enhancement, " + "faithful for preserving original appearance, " + "flexible for targeted optimization.", + ), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends(widgets=["mode"]), + expr=""" + ( + $rates := {"creative": 0.29, "faithful": 0.37, "flexible": 0.45}; + {"type":"usd","usd": $lookup($rates, widgets.mode)} + ) + """, + ), + ) + + @classmethod + async def execute( + cls, + image: Input.Image, + sharpen: int, + smart_grain: int, + mode: InputSkinEnhancerMode, + ) -> IO.NodeOutput: + if get_number_of_images(image) != 1: + raise ValueError("Exactly one input image is required.") + validate_image_aspect_ratio(image, (1, 3), (3, 1), strict=False) + validate_image_dimensions(image, min_height=160, min_width=160) + + image_url = (await upload_images_to_comfyapi(cls, image, max_images=1, total_pixels=4096 * 4096))[0] + selected_mode = mode["mode"] + + if selected_mode == "creative": + endpoint = "creative" + data = ImageSkinEnhancerCreativeRequest( + image=image_url, + sharpen=sharpen, + smart_grain=smart_grain, + ) + elif selected_mode == "faithful": + endpoint = "faithful" + data = ImageSkinEnhancerFaithfulRequest( + image=image_url, + sharpen=sharpen, + smart_grain=smart_grain, + skin_detail=mode["skin_detail"], + ) + else: # flexible + endpoint = "flexible" + data = ImageSkinEnhancerFlexibleRequest( + image=image_url, + sharpen=sharpen, + smart_grain=smart_grain, + optimized_for=mode["optimized_for"], + ) + + initial_res = await sync_op( + cls, + ApiEndpoint(path=f"/proxy/freepik/v1/ai/skin-enhancer/{endpoint}", method="POST"), + response_model=TaskResponse, + data=data, + ) + final_response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/freepik/v1/ai/skin-enhancer/{initial_res.task_id}"), + response_model=TaskResponse, + status_extractor=lambda x: x.status, + poll_interval=10.0, + max_poll_attempts=480, + ) + return IO.NodeOutput(await download_url_to_image_tensor(final_response.generated[0])) + + +class MagnificExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + # MagnificImageUpscalerCreativeNode, + # MagnificImageUpscalerPreciseV2Node, + MagnificImageStyleTransferNode, + MagnificImageRelightNode, + MagnificImageSkinEnhancerNode, + ] + + +async def comfy_entrypoint() -> MagnificExtension: + return MagnificExtension() diff --git a/comfy_api_nodes/nodes_openai.py b/comfy_api_nodes/nodes_openai.py index a12acc06b..f05aaab7b 100644 --- a/comfy_api_nodes/nodes_openai.py +++ b/comfy_api_nodes/nodes_openai.py @@ -364,9 +364,9 @@ class OpenAIGPTImage1(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="OpenAIGPTImage1", - display_name="OpenAI GPT Image 1", + display_name="OpenAI GPT Image 1.5", category="api node/image/OpenAI", - description="Generates images synchronously via OpenAI's GPT Image 1 endpoint.", + description="Generates images synchronously via OpenAI's GPT Image endpoint.", inputs=[ IO.String.Input( "prompt", @@ -429,6 +429,7 @@ class OpenAIGPTImage1(IO.ComfyNode): IO.Combo.Input( "model", options=["gpt-image-1", "gpt-image-1.5"], + default="gpt-image-1.5", optional=True, ), ], diff --git a/comfy_api_nodes/nodes_recraft.py b/comfy_api_nodes/nodes_recraft.py index c01bcaece..3a1f32263 100644 --- a/comfy_api_nodes/nodes_recraft.py +++ b/comfy_api_nodes/nodes_recraft.py @@ -12,6 +12,8 @@ from comfy_api_nodes.apis.recraft import ( RecraftColor, RecraftColorChain, RecraftControls, + RecraftCreateStyleRequest, + RecraftCreateStyleResponse, RecraftImageGenerationRequest, RecraftImageGenerationResponse, RecraftImageSize, @@ -323,6 +325,75 @@ class RecraftStyleInfiniteStyleLibrary(IO.ComfyNode): return IO.NodeOutput(RecraftStyle(style_id=style_id)) +class RecraftCreateStyleNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftCreateStyleNode", + display_name="Recraft Create Style", + category="api node/image/Recraft", + description="Create a custom style from reference images. " + "Upload 1-5 images to use as style references. " + "Total size of all images is limited to 5 MB.", + inputs=[ + IO.Combo.Input( + "style", + options=["realistic_image", "digital_illustration"], + tooltip="The base style of the generated images.", + ), + IO.Autogrow.Input( + "images", + template=IO.Autogrow.TemplatePrefix( + IO.Image.Input("image"), + prefix="image", + min=1, + max=5, + ), + ), + ], + outputs=[ + IO.String.Output(display_name="style_id"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + expr="""{"type":"usd","usd": 0.04}""", + ), + ) + + @classmethod + async def execute( + cls, + style: str, + images: IO.Autogrow.Type, + ) -> IO.NodeOutput: + files = [] + total_size = 0 + max_total_size = 5 * 1024 * 1024 # 5 MB limit + for i, img in enumerate(list(images.values())): + file_bytes = tensor_to_bytesio(img, total_pixels=2048 * 2048, mime_type="image/webp").read() + total_size += len(file_bytes) + if total_size > max_total_size: + raise Exception("Total size of all images exceeds 5 MB limit.") + files.append((f"file{i + 1}", file_bytes)) + + response = await sync_op( + cls, + endpoint=ApiEndpoint(path="/proxy/recraft/styles", method="POST"), + response_model=RecraftCreateStyleResponse, + files=files, + data=RecraftCreateStyleRequest(style=style), + content_type="multipart/form-data", + max_retries=1, + ) + + return IO.NodeOutput(response.id) + + class RecraftTextToImageNode(IO.ComfyNode): @classmethod def define_schema(cls): @@ -395,7 +466,7 @@ class RecraftTextToImageNode(IO.ComfyNode): negative_prompt: str = None, recraft_controls: RecraftControls = None, ) -> IO.NodeOutput: - validate_string(prompt, strip_whitespace=False, max_length=1000) + validate_string(prompt, strip_whitespace=False, min_length=1, max_length=1000) default_style = RecraftStyle(RecraftStyleV3.realistic_image) if recraft_style is None: recraft_style = default_style @@ -1024,6 +1095,7 @@ class RecraftExtension(ComfyExtension): RecraftStyleV3DigitalIllustrationNode, RecraftStyleV3LogoRasterNode, RecraftStyleInfiniteStyleLibrary, + RecraftCreateStyleNode, RecraftColorRGBNode, RecraftControlsNode, ] diff --git a/comfy_api_nodes/nodes_sora.py b/comfy_api_nodes/nodes_sora.py index 87e663845..afc18bb25 100644 --- a/comfy_api_nodes/nodes_sora.py +++ b/comfy_api_nodes/nodes_sora.py @@ -149,7 +149,6 @@ class OpenAIVideoSora2(IO.ComfyNode): response_model=Sora2GenerationResponse, status_extractor=lambda x: x.status, poll_interval=8.0, - max_poll_attempts=160, estimated_duration=int(45 * (duration / 4) * model_time_multiplier), ) return IO.NodeOutput( diff --git a/comfy_api_nodes/nodes_topaz.py b/comfy_api_nodes/nodes_topaz.py index c052e7656..8fccde25a 100644 --- a/comfy_api_nodes/nodes_topaz.py +++ b/comfy_api_nodes/nodes_topaz.py @@ -203,7 +203,6 @@ class TopazImageEnhance(IO.ComfyNode): progress_extractor=lambda x: getattr(x, "progress", 0), price_extractor=lambda x: x.credits * 0.08, poll_interval=8.0, - max_poll_attempts=160, estimated_duration=60, ) diff --git a/comfy_api_nodes/nodes_vidu.py b/comfy_api_nodes/nodes_vidu.py index b9114c4bb..80de14dfe 100644 --- a/comfy_api_nodes/nodes_vidu.py +++ b/comfy_api_nodes/nodes_vidu.py @@ -2,9 +2,12 @@ from typing_extensions import override from comfy_api.latest import IO, ComfyExtension, Input from comfy_api_nodes.apis.vidu import ( + FrameSetting, SubjectReference, TaskCreationRequest, TaskCreationResponse, + TaskExtendCreationRequest, + TaskMultiFrameCreationRequest, TaskResult, TaskStatusResponse, ) @@ -14,11 +17,14 @@ from comfy_api_nodes.util import ( get_number_of_images, poll_op, sync_op, + upload_image_to_comfyapi, upload_images_to_comfyapi, + upload_video_to_comfyapi, validate_image_aspect_ratio, validate_image_dimensions, validate_images_aspect_ratio_closeness, validate_string, + validate_video_duration, ) VIDU_TEXT_TO_VIDEO = "/proxy/vidu/text2video" @@ -31,7 +37,8 @@ VIDU_GET_GENERATION_STATUS = "/proxy/vidu/tasks/%s/creations" async def execute_task( cls: type[IO.ComfyNode], vidu_endpoint: str, - payload: TaskCreationRequest, + payload: TaskCreationRequest | TaskExtendCreationRequest | TaskMultiFrameCreationRequest, + max_poll_attempts: int = 320, ) -> list[TaskResult]: task_creation_response = await sync_op( cls, @@ -47,7 +54,7 @@ async def execute_task( response_model=TaskStatusResponse, status_extractor=lambda r: r.state, progress_extractor=lambda r: r.progress, - max_poll_attempts=320, + max_poll_attempts=max_poll_attempts, ) if not response.creations: raise RuntimeError( @@ -940,6 +947,540 @@ class Vidu2StartEndToVideoNode(IO.ComfyNode): return IO.NodeOutput(await download_url_to_video_output(results[0].url)) +class ViduExtendVideoNode(IO.ComfyNode): + + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="ViduExtendVideoNode", + display_name="Vidu Video Extension", + category="api node/video/Vidu", + description="Extend an existing video by generating additional frames.", + inputs=[ + IO.DynamicCombo.Input( + "model", + options=[ + IO.DynamicCombo.Option( + "viduq2-pro", + [ + IO.Int.Input( + "duration", + default=4, + min=1, + max=7, + step=1, + display_mode=IO.NumberDisplay.slider, + tooltip="Duration of the extended video in seconds.", + ), + IO.Combo.Input( + "resolution", + options=["720p", "1080p"], + tooltip="Resolution of the output video.", + ), + ], + ), + IO.DynamicCombo.Option( + "viduq2-turbo", + [ + IO.Int.Input( + "duration", + default=4, + min=1, + max=7, + step=1, + display_mode=IO.NumberDisplay.slider, + tooltip="Duration of the extended video in seconds.", + ), + IO.Combo.Input( + "resolution", + options=["720p", "1080p"], + tooltip="Resolution of the output video.", + ), + ], + ), + ], + tooltip="Model to use for video extension.", + ), + IO.Video.Input( + "video", + tooltip="The source video to extend.", + ), + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="An optional text prompt for the extended video (max 2000 characters).", + ), + IO.Int.Input( + "seed", + default=1, + min=0, + max=2147483647, + step=1, + display_mode=IO.NumberDisplay.number, + control_after_generate=True, + ), + IO.Image.Input("end_frame", optional=True), + ], + outputs=[ + IO.Video.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends(widgets=["model", "model.duration", "model.resolution"]), + expr=""" + ( + $m := widgets.model; + $d := $lookup(widgets, "model.duration"); + $res := $lookup(widgets, "model.resolution"); + $contains($m, "pro") + ? ( + $base := $lookup({"720p": 0.15, "1080p": 0.3}, $res); + $perSec := $lookup({"720p": 0.05, "1080p": 0.075}, $res); + {"type":"usd","usd": $base + $perSec * ($d - 1)} + ) + : ( + $base := $lookup({"720p": 0.075, "1080p": 0.2}, $res); + $perSec := $lookup({"720p": 0.025, "1080p": 0.05}, $res); + {"type":"usd","usd": $base + $perSec * ($d - 1)} + ) + ) + """, + ), + ) + + @classmethod + async def execute( + cls, + model: dict, + video: Input.Video, + prompt: str, + seed: int, + end_frame: Input.Image | None = None, + ) -> IO.NodeOutput: + validate_string(prompt, max_length=2000) + validate_video_duration(video, min_duration=4, max_duration=55) + image_url = None + if end_frame is not None: + validate_image_aspect_ratio(end_frame, (1, 4), (4, 1)) + validate_image_dimensions(end_frame, min_width=128, min_height=128) + image_url = await upload_image_to_comfyapi(cls, end_frame, wait_label="Uploading end frame") + results = await execute_task( + cls, + "/proxy/vidu/extend", + TaskExtendCreationRequest( + model=model["model"], + prompt=prompt, + duration=model["duration"], + seed=seed, + resolution=model["resolution"], + video_url=await upload_video_to_comfyapi(cls, video, wait_label="Uploading video"), + images=[image_url] if image_url else None, + ), + max_poll_attempts=480, + ) + return IO.NodeOutput(await download_url_to_video_output(results[0].url)) + + +def _generate_frame_inputs(count: int) -> list: + """Generate input widgets for a given number of frames.""" + inputs = [] + for i in range(1, count + 1): + inputs.extend( + [ + IO.String.Input( + f"prompt{i}", + multiline=True, + default="", + tooltip=f"Text prompt for frame {i} transition.", + ), + IO.Image.Input( + f"end_image{i}", + tooltip=f"End frame image for segment {i}. Aspect ratio must be between 1:4 and 4:1.", + ), + IO.Int.Input( + f"duration{i}", + default=4, + min=2, + max=7, + step=1, + display_mode=IO.NumberDisplay.slider, + tooltip=f"Duration for segment {i} in seconds.", + ), + ] + ) + return inputs + + +class ViduMultiFrameVideoNode(IO.ComfyNode): + + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="ViduMultiFrameVideoNode", + display_name="Vidu Multi-Frame Video Generation", + category="api node/video/Vidu", + description="Generate a video with multiple keyframe transitions.", + inputs=[ + IO.Combo.Input("model", options=["viduq2-pro", "viduq2-turbo"]), + IO.Image.Input( + "start_image", + tooltip="The starting frame image. Aspect ratio must be between 1:4 and 4:1.", + ), + IO.Int.Input( + "seed", + default=1, + min=0, + max=2147483647, + step=1, + display_mode=IO.NumberDisplay.number, + control_after_generate=True, + ), + IO.Combo.Input("resolution", options=["720p", "1080p"]), + IO.DynamicCombo.Input( + "frames", + options=[ + IO.DynamicCombo.Option("2", _generate_frame_inputs(2)), + IO.DynamicCombo.Option("3", _generate_frame_inputs(3)), + IO.DynamicCombo.Option("4", _generate_frame_inputs(4)), + IO.DynamicCombo.Option("5", _generate_frame_inputs(5)), + IO.DynamicCombo.Option("6", _generate_frame_inputs(6)), + IO.DynamicCombo.Option("7", _generate_frame_inputs(7)), + IO.DynamicCombo.Option("8", _generate_frame_inputs(8)), + IO.DynamicCombo.Option("9", _generate_frame_inputs(9)), + ], + tooltip="Number of keyframe transitions (2-9).", + ), + ], + outputs=[ + IO.Video.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends( + widgets=[ + "model", + "resolution", + "frames", + "frames.duration1", + "frames.duration2", + "frames.duration3", + "frames.duration4", + "frames.duration5", + "frames.duration6", + "frames.duration7", + "frames.duration8", + "frames.duration9", + ] + ), + expr=""" + ( + $m := widgets.model; + $n := $number(widgets.frames); + $is1080 := widgets.resolution = "1080p"; + $d1 := $lookup(widgets, "frames.duration1"); + $d2 := $lookup(widgets, "frames.duration2"); + $d3 := $n >= 3 ? $lookup(widgets, "frames.duration3") : 0; + $d4 := $n >= 4 ? $lookup(widgets, "frames.duration4") : 0; + $d5 := $n >= 5 ? $lookup(widgets, "frames.duration5") : 0; + $d6 := $n >= 6 ? $lookup(widgets, "frames.duration6") : 0; + $d7 := $n >= 7 ? $lookup(widgets, "frames.duration7") : 0; + $d8 := $n >= 8 ? $lookup(widgets, "frames.duration8") : 0; + $d9 := $n >= 9 ? $lookup(widgets, "frames.duration9") : 0; + $totalDuration := $d1 + $d2 + $d3 + $d4 + $d5 + $d6 + $d7 + $d8 + $d9; + $contains($m, "pro") + ? ( + $base := $is1080 ? 0.3 : 0.15; + $perSec := $is1080 ? 0.075 : 0.05; + {"type":"usd","usd": $n * $base + $perSec * $totalDuration} + ) + : ( + $base := $is1080 ? 0.2 : 0.075; + $perSec := $is1080 ? 0.05 : 0.025; + {"type":"usd","usd": $n * $base + $perSec * $totalDuration} + ) + ) + """, + ), + ) + + @classmethod + async def execute( + cls, + model: str, + start_image: Input.Image, + seed: int, + resolution: str, + frames: dict, + ) -> IO.NodeOutput: + validate_image_aspect_ratio(start_image, (1, 4), (4, 1)) + frame_count = int(frames["frames"]) + image_settings: list[FrameSetting] = [] + for i in range(1, frame_count + 1): + validate_image_aspect_ratio(frames[f"end_image{i}"], (1, 4), (4, 1)) + validate_string(frames[f"prompt{i}"], max_length=2000) + start_image_url = await upload_image_to_comfyapi( + cls, + start_image, + mime_type="image/png", + wait_label="Uploading start image", + ) + for i in range(1, frame_count + 1): + image_settings.append( + FrameSetting( + prompt=frames[f"prompt{i}"], + key_image=await upload_image_to_comfyapi( + cls, + frames[f"end_image{i}"], + mime_type="image/png", + wait_label=f"Uploading end image({i})", + ), + duration=frames[f"duration{i}"], + ) + ) + results = await execute_task( + cls, + "/proxy/vidu/multiframe", + TaskMultiFrameCreationRequest( + model=model, + seed=seed, + resolution=resolution, + start_image=start_image_url, + image_settings=image_settings, + ), + max_poll_attempts=480 * frame_count, + ) + return IO.NodeOutput(await download_url_to_video_output(results[0].url)) + + +class Vidu3TextToVideoNode(IO.ComfyNode): + + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="Vidu3TextToVideoNode", + display_name="Vidu Q3 Text-to-Video Generation", + category="api node/video/Vidu", + description="Generate video from a text prompt.", + inputs=[ + IO.DynamicCombo.Input( + "model", + options=[ + IO.DynamicCombo.Option( + "viduq3-pro", + [ + IO.Combo.Input( + "aspect_ratio", + options=["16:9", "9:16", "3:4", "4:3", "1:1"], + tooltip="The aspect ratio of the output video.", + ), + IO.Combo.Input( + "resolution", + options=["720p", "1080p"], + tooltip="Resolution of the output video.", + ), + IO.Int.Input( + "duration", + default=5, + min=1, + max=16, + step=1, + display_mode=IO.NumberDisplay.slider, + tooltip="Duration of the output video in seconds.", + ), + IO.Boolean.Input( + "audio", + default=False, + tooltip="When enabled, outputs video with sound " + "(including dialogue and sound effects).", + ), + ], + ), + ], + tooltip="Model to use for video generation.", + ), + IO.String.Input( + "prompt", + multiline=True, + tooltip="A textual description for video generation, with a maximum length of 2000 characters.", + ), + IO.Int.Input( + "seed", + default=1, + min=0, + max=2147483647, + step=1, + display_mode=IO.NumberDisplay.number, + control_after_generate=True, + ), + ], + outputs=[ + IO.Video.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends(widgets=["model.duration", "model.resolution"]), + expr=""" + ( + $res := $lookup(widgets, "model.resolution"); + $base := $lookup({"720p": 0.075, "1080p": 0.1}, $res); + $perSec := $lookup({"720p": 0.025, "1080p": 0.05}, $res); + {"type":"usd","usd": $base + $perSec * ($lookup(widgets, "model.duration") - 1)} + ) + """, + ), + ) + + @classmethod + async def execute( + cls, + model: dict, + prompt: str, + seed: int, + ) -> IO.NodeOutput: + validate_string(prompt, min_length=1, max_length=2000) + results = await execute_task( + cls, + VIDU_TEXT_TO_VIDEO, + TaskCreationRequest( + model=model["model"], + prompt=prompt, + duration=model["duration"], + seed=seed, + aspect_ratio=model["aspect_ratio"], + resolution=model["resolution"], + audio=model["audio"], + ), + max_poll_attempts=640, + ) + return IO.NodeOutput(await download_url_to_video_output(results[0].url)) + + +class Vidu3ImageToVideoNode(IO.ComfyNode): + + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="Vidu3ImageToVideoNode", + display_name="Vidu Q3 Image-to-Video Generation", + category="api node/video/Vidu", + description="Generate a video from an image and an optional prompt.", + inputs=[ + IO.DynamicCombo.Input( + "model", + options=[ + IO.DynamicCombo.Option( + "viduq3-pro", + [ + IO.Combo.Input( + "resolution", + options=["720p", "1080p", "2K"], + tooltip="Resolution of the output video.", + ), + IO.Int.Input( + "duration", + default=5, + min=1, + max=16, + step=1, + display_mode=IO.NumberDisplay.slider, + tooltip="Duration of the output video in seconds.", + ), + IO.Boolean.Input( + "audio", + default=False, + tooltip="When enabled, outputs video with sound " + "(including dialogue and sound effects).", + ), + ], + ), + ], + tooltip="Model to use for video generation.", + ), + IO.Image.Input( + "image", + tooltip="An image to be used as the start frame of the generated video.", + ), + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="An optional text prompt for video generation (max 2000 characters).", + ), + IO.Int.Input( + "seed", + default=1, + min=0, + max=2147483647, + step=1, + display_mode=IO.NumberDisplay.number, + control_after_generate=True, + ), + ], + outputs=[ + IO.Video.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends(widgets=["model.duration", "model.resolution"]), + expr=""" + ( + $res := $lookup(widgets, "model.resolution"); + $base := $lookup({"720p": 0.075, "1080p": 0.275, "2k": 0.35}, $res); + $perSec := $lookup({"720p": 0.05, "1080p": 0.075, "2k": 0.075}, $res); + {"type":"usd","usd": $base + $perSec * ($lookup(widgets, "model.duration") - 1)} + ) + """, + ), + ) + + @classmethod + async def execute( + cls, + model: dict, + image: Input.Image, + prompt: str, + seed: int, + ) -> IO.NodeOutput: + validate_image_aspect_ratio(image, (1, 4), (4, 1)) + validate_string(prompt, max_length=2000) + results = await execute_task( + cls, + VIDU_IMAGE_TO_VIDEO, + TaskCreationRequest( + model=model["model"], + prompt=prompt, + duration=model["duration"], + seed=seed, + resolution=model["resolution"], + audio=model["audio"], + images=[await upload_image_to_comfyapi(cls, image)], + ), + max_poll_attempts=720, + ) + return IO.NodeOutput(await download_url_to_video_output(results[0].url)) + + class ViduExtension(ComfyExtension): @override async def get_node_list(self) -> list[type[IO.ComfyNode]]: @@ -952,6 +1493,10 @@ class ViduExtension(ComfyExtension): Vidu2ImageToVideoNode, Vidu2ReferenceVideoNode, Vidu2StartEndToVideoNode, + ViduExtendVideoNode, + ViduMultiFrameVideoNode, + Vidu3TextToVideoNode, + Vidu3ImageToVideoNode, ] diff --git a/comfy_api_nodes/util/__init__.py b/comfy_api_nodes/util/__init__.py index 364976000..c3c9ff4bf 100644 --- a/comfy_api_nodes/util/__init__.py +++ b/comfy_api_nodes/util/__init__.py @@ -13,6 +13,7 @@ from .conversions import ( bytesio_to_image_tensor, convert_mask_to_image, downscale_image_tensor, + downscale_image_tensor_by_max_side, image_tensor_pair_to_batch, pil_to_bytesio, resize_mask_to_image, @@ -33,6 +34,7 @@ from .download_helpers import ( from .upload_helpers import ( upload_audio_to_comfyapi, upload_file_to_comfyapi, + upload_image_to_comfyapi, upload_images_to_comfyapi, upload_video_to_comfyapi, ) @@ -61,6 +63,7 @@ __all__ = [ # Upload helpers "upload_audio_to_comfyapi", "upload_file_to_comfyapi", + "upload_image_to_comfyapi", "upload_images_to_comfyapi", "upload_video_to_comfyapi", # Download helpers @@ -75,6 +78,7 @@ __all__ = [ "bytesio_to_image_tensor", "convert_mask_to_image", "downscale_image_tensor", + "downscale_image_tensor_by_max_side", "image_tensor_pair_to_batch", "pil_to_bytesio", "resize_mask_to_image", diff --git a/comfy_api_nodes/util/client.py b/comfy_api_nodes/util/client.py index f372ec7b5..8a1259506 100644 --- a/comfy_api_nodes/util/client.py +++ b/comfy_api_nodes/util/client.py @@ -141,7 +141,7 @@ async def poll_op( queued_statuses: list[str | int] | None = None, data: BaseModel | None = None, poll_interval: float = 5.0, - max_poll_attempts: int = 120, + max_poll_attempts: int = 160, timeout_per_poll: float = 120.0, max_retries_per_poll: int = 3, retry_delay_per_poll: float = 1.0, @@ -238,7 +238,7 @@ async def poll_op_raw( queued_statuses: list[str | int] | None = None, data: dict[str, Any] | BaseModel | None = None, poll_interval: float = 5.0, - max_poll_attempts: int = 120, + max_poll_attempts: int = 160, timeout_per_poll: float = 120.0, max_retries_per_poll: int = 3, retry_delay_per_poll: float = 1.0, diff --git a/comfy_api_nodes/util/conversions.py b/comfy_api_nodes/util/conversions.py index 546741b7b..3e37e8a8c 100644 --- a/comfy_api_nodes/util/conversions.py +++ b/comfy_api_nodes/util/conversions.py @@ -56,15 +56,14 @@ def image_tensor_pair_to_batch(image1: torch.Tensor, image2: torch.Tensor) -> to def tensor_to_bytesio( image: torch.Tensor, *, - total_pixels: int = 2048 * 2048, + total_pixels: int | None = 2048 * 2048, mime_type: str = "image/png", ) -> BytesIO: """Converts a torch.Tensor image to a named BytesIO object. Args: image: Input torch.Tensor image. - name: Optional filename for the BytesIO object. - total_pixels: Maximum total pixels for potential downscaling. + total_pixels: Maximum total pixels for downscaling. If None, no downscaling is performed. mime_type: Target image MIME type (e.g., 'image/png', 'image/jpeg', 'image/webp', 'video/mp4'). Returns: @@ -79,13 +78,14 @@ def tensor_to_bytesio( return img_binary -def tensor_to_pil(image: torch.Tensor, total_pixels: int = 2048 * 2048) -> Image.Image: +def tensor_to_pil(image: torch.Tensor, total_pixels: int | None = 2048 * 2048) -> Image.Image: """Converts a single torch.Tensor image [H, W, C] to a PIL Image, optionally downscaling.""" if len(image.shape) > 3: image = image[0] # TODO: remove alpha if not allowed and present input_tensor = image.cpu() - input_tensor = downscale_image_tensor(input_tensor.unsqueeze(0), total_pixels=total_pixels).squeeze() + if total_pixels is not None: + input_tensor = downscale_image_tensor(input_tensor.unsqueeze(0), total_pixels=total_pixels).squeeze() image_np = (input_tensor.numpy() * 255).astype(np.uint8) img = Image.fromarray(image_np) return img @@ -93,14 +93,14 @@ def tensor_to_pil(image: torch.Tensor, total_pixels: int = 2048 * 2048) -> Image def tensor_to_base64_string( image_tensor: torch.Tensor, - total_pixels: int = 2048 * 2048, + total_pixels: int | None = 2048 * 2048, mime_type: str = "image/png", ) -> str: """Convert [B, H, W, C] or [H, W, C] tensor to a base64 string. Args: image_tensor: Input torch.Tensor image. - total_pixels: Maximum total pixels for potential downscaling. + total_pixels: Maximum total pixels for downscaling. If None, no downscaling is performed. mime_type: Target image MIME type (e.g., 'image/png', 'image/jpeg', 'image/webp', 'video/mp4'). Returns: @@ -144,16 +144,31 @@ def downscale_image_tensor(image: torch.Tensor, total_pixels: int = 1536 * 1024) return s +def downscale_image_tensor_by_max_side(image: torch.Tensor, *, max_side: int) -> torch.Tensor: + """Downscale input image tensor so the largest dimension is at most max_side pixels.""" + samples = image.movedim(-1, 1) + height, width = samples.shape[2], samples.shape[3] + max_dim = max(width, height) + if max_dim <= max_side: + return image + scale_by = max_side / max_dim + new_width = round(width * scale_by) + new_height = round(height * scale_by) + s = common_upscale(samples, new_width, new_height, "lanczos", "disabled") + s = s.movedim(1, -1) + return s + + def tensor_to_data_uri( image_tensor: torch.Tensor, - total_pixels: int = 2048 * 2048, + total_pixels: int | None = 2048 * 2048, mime_type: str = "image/png", ) -> str: """Converts a tensor image to a Data URI string. Args: image_tensor: Input torch.Tensor image. - total_pixels: Maximum total pixels for potential downscaling. + total_pixels: Maximum total pixels for downscaling. If None, no downscaling is performed. mime_type: Target image MIME type (e.g., 'image/png', 'image/jpeg', 'image/webp'). Returns: diff --git a/comfy_api_nodes/util/upload_helpers.py b/comfy_api_nodes/util/upload_helpers.py index 2794be35c..3153f2b98 100644 --- a/comfy_api_nodes/util/upload_helpers.py +++ b/comfy_api_nodes/util/upload_helpers.py @@ -49,7 +49,7 @@ async def upload_images_to_comfyapi( mime_type: str | None = None, wait_label: str | None = "Uploading", show_batch_index: bool = True, - total_pixels: int = 2048 * 2048, + total_pixels: int | None = 2048 * 2048, ) -> list[str]: """ Uploads images to ComfyUI API and returns download URLs. @@ -88,6 +88,28 @@ async def upload_images_to_comfyapi( return download_urls +async def upload_image_to_comfyapi( + cls: type[IO.ComfyNode], + image: torch.Tensor, + *, + mime_type: str | None = None, + wait_label: str | None = "Uploading", + total_pixels: int = 2048 * 2048, +) -> str: + """Uploads a single image to ComfyUI API and returns its download URL.""" + return ( + await upload_images_to_comfyapi( + cls, + image, + max_images=1, + mime_type=mime_type, + wait_label=wait_label, + show_batch_index=False, + total_pixels=total_pixels, + ) + )[0] + + async def upload_audio_to_comfyapi( cls: type[IO.ComfyNode], audio: Input.Audio, diff --git a/comfy_execution/jobs.py b/comfy_execution/jobs.py index 97fd803b8..bf091a448 100644 --- a/comfy_execution/jobs.py +++ b/comfy_execution/jobs.py @@ -171,9 +171,10 @@ def get_outputs_summary(outputs: dict) -> tuple[int, Optional[dict]]: continue for item in items: + count += 1 + if not isinstance(item, dict): continue - count += 1 if preview_output is None and is_previewable(media_type, item): enriched = { diff --git a/comfy_extras/nodes_align_your_steps.py b/comfy_extras/nodes_align_your_steps.py index edd5dadd4..4fc511d2c 100644 --- a/comfy_extras/nodes_align_your_steps.py +++ b/comfy_extras/nodes_align_your_steps.py @@ -28,6 +28,7 @@ class AlignYourStepsScheduler(io.ComfyNode): def define_schema(cls) -> io.Schema: return io.Schema( node_id="AlignYourStepsScheduler", + search_aliases=["AYS scheduler"], category="sampling/custom_sampling/schedulers", inputs=[ io.Combo.Input("model_type", options=["SD1", "SDXL", "SVD"]), diff --git a/comfy_extras/nodes_attention_multiply.py b/comfy_extras/nodes_attention_multiply.py index c0e494c2a..67c4e2ed0 100644 --- a/comfy_extras/nodes_attention_multiply.py +++ b/comfy_extras/nodes_attention_multiply.py @@ -71,6 +71,7 @@ class CLIPAttentionMultiply(io.ComfyNode): def define_schema(cls) -> io.Schema: return io.Schema( node_id="CLIPAttentionMultiply", + search_aliases=["clip attention scale", "text encoder attention"], category="_for_testing/attention_experiments", inputs=[ io.Clip.Input("clip"), diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py index 15b3aa401..271b75fbd 100644 --- a/comfy_extras/nodes_audio.py +++ b/comfy_extras/nodes_audio.py @@ -69,6 +69,7 @@ class VAEEncodeAudio(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="VAEEncodeAudio", + search_aliases=["audio to latent"], display_name="VAE Encode Audio", category="latent/audio", inputs=[ @@ -97,6 +98,7 @@ class VAEDecodeAudio(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="VAEDecodeAudio", + search_aliases=["latent to audio"], display_name="VAE Decode Audio", category="latent/audio", inputs=[ @@ -122,6 +124,7 @@ class SaveAudio(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="SaveAudio", + search_aliases=["export flac"], display_name="Save Audio (FLAC)", category="audio", inputs=[ @@ -146,6 +149,7 @@ class SaveAudioMP3(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="SaveAudioMP3", + search_aliases=["export mp3"], display_name="Save Audio (MP3)", category="audio", inputs=[ @@ -173,6 +177,7 @@ class SaveAudioOpus(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="SaveAudioOpus", + search_aliases=["export opus"], display_name="Save Audio (Opus)", category="audio", inputs=[ @@ -200,6 +205,7 @@ class PreviewAudio(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="PreviewAudio", + search_aliases=["play audio"], display_name="Preview Audio", category="audio", inputs=[ @@ -259,6 +265,7 @@ class LoadAudio(IO.ComfyNode): files = folder_paths.filter_files_content_types(os.listdir(input_dir), ["audio", "video"]) return IO.Schema( node_id="LoadAudio", + search_aliases=["import audio", "open audio", "audio file"], display_name="Load Audio", category="audio", inputs=[ @@ -296,6 +303,7 @@ class RecordAudio(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="RecordAudio", + search_aliases=["microphone input", "audio capture", "voice input"], display_name="Record Audio", category="audio", inputs=[ @@ -320,6 +328,7 @@ class TrimAudioDuration(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="TrimAudioDuration", + search_aliases=["cut audio", "audio clip", "shorten audio"], display_name="Trim Audio Duration", description="Trim audio tensor into chosen time range.", category="audio", @@ -372,6 +381,7 @@ class SplitAudioChannels(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="SplitAudioChannels", + search_aliases=["stereo to mono"], display_name="Split Audio Channels", description="Separates the audio into left and right channels.", category="audio", @@ -472,6 +482,7 @@ class AudioConcat(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="AudioConcat", + search_aliases=["join audio", "combine audio", "append audio"], display_name="Audio Concat", description="Concatenates the audio1 to audio2 in the specified direction.", category="audio", @@ -519,6 +530,7 @@ class AudioMerge(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="AudioMerge", + search_aliases=["mix audio", "overlay audio", "layer audio"], display_name="Audio Merge", description="Combine two audio tracks by overlaying their waveforms.", category="audio", @@ -579,6 +591,7 @@ class AudioAdjustVolume(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="AudioAdjustVolume", + search_aliases=["audio gain", "loudness", "audio level"], display_name="Audio Adjust Volume", category="audio", inputs=[ @@ -614,6 +627,7 @@ class EmptyAudio(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="EmptyAudio", + search_aliases=["blank audio"], display_name="Empty Audio", category="audio", inputs=[ diff --git a/comfy_extras/nodes_canny.py b/comfy_extras/nodes_canny.py index 576f3640a..6e0fadca5 100644 --- a/comfy_extras/nodes_canny.py +++ b/comfy_extras/nodes_canny.py @@ -10,6 +10,7 @@ class Canny(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="Canny", + search_aliases=["edge detection", "outline", "contour detection", "line art"], category="image/preprocessors", inputs=[ io.Image.Input("image"), diff --git a/comfy_extras/nodes_color.py b/comfy_extras/nodes_color.py new file mode 100644 index 000000000..80ba121cd --- /dev/null +++ b/comfy_extras/nodes_color.py @@ -0,0 +1,42 @@ +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io + + +class ColorToRGBInt(io.ComfyNode): + @classmethod + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="ColorToRGBInt", + display_name="Color to RGB Int", + category="utils", + description="Convert a color to a RGB integer value.", + inputs=[ + io.Color.Input("color"), + ], + outputs=[ + io.Int.Output(display_name="rgb_int"), + ], + ) + + @classmethod + def execute( + cls, + color: str, + ) -> io.NodeOutput: + # expect format #RRGGBB + if len(color) != 7 or color[0] != "#": + raise ValueError("Color must be in format #RRGGBB") + r = int(color[1:3], 16) + g = int(color[3:5], 16) + b = int(color[5:7], 16) + return io.NodeOutput(r * 256 * 256 + g * 256 + b) + + +class ColorExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ColorToRGBInt] + + +async def comfy_entrypoint() -> ColorExtension: + return ColorExtension() diff --git a/comfy_extras/nodes_compositing.py b/comfy_extras/nodes_compositing.py index e4e4e1cbc..3bc9fccb3 100644 --- a/comfy_extras/nodes_compositing.py +++ b/comfy_extras/nodes_compositing.py @@ -109,6 +109,7 @@ class PorterDuffImageComposite(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="PorterDuffImageComposite", + search_aliases=["alpha composite", "blend modes", "layer blend", "transparency blend"], display_name="Porter-Duff Image Composite", category="mask/compositing", inputs=[ @@ -165,6 +166,7 @@ class SplitImageWithAlpha(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SplitImageWithAlpha", + search_aliases=["extract alpha", "separate transparency", "remove alpha"], display_name="Split Image with Alpha", category="mask/compositing", inputs=[ @@ -188,6 +190,7 @@ class JoinImageWithAlpha(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="JoinImageWithAlpha", + search_aliases=["add transparency", "apply alpha", "composite alpha", "RGBA"], display_name="Join Image with Alpha", category="mask/compositing", inputs=[ diff --git a/comfy_extras/nodes_controlnet.py b/comfy_extras/nodes_controlnet.py index e835feed7..0c1d7f0d4 100644 --- a/comfy_extras/nodes_controlnet.py +++ b/comfy_extras/nodes_controlnet.py @@ -38,6 +38,7 @@ class ControlNetInpaintingAliMamaApply(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="ControlNetInpaintingAliMamaApply", + search_aliases=["masked controlnet"], category="conditioning/controlnet", inputs=[ io.Conditioning.Input("positive"), diff --git a/comfy_extras/nodes_custom_sampler.py b/comfy_extras/nodes_custom_sampler.py index f19adf4b9..8afd13acf 100644 --- a/comfy_extras/nodes_custom_sampler.py +++ b/comfy_extras/nodes_custom_sampler.py @@ -297,6 +297,7 @@ class ExtendIntermediateSigmas(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="ExtendIntermediateSigmas", + search_aliases=["interpolate sigmas"], category="sampling/custom_sampling/sigmas", inputs=[ io.Sigmas.Input("sigmas"), @@ -700,7 +701,14 @@ class Noise_EmptyNoise: def generate_noise(self, input_latent): latent_image = input_latent["samples"] - return torch.zeros(latent_image.shape, dtype=latent_image.dtype, layout=latent_image.layout, device="cpu") + if latent_image.is_nested: + tensors = latent_image.unbind() + zeros = [] + for t in tensors: + zeros.append(torch.zeros(t.shape, dtype=t.dtype, layout=t.layout, device="cpu")) + return comfy.nested_tensor.NestedTensor(zeros) + else: + return torch.zeros(latent_image.shape, dtype=latent_image.dtype, layout=latent_image.layout, device="cpu") class Noise_RandomNoise: @@ -740,7 +748,7 @@ class SamplerCustom(io.ComfyNode): latent = latent_image latent_image = latent["samples"] latent = latent.copy() - latent_image = comfy.sample.fix_empty_latent_channels(model, latent_image) + latent_image = comfy.sample.fix_empty_latent_channels(model, latent_image, latent.get("downscale_ratio_spacial", None)) latent["samples"] = latent_image if not add_noise: @@ -759,6 +767,7 @@ class SamplerCustom(io.ComfyNode): samples = comfy.sample.sample_custom(model, noise, cfg, sampler, sigmas, positive, negative, latent_image, noise_mask=noise_mask, callback=callback, disable_pbar=disable_pbar, seed=noise_seed) out = latent.copy() + out.pop("downscale_ratio_spacial", None) out["samples"] = samples if "x0" in x0_output: x0_out = model.model.process_latent_out(x0_output["x0"].cpu()) @@ -856,6 +865,7 @@ class DualCFGGuider(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="DualCFGGuider", + search_aliases=["dual prompt guidance"], category="sampling/custom_sampling/guiders", inputs=[ io.Model.Input("model"), @@ -883,6 +893,7 @@ class DisableNoise(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="DisableNoise", + search_aliases=["zero noise"], category="sampling/custom_sampling/noise", inputs=[], outputs=[io.Noise.Output()] @@ -936,7 +947,7 @@ class SamplerCustomAdvanced(io.ComfyNode): latent = latent_image latent_image = latent["samples"] latent = latent.copy() - latent_image = comfy.sample.fix_empty_latent_channels(guider.model_patcher, latent_image) + latent_image = comfy.sample.fix_empty_latent_channels(guider.model_patcher, latent_image, latent.get("downscale_ratio_spacial", None)) latent["samples"] = latent_image noise_mask = None @@ -951,6 +962,7 @@ class SamplerCustomAdvanced(io.ComfyNode): samples = samples.to(comfy.model_management.intermediate_device()) out = latent.copy() + out.pop("downscale_ratio_spacial", None) out["samples"] = samples if "x0" in x0_output: x0_out = guider.model_patcher.model.process_latent_out(x0_output["x0"].cpu()) @@ -1019,6 +1031,7 @@ class ManualSigmas(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="ManualSigmas", + search_aliases=["custom noise schedule", "define sigmas"], category="_for_testing/custom_sampling", is_experimental=True, inputs=[ diff --git a/comfy_extras/nodes_dataset.py b/comfy_extras/nodes_dataset.py index 5ef851bd0..fb9409ac3 100644 --- a/comfy_extras/nodes_dataset.py +++ b/comfy_extras/nodes_dataset.py @@ -1223,11 +1223,11 @@ class ResolutionBucket(io.ComfyNode): class MakeTrainingDataset(io.ComfyNode): """Encode images with VAE and texts with CLIP to create a training dataset.""" - @classmethod def define_schema(cls): return io.Schema( node_id="MakeTrainingDataset", + search_aliases=["encode dataset"], display_name="Make Training Dataset", category="dataset", is_experimental=True, @@ -1309,11 +1309,11 @@ class MakeTrainingDataset(io.ComfyNode): class SaveTrainingDataset(io.ComfyNode): """Save encoded training dataset (latents + conditioning) to disk.""" - @classmethod def define_schema(cls): return io.Schema( node_id="SaveTrainingDataset", + search_aliases=["export training data"], display_name="Save Training Dataset", category="dataset", is_experimental=True, @@ -1410,11 +1410,11 @@ class SaveTrainingDataset(io.ComfyNode): class LoadTrainingDataset(io.ComfyNode): """Load encoded training dataset from disk.""" - @classmethod def define_schema(cls): return io.Schema( node_id="LoadTrainingDataset", + search_aliases=["import dataset", "training data"], display_name="Load Training Dataset", category="dataset", is_experimental=True, diff --git a/comfy_extras/nodes_differential_diffusion.py b/comfy_extras/nodes_differential_diffusion.py index 6dfdf466c..34ffb9a89 100644 --- a/comfy_extras/nodes_differential_diffusion.py +++ b/comfy_extras/nodes_differential_diffusion.py @@ -11,6 +11,7 @@ class DifferentialDiffusion(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="DifferentialDiffusion", + search_aliases=["inpaint gradient", "variable denoise strength"], display_name="Differential Diffusion", category="_for_testing", inputs=[ diff --git a/comfy_extras/nodes_easycache.py b/comfy_extras/nodes_easycache.py index 11b23ffdb..90d730df6 100644 --- a/comfy_extras/nodes_easycache.py +++ b/comfy_extras/nodes_easycache.py @@ -29,8 +29,10 @@ def easycache_forward_wrapper(executor, *args, **kwargs): do_easycache = easycache.should_do_easycache(sigmas) if do_easycache: easycache.check_metadata(x) + # if there isn't a cache diff for current conds, we cannot skip this step + can_apply_cache_diff = easycache.can_apply_cache_diff(uuids) # if first cond marked this step for skipping, skip it and use appropriate cached values - if easycache.skip_current_step: + if easycache.skip_current_step and can_apply_cache_diff: if easycache.verbose: logging.info(f"EasyCache [verbose] - was marked to skip this step by {easycache.first_cond_uuid}. Present uuids: {uuids}") return easycache.apply_cache_diff(x, uuids) @@ -44,7 +46,7 @@ def easycache_forward_wrapper(executor, *args, **kwargs): if easycache.has_output_prev_norm() and easycache.has_relative_transformation_rate(): approx_output_change_rate = (easycache.relative_transformation_rate * input_change) / easycache.output_prev_norm easycache.cumulative_change_rate += approx_output_change_rate - if easycache.cumulative_change_rate < easycache.reuse_threshold: + if easycache.cumulative_change_rate < easycache.reuse_threshold and can_apply_cache_diff: if easycache.verbose: logging.info(f"EasyCache [verbose] - skipping step; cumulative_change_rate: {easycache.cumulative_change_rate}, reuse_threshold: {easycache.reuse_threshold}") # other conds should also skip this step, and instead use their cached values @@ -240,6 +242,9 @@ class EasyCacheHolder: return to_return.clone() return to_return + def can_apply_cache_diff(self, uuids: list[UUID]) -> bool: + return all(uuid in self.uuid_cache_diffs for uuid in uuids) + def apply_cache_diff(self, x: torch.Tensor, uuids: list[UUID]): if self.first_cond_uuid in uuids: self.total_steps_skipped += 1 diff --git a/comfy_extras/nodes_fresca.py b/comfy_extras/nodes_fresca.py index f308eb0c1..3d590af4b 100644 --- a/comfy_extras/nodes_fresca.py +++ b/comfy_extras/nodes_fresca.py @@ -58,6 +58,7 @@ class FreSca(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="FreSca", + search_aliases=["frequency guidance"], display_name="FreSca", category="_for_testing", description="Applies frequency-dependent scaling to the guidance", diff --git a/comfy_extras/nodes_hidream.py b/comfy_extras/nodes_hidream.py index eee683ee1..e345fe51d 100644 --- a/comfy_extras/nodes_hidream.py +++ b/comfy_extras/nodes_hidream.py @@ -38,6 +38,7 @@ class CLIPTextEncodeHiDream(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="CLIPTextEncodeHiDream", + search_aliases=["hidream prompt"], category="advanced/conditioning", inputs=[ io.Clip.Input("clip"), diff --git a/comfy_extras/nodes_hooks.py b/comfy_extras/nodes_hooks.py index 1edc06f3d..58e511ef5 100644 --- a/comfy_extras/nodes_hooks.py +++ b/comfy_extras/nodes_hooks.py @@ -259,6 +259,7 @@ class SetClipHooks: return (clip,) class ConditioningTimestepsRange: + SEARCH_ALIASES = ["prompt scheduling", "timestep segments", "conditioning phases"] NodeId = 'ConditioningTimestepsRange' NodeName = 'Timesteps Range' @classmethod @@ -468,6 +469,7 @@ class SetHookKeyframes: return (hooks,) class CreateHookKeyframe: + SEARCH_ALIASES = ["hook scheduling", "strength animation", "timed hook"] NodeId = 'CreateHookKeyframe' NodeName = 'Create Hook Keyframe' @classmethod @@ -497,6 +499,7 @@ class CreateHookKeyframe: return (prev_hook_kf,) class CreateHookKeyframesInterpolated: + SEARCH_ALIASES = ["ease hook strength", "smooth hook transition", "interpolate keyframes"] NodeId = 'CreateHookKeyframesInterpolated' NodeName = 'Create Hook Keyframes Interp.' @classmethod @@ -544,6 +547,7 @@ class CreateHookKeyframesInterpolated: return (prev_hook_kf,) class CreateHookKeyframesFromFloats: + SEARCH_ALIASES = ["batch keyframes", "strength list to keyframes"] NodeId = 'CreateHookKeyframesFromFloats' NodeName = 'Create Hook Keyframes From Floats' @classmethod @@ -618,6 +622,7 @@ class SetModelHooksOnCond: # Combine Hooks #------------------------------------------ class CombineHooks: + SEARCH_ALIASES = ["merge hooks"] NodeId = 'CombineHooks2' NodeName = 'Combine Hooks [2]' @classmethod diff --git a/comfy_extras/nodes_hunyuan.py b/comfy_extras/nodes_hunyuan.py index ceff657d3..774da75a3 100644 --- a/comfy_extras/nodes_hunyuan.py +++ b/comfy_extras/nodes_hunyuan.py @@ -56,7 +56,7 @@ class EmptyHunyuanLatentVideo(io.ComfyNode): @classmethod def execute(cls, width, height, length, batch_size=1) -> io.NodeOutput: latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) - return io.NodeOutput({"samples":latent}) + return io.NodeOutput({"samples": latent, "downscale_ratio_spacial": 8}) generate = execute # TODO: remove @@ -73,7 +73,7 @@ class EmptyHunyuanVideo15Latent(EmptyHunyuanLatentVideo): def execute(cls, width, height, length, batch_size=1) -> io.NodeOutput: # Using scale factor of 16 instead of 8 latent = torch.zeros([batch_size, 32, ((length - 1) // 4) + 1, height // 16, width // 16], device=comfy.model_management.intermediate_device()) - return io.NodeOutput({"samples": latent}) + return io.NodeOutput({"samples": latent, "downscale_ratio_spacial": 16}) class HunyuanVideo15ImageToVideo(io.ComfyNode): diff --git a/comfy_extras/nodes_hunyuan3d.py b/comfy_extras/nodes_hunyuan3d.py index adca14f62..5bb5df48e 100644 --- a/comfy_extras/nodes_hunyuan3d.py +++ b/comfy_extras/nodes_hunyuan3d.py @@ -618,6 +618,7 @@ class SaveGLB(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="SaveGLB", + search_aliases=["export 3d model", "save mesh"], category="3d", is_output_node=True, inputs=[ diff --git a/comfy_extras/nodes_images.py b/comfy_extras/nodes_images.py index ce21caade..cb4fb24a1 100644 --- a/comfy_extras/nodes_images.py +++ b/comfy_extras/nodes_images.py @@ -22,6 +22,7 @@ class ImageCrop(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="ImageCrop", + search_aliases=["trim"], display_name="Image Crop", category="image/transform", inputs=[ @@ -51,6 +52,7 @@ class RepeatImageBatch(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="RepeatImageBatch", + search_aliases=["duplicate image", "clone image"], category="image/batch", inputs=[ IO.Image.Input("image"), @@ -72,6 +74,7 @@ class ImageFromBatch(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="ImageFromBatch", + search_aliases=["select image", "pick from batch", "extract image"], category="image/batch", inputs=[ IO.Image.Input("image"), @@ -97,6 +100,7 @@ class ImageAddNoise(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="ImageAddNoise", + search_aliases=["film grain"], category="image", inputs=[ IO.Image.Input("image"), @@ -194,11 +198,11 @@ class SaveAnimatedPNG(IO.ComfyNode): class ImageStitch(IO.ComfyNode): """Upstreamed from https://github.com/kijai/ComfyUI-KJNodes""" - @classmethod def define_schema(cls): return IO.Schema( node_id="ImageStitch", + search_aliases=["combine images", "join images", "concatenate images", "side by side"], display_name="Image Stitch", description="Stitches image2 to image1 in the specified direction.\n" "If image2 is not provided, returns image1 unchanged.\n" @@ -369,11 +373,11 @@ class ImageStitch(IO.ComfyNode): class ResizeAndPadImage(IO.ComfyNode): - @classmethod def define_schema(cls): return IO.Schema( node_id="ResizeAndPadImage", + search_aliases=["fit to size"], category="image/transform", inputs=[ IO.Image.Input("image"), @@ -420,11 +424,11 @@ class ResizeAndPadImage(IO.ComfyNode): class SaveSVGNode(IO.ComfyNode): - @classmethod def define_schema(cls): return IO.Schema( node_id="SaveSVGNode", + search_aliases=["export vector", "save vector graphics"], description="Save SVG files on disk.", category="image/save", inputs=[ @@ -492,11 +496,11 @@ class SaveSVGNode(IO.ComfyNode): class GetImageSize(IO.ComfyNode): - @classmethod def define_schema(cls): return IO.Schema( node_id="GetImageSize", + search_aliases=["dimensions", "resolution", "image info"], display_name="Get Image Size", description="Returns width and height of the image, and passes it through unchanged.", category="image", @@ -527,11 +531,11 @@ class GetImageSize(IO.ComfyNode): class ImageRotate(IO.ComfyNode): - @classmethod def define_schema(cls): return IO.Schema( node_id="ImageRotate", + search_aliases=["turn", "flip orientation"], category="image/transform", inputs=[ IO.Image.Input("image"), @@ -557,11 +561,11 @@ class ImageRotate(IO.ComfyNode): class ImageFlip(IO.ComfyNode): - @classmethod def define_schema(cls): return IO.Schema( node_id="ImageFlip", + search_aliases=["mirror", "reflect"], category="image/transform", inputs=[ IO.Image.Input("image"), diff --git a/comfy_extras/nodes_kandinsky5.py b/comfy_extras/nodes_kandinsky5.py index 9cb234be1..346c50cde 100644 --- a/comfy_extras/nodes_kandinsky5.py +++ b/comfy_extras/nodes_kandinsky5.py @@ -104,6 +104,7 @@ class CLIPTextEncodeKandinsky5(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="CLIPTextEncodeKandinsky5", + search_aliases=["kandinsky prompt"], category="advanced/conditioning/kandinsky5", inputs=[ io.Clip.Input("clip"), diff --git a/comfy_extras/nodes_latent.py b/comfy_extras/nodes_latent.py index 9ba1c4ba8..6aecf1561 100644 --- a/comfy_extras/nodes_latent.py +++ b/comfy_extras/nodes_latent.py @@ -21,6 +21,7 @@ class LatentAdd(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LatentAdd", + search_aliases=["combine latents", "sum latents"], category="latent/advanced", inputs=[ io.Latent.Input("samples1"), @@ -47,6 +48,7 @@ class LatentSubtract(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LatentSubtract", + search_aliases=["difference latent", "remove features"], category="latent/advanced", inputs=[ io.Latent.Input("samples1"), @@ -73,6 +75,7 @@ class LatentMultiply(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LatentMultiply", + search_aliases=["scale latent", "amplify latent", "latent gain"], category="latent/advanced", inputs=[ io.Latent.Input("samples"), @@ -96,6 +99,7 @@ class LatentInterpolate(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LatentInterpolate", + search_aliases=["blend latent", "mix latent", "lerp latent", "transition"], category="latent/advanced", inputs=[ io.Latent.Input("samples1"), @@ -134,6 +138,7 @@ class LatentConcat(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LatentConcat", + search_aliases=["join latents", "stitch latents"], category="latent/advanced", inputs=[ io.Latent.Input("samples1"), @@ -173,6 +178,7 @@ class LatentCut(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LatentCut", + search_aliases=["crop latent", "slice latent", "extract region"], category="latent/advanced", inputs=[ io.Latent.Input("samples"), @@ -213,6 +219,7 @@ class LatentCutToBatch(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LatentCutToBatch", + search_aliases=["slice to batch", "split latent", "tile latent"], category="latent/advanced", inputs=[ io.Latent.Input("samples"), @@ -254,6 +261,7 @@ class LatentBatch(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LatentBatch", + search_aliases=["combine latents", "merge latents", "join latents"], category="latent/batch", is_deprecated=True, inputs=[ @@ -310,6 +318,7 @@ class LatentApplyOperation(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LatentApplyOperation", + search_aliases=["transform latent"], category="latent/advanced/operations", is_experimental=True, inputs=[ @@ -365,6 +374,7 @@ class LatentOperationTonemapReinhard(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LatentOperationTonemapReinhard", + search_aliases=["hdr latent"], category="latent/advanced/operations", is_experimental=True, inputs=[ diff --git a/comfy_extras/nodes_load_3d.py b/comfy_extras/nodes_load_3d.py index 545588ef8..4b8d950ae 100644 --- a/comfy_extras/nodes_load_3d.py +++ b/comfy_extras/nodes_load_3d.py @@ -24,7 +24,7 @@ class Load3D(IO.ComfyNode): files = [ normalize_path(str(file_path.relative_to(base_path))) for file_path in input_path.rglob("*") - if file_path.suffix.lower() in {'.gltf', '.glb', '.obj', '.fbx', '.stl'} + if file_path.suffix.lower() in {'.gltf', '.glb', '.obj', '.fbx', '.stl', '.spz', '.splat', '.ply', '.ksplat'} ] return IO.Schema( node_id="Load3D", @@ -75,6 +75,7 @@ class Preview3D(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="Preview3D", + search_aliases=["view mesh", "3d viewer"], display_name="Preview 3D & Animation", category="3d", is_experimental=True, diff --git a/comfy_extras/nodes_logic.py b/comfy_extras/nodes_logic.py index eb888316a..c066064ac 100644 --- a/comfy_extras/nodes_logic.py +++ b/comfy_extras/nodes_logic.py @@ -104,19 +104,23 @@ class CustomComboNode(io.ComfyNode): category="utils", is_experimental=True, inputs=[io.Combo.Input("choice", options=[])], - outputs=[io.String.Output()] + outputs=[ + io.String.Output(display_name="STRING"), + io.Int.Output(display_name="INDEX"), + ], + accept_all_inputs=True, ) @classmethod - def validate_inputs(cls, choice: io.Combo.Type) -> bool: + def validate_inputs(cls, choice: io.Combo.Type, index: int = 0, **kwargs) -> bool: # NOTE: DO NOT DO THIS unless you want to skip validation entirely on the node's inputs. # I am doing that here because the widgets (besides the combo dropdown) on this node are fully frontend defined. # I need to skip checking that the chosen combo option is in the options list, since those are defined by the user. return True @classmethod - def execute(cls, choice: io.Combo.Type) -> io.NodeOutput: - return io.NodeOutput(choice) + def execute(cls, choice: io.Combo.Type, index: int = 0, **kwargs) -> io.NodeOutput: + return io.NodeOutput(choice, index) class DCTestNode(io.ComfyNode): @@ -224,6 +228,7 @@ class ConvertStringToComboNode(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="ConvertStringToComboNode", + search_aliases=["string to dropdown", "text to combo"], display_name="Convert String to Combo", category="logic", inputs=[io.String.Input("string")], @@ -239,6 +244,7 @@ class InvertBooleanNode(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="InvertBooleanNode", + search_aliases=["not", "toggle", "negate", "flip boolean"], display_name="Invert Boolean", category="logic", inputs=[io.Boolean.Input("boolean")], diff --git a/comfy_extras/nodes_lora_debug.py b/comfy_extras/nodes_lora_debug.py new file mode 100644 index 000000000..937a0fbfb --- /dev/null +++ b/comfy_extras/nodes_lora_debug.py @@ -0,0 +1,79 @@ +import folder_paths +import comfy.utils +import comfy.sd + + +class LoraLoaderBypass: + """ + Apply LoRA in bypass mode without modifying base model weights. + + Bypass mode computes: output = base_forward(x) + lora_path(x) + This is useful for training and when model weights are offloaded. + """ + + def __init__(self): + self.loaded_lora = None + + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "model": ("MODEL", {"tooltip": "The diffusion model the LoRA will be applied to."}), + "clip": ("CLIP", {"tooltip": "The CLIP model the LoRA will be applied to."}), + "lora_name": (folder_paths.get_filename_list("loras"), {"tooltip": "The name of the LoRA."}), + "strength_model": ("FLOAT", {"default": 1.0, "min": -100.0, "max": 100.0, "step": 0.01, "tooltip": "How strongly to modify the diffusion model. This value can be negative."}), + "strength_clip": ("FLOAT", {"default": 1.0, "min": -100.0, "max": 100.0, "step": 0.01, "tooltip": "How strongly to modify the CLIP model. This value can be negative."}), + } + } + + RETURN_TYPES = ("MODEL", "CLIP") + OUTPUT_TOOLTIPS = ("The modified diffusion model.", "The modified CLIP model.") + FUNCTION = "load_lora" + + CATEGORY = "loaders" + DESCRIPTION = "Apply LoRA in bypass mode. Unlike regular LoRA, this doesn't modify model weights - instead it injects the LoRA computation during forward pass. Useful for training scenarios." + EXPERIMENTAL = True + + def load_lora(self, model, clip, lora_name, strength_model, strength_clip): + if strength_model == 0 and strength_clip == 0: + return (model, clip) + + lora_path = folder_paths.get_full_path_or_raise("loras", lora_name) + lora = None + if self.loaded_lora is not None: + if self.loaded_lora[0] == lora_path: + lora = self.loaded_lora[1] + else: + self.loaded_lora = None + + if lora is None: + lora = comfy.utils.load_torch_file(lora_path, safe_load=True) + self.loaded_lora = (lora_path, lora) + + model_lora, clip_lora = comfy.sd.load_bypass_lora_for_models(model, clip, lora, strength_model, strength_clip) + return (model_lora, clip_lora) + + +class LoraLoaderBypassModelOnly(LoraLoaderBypass): + @classmethod + def INPUT_TYPES(s): + return {"required": { "model": ("MODEL",), + "lora_name": (folder_paths.get_filename_list("loras"), ), + "strength_model": ("FLOAT", {"default": 1.0, "min": -100.0, "max": 100.0, "step": 0.01}), + }} + RETURN_TYPES = ("MODEL",) + FUNCTION = "load_lora_model_only" + + def load_lora_model_only(self, model, lora_name, strength_model): + return (self.load_lora(model, None, lora_name, strength_model, 0)[0],) + + +NODE_CLASS_MAPPINGS = { + "LoraLoaderBypass": LoraLoaderBypass, + "LoraLoaderBypassModelOnly": LoraLoaderBypassModelOnly, +} + +NODE_DISPLAY_NAME_MAPPINGS = { + "LoraLoaderBypass": "Load LoRA (Bypass) (For debugging)", + "LoraLoaderBypassModelOnly": "Load LoRA (Bypass, Model Only) (for debugging)", +} diff --git a/comfy_extras/nodes_lora_extract.py b/comfy_extras/nodes_lora_extract.py index a2375cba7..fb89e03f4 100644 --- a/comfy_extras/nodes_lora_extract.py +++ b/comfy_extras/nodes_lora_extract.py @@ -78,6 +78,7 @@ class LoraSave(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LoraSave", + search_aliases=["export lora"], display_name="Extract and Save Lora", category="_for_testing", inputs=[ diff --git a/comfy_extras/nodes_lt.py b/comfy_extras/nodes_lt.py index b91a22309..2aec62f61 100644 --- a/comfy_extras/nodes_lt.py +++ b/comfy_extras/nodes_lt.py @@ -223,11 +223,24 @@ class LTXVAddGuide(io.ComfyNode): return frame_idx, latent_idx @classmethod - def add_keyframe_index(cls, cond, frame_idx, guiding_latent, scale_factors): + def add_keyframe_index(cls, cond, frame_idx, guiding_latent, scale_factors, latent_downscale_factor=1): keyframe_idxs, _ = get_keyframe_idxs(cond) _, latent_coords = cls.PATCHIFIER.patchify(guiding_latent) pixel_coords = latent_to_pixel_coords(latent_coords, scale_factors, causal_fix=frame_idx == 0) # we need the causal fix only if we're placing the new latents at index 0 pixel_coords[:, 0] += frame_idx + + # The following adjusts keyframe end positions for small grid IC-LoRA. + # After dilation, the small grid has the same size and position as the large grid, + # but each token encodes a larger image patch. We adjust the end position (not start) + # so that RoPE represents the correct middle point of each token. + # keyframe_idxs dims: (batch, spatial_dim [t,h,w], token_id, [start, end]) + # We only adjust h,w (not t) in dim 1, and only end (not start) in dim 3. + spatial_end_offset = (latent_downscale_factor - 1) * torch.tensor( + scale_factors[1:], + device=pixel_coords.device, + ).view(1, -1, 1, 1) + pixel_coords[:, 1:, :, 1:] += spatial_end_offset.to(pixel_coords.dtype) + if keyframe_idxs is None: keyframe_idxs = pixel_coords else: @@ -235,12 +248,12 @@ class LTXVAddGuide(io.ComfyNode): return node_helpers.conditioning_set_values(cond, {"keyframe_idxs": keyframe_idxs}) @classmethod - def append_keyframe(cls, positive, negative, frame_idx, latent_image, noise_mask, guiding_latent, strength, scale_factors, guide_mask=None, in_channels=128): + def append_keyframe(cls, positive, negative, frame_idx, latent_image, noise_mask, guiding_latent, strength, scale_factors, guide_mask=None, in_channels=128, latent_downscale_factor=1): if latent_image.shape[1] != in_channels or guiding_latent.shape[1] != in_channels: raise ValueError("Adding guide to a combined AV latent is not supported.") - positive = cls.add_keyframe_index(positive, frame_idx, guiding_latent, scale_factors) - negative = cls.add_keyframe_index(negative, frame_idx, guiding_latent, scale_factors) + positive = cls.add_keyframe_index(positive, frame_idx, guiding_latent, scale_factors, latent_downscale_factor) + negative = cls.add_keyframe_index(negative, frame_idx, guiding_latent, scale_factors, latent_downscale_factor) if guide_mask is not None: target_h = max(noise_mask.shape[3], guide_mask.shape[3]) diff --git a/comfy_extras/nodes_lumina2.py b/comfy_extras/nodes_lumina2.py index 89ff2397a..2550475ae 100644 --- a/comfy_extras/nodes_lumina2.py +++ b/comfy_extras/nodes_lumina2.py @@ -79,6 +79,7 @@ class CLIPTextEncodeLumina2(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="CLIPTextEncodeLumina2", + search_aliases=["lumina prompt"], display_name="CLIP Text Encode for Lumina2", category="conditioning", description="Encodes a system prompt and a user prompt using a CLIP model into an embedding " diff --git a/comfy_extras/nodes_mask.py b/comfy_extras/nodes_mask.py index 290e6f55e..98e8fef8f 100644 --- a/comfy_extras/nodes_mask.py +++ b/comfy_extras/nodes_mask.py @@ -50,6 +50,7 @@ class LatentCompositeMasked(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="LatentCompositeMasked", + search_aliases=["overlay latent", "layer latent", "paste latent", "inpaint latent"], category="latent", inputs=[ IO.Latent.Input("destination"), @@ -78,6 +79,7 @@ class ImageCompositeMasked(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="ImageCompositeMasked", + search_aliases=["paste image", "overlay", "layer"], category="image", inputs=[ IO.Image.Input("destination"), @@ -105,6 +107,7 @@ class MaskToImage(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="MaskToImage", + search_aliases=["convert mask"], display_name="Convert Mask to Image", category="mask", inputs=[ @@ -126,6 +129,7 @@ class ImageToMask(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="ImageToMask", + search_aliases=["extract channel", "channel to mask"], display_name="Convert Image to Mask", category="mask", inputs=[ @@ -149,6 +153,7 @@ class ImageColorToMask(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="ImageColorToMask", + search_aliases=["color keying", "chroma key"], category="mask", inputs=[ IO.Image.Input("image"), @@ -194,6 +199,7 @@ class InvertMask(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="InvertMask", + search_aliases=["reverse mask", "flip mask"], category="mask", inputs=[ IO.Mask.Input("mask"), @@ -214,6 +220,7 @@ class CropMask(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="CropMask", + search_aliases=["cut mask", "extract mask region", "mask slice"], category="mask", inputs=[ IO.Mask.Input("mask"), @@ -239,6 +246,7 @@ class MaskComposite(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="MaskComposite", + search_aliases=["combine masks", "blend masks", "layer masks"], category="mask", inputs=[ IO.Mask.Input("destination"), @@ -287,6 +295,7 @@ class FeatherMask(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="FeatherMask", + search_aliases=["soft edge mask", "blur mask edges", "gradient mask edge"], category="mask", inputs=[ IO.Mask.Input("mask"), @@ -333,6 +342,7 @@ class GrowMask(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="GrowMask", + search_aliases=["expand mask", "shrink mask"], display_name="Grow Mask", category="mask", inputs=[ @@ -370,6 +380,7 @@ class ThresholdMask(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="ThresholdMask", + search_aliases=["binary mask"], category="mask", inputs=[ IO.Mask.Input("mask"), @@ -394,6 +405,7 @@ class MaskPreview(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="MaskPreview", + search_aliases=["show mask", "view mask", "inspect mask", "debug mask"], display_name="Preview Mask", category="mask", description="Saves the input images to your ComfyUI output directory.", diff --git a/comfy_extras/nodes_model_advanced.py b/comfy_extras/nodes_model_advanced.py index ae5d2c563..f22b333fc 100644 --- a/comfy_extras/nodes_model_advanced.py +++ b/comfy_extras/nodes_model_advanced.py @@ -299,6 +299,7 @@ class RescaleCFG: return (m, ) class ModelComputeDtype: + SEARCH_ALIASES = ["model precision", "change dtype"] @classmethod def INPUT_TYPES(s): return {"required": { "model": ("MODEL",), diff --git a/comfy_extras/nodes_model_merging.py b/comfy_extras/nodes_model_merging.py index f20beab7d..5384ed531 100644 --- a/comfy_extras/nodes_model_merging.py +++ b/comfy_extras/nodes_model_merging.py @@ -91,6 +91,7 @@ class CLIPMergeSimple: class CLIPSubtract: + SEARCH_ALIASES = ["clip difference", "text encoder subtract"] @classmethod def INPUT_TYPES(s): return {"required": { "clip1": ("CLIP",), @@ -113,6 +114,7 @@ class CLIPSubtract: class CLIPAdd: + SEARCH_ALIASES = ["combine clip"] @classmethod def INPUT_TYPES(s): return {"required": { "clip1": ("CLIP",), @@ -225,6 +227,7 @@ def save_checkpoint(model, clip=None, vae=None, clip_vision=None, filename_prefi comfy.sd.save_checkpoint(output_checkpoint, model, clip, vae, clip_vision, metadata=metadata, extra_keys=extra_keys) class CheckpointSave: + SEARCH_ALIASES = ["save model", "export checkpoint", "merge save"] def __init__(self): self.output_dir = folder_paths.get_output_directory() @@ -337,6 +340,7 @@ class VAESave: return {} class ModelSave: + SEARCH_ALIASES = ["export model", "checkpoint save"] def __init__(self): self.output_dir = folder_paths.get_output_directory() diff --git a/comfy_extras/nodes_model_patch.py b/comfy_extras/nodes_model_patch.py index f66d28fc9..82c4754a3 100644 --- a/comfy_extras/nodes_model_patch.py +++ b/comfy_extras/nodes_model_patch.py @@ -7,6 +7,7 @@ import comfy.model_management import comfy.ldm.common_dit import comfy.latent_formats import comfy.ldm.lumina.controlnet +from comfy.ldm.wan.model_multitalk import WanMultiTalkAttentionBlock, MultiTalkAudioProjModel class BlockWiseControlBlock(torch.nn.Module): @@ -257,6 +258,14 @@ class ModelPatchLoader: if torch.count_nonzero(ref_weight) == 0: config['broken'] = True model = comfy.ldm.lumina.controlnet.ZImage_Control(device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast, **config) + elif "audio_proj.proj1.weight" in sd: + model = MultiTalkModelPatch( + audio_window=5, context_tokens=32, vae_scale=4, + in_dim=sd["blocks.0.audio_cross_attn.proj.weight"].shape[0], + intermediate_dim=sd["audio_proj.proj1.weight"].shape[0], + out_dim=sd["audio_proj.norm.weight"].shape[0], + device=comfy.model_management.unet_offload_device(), + operations=comfy.ops.manual_cast) model.load_state_dict(sd) model = comfy.model_patcher.ModelPatcher(model, load_device=comfy.model_management.get_torch_device(), offload_device=comfy.model_management.unet_offload_device()) @@ -524,6 +533,38 @@ class USOStyleReference: return (model_patched,) +class MultiTalkModelPatch(torch.nn.Module): + def __init__( + self, + audio_window: int = 5, + intermediate_dim: int = 512, + in_dim: int = 5120, + out_dim: int = 768, + context_tokens: int = 32, + vae_scale: int = 4, + num_layers: int = 40, + + device=None, dtype=None, operations=None + ): + super().__init__() + self.audio_proj = MultiTalkAudioProjModel( + seq_len=audio_window, + seq_len_vf=audio_window+vae_scale-1, + intermediate_dim=intermediate_dim, + out_dim=out_dim, + context_tokens=context_tokens, + device=device, + dtype=dtype, + operations=operations + ) + self.blocks = torch.nn.ModuleList( + [ + WanMultiTalkAttentionBlock(in_dim, out_dim, device=device, dtype=dtype, operations=operations) + for _ in range(num_layers) + ] + ) + + NODE_CLASS_MAPPINGS = { "ModelPatchLoader": ModelPatchLoader, "QwenImageDiffsynthControlnet": QwenImageDiffsynthControlnet, diff --git a/comfy_extras/nodes_morphology.py b/comfy_extras/nodes_morphology.py index 67377e1bc..4ab2fb7e8 100644 --- a/comfy_extras/nodes_morphology.py +++ b/comfy_extras/nodes_morphology.py @@ -12,6 +12,7 @@ class Morphology(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="Morphology", + search_aliases=["erode", "dilate"], display_name="ImageMorphology", category="image/postprocessing", inputs=[ @@ -57,6 +58,7 @@ class ImageRGBToYUV(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="ImageRGBToYUV", + search_aliases=["color space conversion"], category="image/batch", inputs=[ io.Image.Input("image"), @@ -78,6 +80,7 @@ class ImageYUVToRGB(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="ImageYUVToRGB", + search_aliases=["color space conversion"], category="image/batch", inputs=[ io.Image.Input("Y"), diff --git a/comfy_extras/nodes_pixart.py b/comfy_extras/nodes_pixart.py index a23e87b1f..2f1b73e60 100644 --- a/comfy_extras/nodes_pixart.py +++ b/comfy_extras/nodes_pixart.py @@ -7,6 +7,7 @@ class CLIPTextEncodePixArtAlpha(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="CLIPTextEncodePixArtAlpha", + search_aliases=["pixart prompt"], category="advanced/conditioning", description="Encodes text and sets the resolution conditioning for PixArt Alpha. Does not apply to PixArt Sigma.", inputs=[ diff --git a/comfy_extras/nodes_post_processing.py b/comfy_extras/nodes_post_processing.py index 6011275d6..a52a90e2c 100644 --- a/comfy_extras/nodes_post_processing.py +++ b/comfy_extras/nodes_post_processing.py @@ -402,7 +402,6 @@ def scale_to_multiple_cover(input: torch.Tensor, multiple: int, scale_method: st return input[:, y0:y1, x0:x1] class ResizeImageMaskNode(io.ComfyNode): - scale_methods = ["nearest-exact", "bilinear", "area", "bicubic", "lanczos"] crop_methods = ["disabled", "center"] @@ -421,46 +420,62 @@ class ResizeImageMaskNode(io.ComfyNode): @classmethod def define_schema(cls): template = io.MatchType.Template("input_type", [io.Image, io.Mask]) - crop_combo = io.Combo.Input("crop", options=cls.crop_methods, default="center") + crop_combo = io.Combo.Input( + "crop", + options=cls.crop_methods, + default="center", + tooltip="How to handle aspect ratio mismatch: 'disabled' stretches to fit, 'center' crops to maintain aspect ratio.", + ) return io.Schema( node_id="ResizeImageMaskNode", display_name="Resize Image/Mask", + description="Resize an image or mask using various scaling methods.", category="transform", + search_aliases=["resize", "resize image", "resize mask", "scale", "scale image", "scale mask", "image resize", "change size", "dimensions", "shrink", "enlarge"], inputs=[ io.MatchType.Input("input", template=template), - io.DynamicCombo.Input("resize_type", options=[ - io.DynamicCombo.Option(ResizeType.SCALE_BY, [ - io.Float.Input("multiplier", default=1.00, min=0.01, max=8.0, step=0.01), + io.DynamicCombo.Input( + "resize_type", + tooltip="Select how to resize: by exact dimensions, scale factor, matching another image, etc.", + options=[ + io.DynamicCombo.Option(ResizeType.SCALE_DIMENSIONS, [ + io.Int.Input("width", default=512, min=0, max=MAX_RESOLUTION, step=1, tooltip="Target width in pixels. Set to 0 to auto-calculate from height while preserving aspect ratio."), + io.Int.Input("height", default=512, min=0, max=MAX_RESOLUTION, step=1, tooltip="Target height in pixels. Set to 0 to auto-calculate from width while preserving aspect ratio."), + crop_combo, ]), - io.DynamicCombo.Option(ResizeType.SCALE_DIMENSIONS, [ - io.Int.Input("width", default=512, min=0, max=MAX_RESOLUTION, step=1), - io.Int.Input("height", default=512, min=0, max=MAX_RESOLUTION, step=1), - crop_combo, + io.DynamicCombo.Option(ResizeType.SCALE_BY, [ + io.Float.Input("multiplier", default=1.00, min=0.01, max=8.0, step=0.01, tooltip="Scale factor (e.g., 2.0 doubles size, 0.5 halves size)."), ]), - io.DynamicCombo.Option(ResizeType.SCALE_LONGER_DIMENSION, [ - io.Int.Input("longer_size", default=512, min=0, max=MAX_RESOLUTION, step=1), + io.DynamicCombo.Option(ResizeType.SCALE_LONGER_DIMENSION, [ + io.Int.Input("longer_size", default=512, min=0, max=MAX_RESOLUTION, step=1, tooltip="The longer edge will be resized to this value. Aspect ratio is preserved."), ]), - io.DynamicCombo.Option(ResizeType.SCALE_SHORTER_DIMENSION, [ - io.Int.Input("shorter_size", default=512, min=0, max=MAX_RESOLUTION, step=1), + io.DynamicCombo.Option(ResizeType.SCALE_SHORTER_DIMENSION, [ + io.Int.Input("shorter_size", default=512, min=0, max=MAX_RESOLUTION, step=1, tooltip="The shorter edge will be resized to this value. Aspect ratio is preserved."), ]), - io.DynamicCombo.Option(ResizeType.SCALE_WIDTH, [ - io.Int.Input("width", default=512, min=0, max=MAX_RESOLUTION, step=1), + io.DynamicCombo.Option(ResizeType.SCALE_WIDTH, [ + io.Int.Input("width", default=512, min=0, max=MAX_RESOLUTION, step=1, tooltip="Target width in pixels. Height auto-adjusts to preserve aspect ratio."), ]), - io.DynamicCombo.Option(ResizeType.SCALE_HEIGHT, [ - io.Int.Input("height", default=512, min=0, max=MAX_RESOLUTION, step=1), + io.DynamicCombo.Option(ResizeType.SCALE_HEIGHT, [ + io.Int.Input("height", default=512, min=0, max=MAX_RESOLUTION, step=1, tooltip="Target height in pixels. Width auto-adjusts to preserve aspect ratio."), ]), - io.DynamicCombo.Option(ResizeType.SCALE_TOTAL_PIXELS, [ - io.Float.Input("megapixels", default=1.0, min=0.01, max=16.0, step=0.01), + io.DynamicCombo.Option(ResizeType.SCALE_TOTAL_PIXELS, [ + io.Float.Input("megapixels", default=1.0, min=0.01, max=16.0, step=0.01, tooltip="Target total megapixels (e.g., 1.0 ≈ 1024×1024). Aspect ratio is preserved."), ]), - io.DynamicCombo.Option(ResizeType.MATCH_SIZE, [ - io.MultiType.Input("match", [io.Image, io.Mask]), - crop_combo, + io.DynamicCombo.Option(ResizeType.MATCH_SIZE, [ + io.MultiType.Input("match", [io.Image, io.Mask], tooltip="Resize input to match the dimensions of this reference image or mask."), + crop_combo, ]), - io.DynamicCombo.Option(ResizeType.SCALE_TO_MULTIPLE, [ - io.Int.Input("multiple", default=8, min=1, max=MAX_RESOLUTION, step=1), + io.DynamicCombo.Option(ResizeType.SCALE_TO_MULTIPLE, [ + io.Int.Input("multiple", default=8, min=1, max=MAX_RESOLUTION, step=1, tooltip="Resize so width and height are divisible by this number. Useful for latent alignment (e.g., 8 or 64)."), ]), - ]), - io.Combo.Input("scale_method", options=cls.scale_methods, default="area"), + ], + ), + io.Combo.Input( + "scale_method", + options=cls.scale_methods, + default="area", + tooltip="Interpolation algorithm. 'area' is best for downscaling, 'lanczos' for upscaling, 'nearest-exact' for pixel art.", + ), ], outputs=[io.MatchType.Output(template=template, display_name="resized")] ) @@ -569,6 +584,7 @@ class BatchMasksNode(io.ComfyNode): autogrow_template = io.Autogrow.TemplatePrefix(io.Mask.Input("mask"), prefix="mask", min=2, max=50) return io.Schema( node_id="BatchMasksNode", + search_aliases=["combine masks", "stack masks", "merge masks"], display_name="Batch Masks", category="mask", inputs=[ @@ -589,6 +605,7 @@ class BatchLatentsNode(io.ComfyNode): autogrow_template = io.Autogrow.TemplatePrefix(io.Latent.Input("latent"), prefix="latent", min=2, max=50) return io.Schema( node_id="BatchLatentsNode", + search_aliases=["combine latents", "stack latents", "merge latents"], display_name="Batch Latents", category="latent", inputs=[ @@ -612,6 +629,7 @@ class BatchImagesMasksLatentsNode(io.ComfyNode): prefix="input", min=1, max=50) return io.Schema( node_id="BatchImagesMasksLatentsNode", + search_aliases=["combine batch", "merge batch", "stack inputs"], display_name="Batch Images/Masks/Latents", category="util", inputs=[ diff --git a/comfy_extras/nodes_preview_any.py b/comfy_extras/nodes_preview_any.py index 91502ebf2..b0a6f279d 100644 --- a/comfy_extras/nodes_preview_any.py +++ b/comfy_extras/nodes_preview_any.py @@ -16,7 +16,7 @@ class PreviewAny(): OUTPUT_NODE = True CATEGORY = "utils" - SEARCH_ALIASES = ["preview", "show", "display", "view", "show text", "display text", "preview text", "show output", "inspect", "debug"] + SEARCH_ALIASES = ["show output", "inspect", "debug", "print value", "show text"] def main(self, source=None): value = 'None' diff --git a/comfy_extras/nodes_sd3.py b/comfy_extras/nodes_sd3.py index 14782cb2b..736213a47 100644 --- a/comfy_extras/nodes_sd3.py +++ b/comfy_extras/nodes_sd3.py @@ -55,7 +55,7 @@ class EmptySD3LatentImage(io.ComfyNode): @classmethod def execute(cls, width, height, batch_size=1) -> io.NodeOutput: latent = torch.zeros([batch_size, 16, height // 8, width // 8], device=comfy.model_management.intermediate_device()) - return io.NodeOutput({"samples":latent}) + return io.NodeOutput({"samples": latent, "downscale_ratio_spacial": 8}) generate = execute # TODO: remove @@ -65,6 +65,7 @@ class CLIPTextEncodeSD3(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="CLIPTextEncodeSD3", + search_aliases=["sd3 prompt"], category="advanced/conditioning", inputs=[ io.Clip.Input("clip"), diff --git a/comfy_extras/nodes_string.py b/comfy_extras/nodes_string.py index a2d5f0d94..8d3e65cc5 100644 --- a/comfy_extras/nodes_string.py +++ b/comfy_extras/nodes_string.py @@ -32,6 +32,7 @@ class StringSubstring(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="StringSubstring", + search_aliases=["extract text", "text portion"], display_name="Substring", category="utils/string", inputs=[ @@ -54,6 +55,7 @@ class StringLength(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="StringLength", + search_aliases=["character count", "text size"], display_name="Length", category="utils/string", inputs=[ @@ -74,6 +76,7 @@ class CaseConverter(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="CaseConverter", + search_aliases=["text case", "uppercase", "lowercase", "capitalize"], display_name="Case Converter", category="utils/string", inputs=[ @@ -106,6 +109,7 @@ class StringTrim(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="StringTrim", + search_aliases=["clean whitespace", "remove whitespace"], display_name="Trim", category="utils/string", inputs=[ @@ -136,6 +140,7 @@ class StringReplace(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="StringReplace", + search_aliases=["find and replace", "substitute", "swap text"], display_name="Replace", category="utils/string", inputs=[ @@ -158,6 +163,7 @@ class StringContains(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="StringContains", + search_aliases=["text includes", "string includes"], display_name="Contains", category="utils/string", inputs=[ @@ -185,6 +191,7 @@ class StringCompare(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="StringCompare", + search_aliases=["text match", "string equals", "starts with", "ends with"], display_name="Compare", category="utils/string", inputs=[ @@ -220,6 +227,7 @@ class RegexMatch(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="RegexMatch", + search_aliases=["pattern match", "text contains", "string match"], display_name="Regex Match", category="utils/string", inputs=[ @@ -260,6 +268,7 @@ class RegexExtract(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="RegexExtract", + search_aliases=["pattern extract", "text parser", "parse text"], display_name="Regex Extract", category="utils/string", inputs=[ @@ -334,6 +343,7 @@ class RegexReplace(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="RegexReplace", + search_aliases=["pattern replace", "find and replace", "substitution"], display_name="Regex Replace", category="utils/string", description="Find and replace text using regex patterns.", diff --git a/comfy_extras/nodes_train.py b/comfy_extras/nodes_train.py index 364804205..024a89391 100644 --- a/comfy_extras/nodes_train.py +++ b/comfy_extras/nodes_train.py @@ -18,6 +18,7 @@ import comfy_extras.nodes_custom_sampler import folder_paths import node_helpers from comfy.weight_adapter import adapters, adapter_maps +from comfy.weight_adapter.bypass import BypassInjectionManager from comfy_api.latest import ComfyExtension, io, ui from comfy.utils import ProgressBar @@ -339,6 +340,11 @@ class TrainSampler(comfy.samplers.Sampler): self._train_step_multires_mode(model_wrap, cond, extra_args, noisegen, latent_image, dataset_size, pbar) if (i + 1) % self.grad_acc == 0: + for param_groups in self.optimizer.param_groups: + for param in param_groups["params"]: + if param.grad is None: + continue + param.grad.data = param.grad.data.to(param.data.dtype) self.optimizer.step() self.optimizer.zero_grad() ui_pbar.update(1) @@ -498,9 +504,9 @@ def _prepare_latents_and_count(latents, dtype, bucket_mode): num_images = sum(t.shape[0] for t in latents) multi_res = False # Not using multi_res path in bucket mode - logging.info(f"Bucket mode: {num_buckets} buckets, {num_images} total samples") + logging.debug(f"Bucket mode: {num_buckets} buckets, {num_images} total samples") for i, lat in enumerate(latents): - logging.info(f" Bucket {i}: shape {lat.shape}") + logging.debug(f" Bucket {i}: shape {lat.shape}") return latents, num_images, multi_res # Non-bucket mode @@ -509,7 +515,7 @@ def _prepare_latents_and_count(latents, dtype, bucket_mode): latents = [t.to(dtype) for t in latents] for latent in latents: all_shapes.add(latent.shape) - logging.info(f"Latent shapes: {all_shapes}") + logging.debug(f"Latent shapes: {all_shapes}") if len(all_shapes) > 1: multi_res = True else: @@ -545,7 +551,7 @@ def _validate_and_expand_conditioning(positive, num_images, bucket_mode): if bucket_mode: return positive # Skip validation in bucket mode - logging.info(f"Total Images: {num_images}, Total Captions: {len(positive)}") + logging.debug(f"Total Images: {num_images}, Total Captions: {len(positive)}") if len(positive) == 1 and num_images > 1: return positive * num_images elif len(positive) != num_images: @@ -596,6 +602,8 @@ def _create_weight_adapter( shape = module.weight.shape lora_params = {} + logging.debug(f"Creating weight adapter for {key} with shape {shape}") + if len(shape) >= 2: alpha = float(existing_weights.get(f"{key}.alpha", 1.0)) dora_scale = existing_weights.get(f"{key}.dora_scale", None) @@ -690,6 +698,61 @@ def _setup_lora_adapters(mp, existing_weights, algorithm, lora_dtype, rank): return lora_sd, all_weight_adapters +def _setup_lora_adapters_bypass(mp, existing_weights, algorithm, lora_dtype, rank): + """Setup LoRA adapters in bypass mode. + + In bypass mode: + - Weight adapters (lora/lokr/oft) use bypass injection (forward hook) + - Bias/norm adapters (BiasDiff) still use weight wrapper (direct modification) + + This is useful when the base model weights are quantized and cannot be + directly modified. + + Args: + mp: Model patcher + existing_weights: Dict of existing LoRA weights + algorithm: Algorithm name for new adapters + lora_dtype: dtype for LoRA weights + rank: Rank for new LoRA adapters + + Returns: + tuple: (lora_sd dict, all_weight_adapters list, bypass_manager) + """ + lora_sd = {} + all_weight_adapters = [] + bypass_manager = BypassInjectionManager() + + for n, m in mp.model.named_modules(): + if hasattr(m, "weight_function"): + if m.weight is not None: + adapter, params = _create_weight_adapter( + m, n, existing_weights, algorithm, lora_dtype, rank + ) + lora_sd.update(params) + all_weight_adapters.append(adapter) + + key = f"{n}.weight" + # BiasDiff (for 1D weights like norm) uses weight wrapper, not bypass + # Only use bypass for adapters that have h() method (lora/lokr/oft) + if isinstance(adapter, BiasDiff): + mp.add_weight_wrapper(key, adapter) + logging.debug(f"[BypassMode] Added 1D weight adapter (weight wrapper) for {key}") + else: + bypass_manager.add_adapter(key, adapter, strength=1.0) + logging.debug(f"[BypassMode] Added weight adapter (bypass) for {key}") + + if hasattr(m, "bias") and m.bias is not None: + # Bias adapters still use weight wrapper (bias is usually not quantized) + bias_adapter, bias_params = _create_bias_adapter(m, n, lora_dtype) + lora_sd.update(bias_params) + key = f"{n}.bias" + mp.add_weight_wrapper(key, bias_adapter) + all_weight_adapters.append(bias_adapter) + logging.debug(f"[BypassMode] Added bias adapter (weight wrapper) for {key}") + + return lora_sd, all_weight_adapters, bypass_manager + + def _create_optimizer(optimizer_name, parameters, learning_rate): """Create optimizer based on name. @@ -884,11 +947,13 @@ class TrainLoraNode(io.ComfyNode): default=False, tooltip="Enable resolution bucket mode. When enabled, expects pre-bucketed latents from ResolutionBucket node.", ), + io.Boolean.Input( + "bypass_mode", + default=False, + tooltip="Enable bypass mode for training. When enabled, adapters are applied via forward hooks instead of weight modification. Useful for quantized models where weights cannot be directly modified.", + ), ], outputs=[ - io.Model.Output( - display_name="model", tooltip="Model with LoRA applied" - ), io.Custom("LORA_MODEL").Output( display_name="lora", tooltip="LoRA weights" ), @@ -919,6 +984,7 @@ class TrainLoraNode(io.ComfyNode): gradient_checkpointing, existing_lora, bucket_mode, + bypass_mode, ): # Extract scalars from lists (due to is_input_list=True) model = model[0] @@ -936,6 +1002,7 @@ class TrainLoraNode(io.ComfyNode): gradient_checkpointing = gradient_checkpointing[0] existing_lora = existing_lora[0] bucket_mode = bucket_mode[0] + bypass_mode = bypass_mode[0] # Process latents based on mode if bucket_mode: @@ -968,9 +1035,16 @@ class TrainLoraNode(io.ComfyNode): existing_weights, existing_steps = _load_existing_lora(existing_lora) # Setup LoRA adapters - lora_sd, all_weight_adapters = _setup_lora_adapters( - mp, existing_weights, algorithm, lora_dtype, rank - ) + bypass_manager = None + if bypass_mode: + logging.debug("Using bypass mode for training") + lora_sd, all_weight_adapters, bypass_manager = _setup_lora_adapters_bypass( + mp, existing_weights, algorithm, lora_dtype, rank + ) + else: + lora_sd, all_weight_adapters = _setup_lora_adapters( + mp, existing_weights, algorithm, lora_dtype, rank + ) # Create optimizer and loss function optimizer = _create_optimizer( @@ -1029,6 +1103,14 @@ class TrainLoraNode(io.ComfyNode): guider = TrainGuider(mp) guider.set_conds(positive) + # Inject bypass hooks if bypass mode is enabled + bypass_injections = None + if bypass_manager is not None: + bypass_injections = bypass_manager.create_injections(mp.model) + for injection in bypass_injections: + injection.inject(mp) + logging.debug(f"[BypassMode] Injected {bypass_manager.get_hook_count()} bypass hooks") + # Run training loop try: _run_training_loop( @@ -1041,6 +1123,11 @@ class TrainLoraNode(io.ComfyNode): multi_res, ) finally: + # Eject bypass hooks if they were injected + if bypass_injections is not None: + for injection in bypass_injections: + injection.eject(mp) + logging.debug("[BypassMode] Ejected bypass hooks") for m in mp.model.modules(): unpatch(m) del train_sampler, optimizer @@ -1052,7 +1139,9 @@ class TrainLoraNode(io.ComfyNode): for param in lora_sd: lora_sd[param] = lora_sd[param].to(lora_dtype) - return io.NodeOutput(mp, lora_sd, loss_map, steps + existing_steps) + # mp in train node is highly specialized for training + # use it in inference will result in bad behavior so we don't return it + return io.NodeOutput(lora_sd, loss_map, steps + existing_steps) class LoraModelLoader(io.ComfyNode):# @@ -1101,6 +1190,7 @@ class SaveLoRA(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SaveLoRA", + search_aliases=["export lora"], display_name="Save LoRA Weights", category="loaders", is_experimental=True, @@ -1144,6 +1234,7 @@ class LossGraphNode(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LossGraphNode", + search_aliases=["training chart", "training visualization", "plot loss"], display_name="Plot Loss Graph", category="training", is_experimental=True, diff --git a/comfy_extras/nodes_video.py b/comfy_extras/nodes_video.py index c609e03da..ccf7b63d3 100644 --- a/comfy_extras/nodes_video.py +++ b/comfy_extras/nodes_video.py @@ -16,6 +16,7 @@ class SaveWEBM(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SaveWEBM", + search_aliases=["export webm"], category="image/video", is_experimental=True, inputs=[ @@ -69,6 +70,7 @@ class SaveVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SaveVideo", + search_aliases=["export video"], display_name="Save Video", category="image/video", description="Saves the input images to your ComfyUI output directory.", @@ -116,6 +118,7 @@ class CreateVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="CreateVideo", + search_aliases=["images to video"], display_name="Create Video", category="image/video", description="Create a video from images.", @@ -140,6 +143,7 @@ class GetVideoComponents(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="GetVideoComponents", + search_aliases=["extract frames", "split video", "video to images", "demux"], display_name="Get Video Components", category="image/video", description="Extracts all components from a video: frames, audio, and framerate.", @@ -167,6 +171,7 @@ class LoadVideo(io.ComfyNode): files = folder_paths.filter_files_content_types(files, ["video"]) return io.Schema( node_id="LoadVideo", + search_aliases=["import video", "open video", "video file"], display_name="Load Video", category="image/video", inputs=[ diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index d32aad98e..2ff012134 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -8,9 +8,10 @@ import comfy.latent_formats import comfy.clip_vision import json import numpy as np -from typing import Tuple +from typing import Tuple, TypedDict from typing_extensions import override from comfy_api.latest import ComfyExtension, io +import logging class WanImageToVideo(io.ComfyNode): @classmethod @@ -286,6 +287,7 @@ class WanVaceToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanVaceToVideo", + search_aliases=["video conditioning", "video control"], category="conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), @@ -704,6 +706,7 @@ class WanTrackToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanTrackToVideo", + search_aliases=["motion tracking", "trajectory video", "point tracking", "keypoint animation"], category="conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), @@ -1288,6 +1291,171 @@ class Wan22ImageToVideoLatent(io.ComfyNode): return io.NodeOutput(out_latent) +from comfy.ldm.wan.model_multitalk import InfiniteTalkOuterSampleWrapper, MultiTalkCrossAttnPatch, MultiTalkGetAttnMapPatch, project_audio_features +class WanInfiniteTalkToVideo(io.ComfyNode): + class DCValues(TypedDict): + mode: str + audio_encoder_output_2: io.AudioEncoderOutput.Type + mask: io.Mask.Type + + @classmethod + def define_schema(cls): + return io.Schema( + node_id="WanInfiniteTalkToVideo", + category="conditioning/video_models", + inputs=[ + io.DynamicCombo.Input("mode", options=[ + io.DynamicCombo.Option("single_speaker", []), + io.DynamicCombo.Option("two_speakers", [ + io.AudioEncoderOutput.Input("audio_encoder_output_2", optional=True), + io.Mask.Input("mask_1", optional=True, tooltip="Mask for the first speaker, required if using two audio inputs."), + io.Mask.Input("mask_2", optional=True, tooltip="Mask for the second speaker, required if using two audio inputs."), + ]), + ]), + io.Model.Input("model"), + io.ModelPatch.Input("model_patch"), + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.ClipVisionOutput.Input("clip_vision_output", optional=True), + io.Image.Input("start_image", optional=True), + io.AudioEncoderOutput.Input("audio_encoder_output_1"), + io.Int.Input("motion_frame_count", default=9, min=1, max=33, step=1, tooltip="Number of previous frames to use as motion context."), + io.Float.Input("audio_scale", default=1.0, min=-10.0, max=10.0, step=0.01), + io.Image.Input("previous_frames", optional=True), + ], + outputs=[ + io.Model.Output(display_name="model"), + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + io.Int.Output(display_name="trim_image"), + ], + ) + + @classmethod + def execute(cls, mode: DCValues, model, model_patch, positive, negative, vae, width, height, length, audio_encoder_output_1, motion_frame_count, + start_image=None, previous_frames=None, audio_scale=None, clip_vision_output=None, audio_encoder_output_2=None, mask_1=None, mask_2=None) -> io.NodeOutput: + + if previous_frames is not None and previous_frames.shape[0] < motion_frame_count: + raise ValueError("Not enough previous frames provided.") + + if mode["mode"] == "two_speakers": + audio_encoder_output_2 = mode["audio_encoder_output_2"] + mask_1 = mode["mask_1"] + mask_2 = mode["mask_2"] + + if audio_encoder_output_2 is not None: + if mask_1 is None or mask_2 is None: + raise ValueError("Masks must be provided if two audio encoder outputs are used.") + + ref_masks = None + if mask_1 is not None and mask_2 is not None: + if audio_encoder_output_2 is None: + raise ValueError("Second audio encoder output must be provided if two masks are used.") + ref_masks = torch.cat([mask_1, mask_2]) + + latent = torch.zeros([1, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) + if start_image is not None: + start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + image = torch.ones((length, height, width, start_image.shape[-1]), device=start_image.device, dtype=start_image.dtype) * 0.5 + image[:start_image.shape[0]] = start_image + + concat_latent_image = vae.encode(image[:, :, :, :3]) + concat_mask = torch.ones((1, 1, latent.shape[2], concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=start_image.device, dtype=start_image.dtype) + concat_mask[:, :, :((start_image.shape[0] - 1) // 4) + 1] = 0.0 + + positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent_image, "concat_mask": concat_mask}) + negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent_image, "concat_mask": concat_mask}) + + if clip_vision_output is not None: + positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output}) + negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output}) + + model_patched = model.clone() + + encoded_audio_list = [] + seq_lengths = [] + + for audio_encoder_output in [audio_encoder_output_1, audio_encoder_output_2]: + if audio_encoder_output is None: + continue + all_layers = audio_encoder_output["encoded_audio_all_layers"] + encoded_audio = torch.stack(all_layers, dim=0).squeeze(1)[1:] # shape: [num_layers, T, 512] + encoded_audio = linear_interpolation(encoded_audio, input_fps=50, output_fps=25).movedim(0, 1) # shape: [T, num_layers, 512] + encoded_audio_list.append(encoded_audio) + seq_lengths.append(encoded_audio.shape[0]) + + # Pad / combine depending on multi_audio_type + multi_audio_type = "add" + if len(encoded_audio_list) > 1: + if multi_audio_type == "para": + max_len = max(seq_lengths) + padded = [] + for emb in encoded_audio_list: + if emb.shape[0] < max_len: + pad = torch.zeros(max_len - emb.shape[0], *emb.shape[1:], dtype=emb.dtype) + emb = torch.cat([emb, pad], dim=0) + padded.append(emb) + encoded_audio_list = padded + elif multi_audio_type == "add": + total_len = sum(seq_lengths) + full_list = [] + offset = 0 + for emb, seq_len in zip(encoded_audio_list, seq_lengths): + full = torch.zeros(total_len, *emb.shape[1:], dtype=emb.dtype) + full[offset:offset+seq_len] = emb + full_list.append(full) + offset += seq_len + encoded_audio_list = full_list + + token_ref_target_masks = None + if ref_masks is not None: + token_ref_target_masks = torch.nn.functional.interpolate( + ref_masks.unsqueeze(0), size=(latent.shape[-2] // 2, latent.shape[-1] // 2), mode='nearest')[0] + token_ref_target_masks = (token_ref_target_masks > 0).view(token_ref_target_masks.shape[0], -1) + + # when extending from previous frames + if previous_frames is not None: + motion_frames = comfy.utils.common_upscale(previous_frames[-motion_frame_count:].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + frame_offset = previous_frames.shape[0] - motion_frame_count + + audio_start = frame_offset + audio_end = audio_start + length + logging.info(f"InfiniteTalk: Processing audio frames {audio_start} - {audio_end}") + + motion_frames_latent = vae.encode(motion_frames[:, :, :, :3]) + trim_image = motion_frame_count + else: + audio_start = trim_image = 0 + audio_end = length + motion_frames_latent = concat_latent_image[:, :, :1] + + audio_embed = project_audio_features(model_patch.model.audio_proj, encoded_audio_list, audio_start, audio_end).to(model_patched.model_dtype()) + model_patched.model_options["transformer_options"]["audio_embeds"] = audio_embed + + # add outer sample wrapper + model_patched.add_wrapper_with_key( + comfy.patcher_extension.WrappersMP.OUTER_SAMPLE, + "infinite_talk_outer_sample", + InfiniteTalkOuterSampleWrapper( + motion_frames_latent, + model_patch, + is_extend=previous_frames is not None, + )) + # add cross-attention patch + model_patched.set_model_patch(MultiTalkCrossAttnPatch(model_patch, audio_scale), "attn2_patch") + if token_ref_target_masks is not None: + model_patched.set_model_patch(MultiTalkGetAttnMapPatch(token_ref_target_masks), "attn1_patch") + + out_latent = {} + out_latent["samples"] = latent + return io.NodeOutput(model_patched, positive, negative, out_latent, trim_image) + + class WanExtension(ComfyExtension): @override async def get_node_list(self) -> list[type[io.ComfyNode]]: @@ -1307,6 +1475,7 @@ class WanExtension(ComfyExtension): WanHuMoImageToVideo, WanAnimateToVideo, Wan22ImageToVideoLatent, + WanInfiniteTalkToVideo, ] async def comfy_entrypoint() -> WanExtension: diff --git a/comfy_extras/nodes_wanmove.py b/comfy_extras/nodes_wanmove.py index 5f39afa46..d60baf230 100644 --- a/comfy_extras/nodes_wanmove.py +++ b/comfy_extras/nodes_wanmove.py @@ -324,6 +324,7 @@ class GenerateTracks(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="GenerateTracks", + search_aliases=["motion paths", "camera movement", "trajectory"], category="conditioning/video_models", inputs=[ io.Int.Input("width", default=832, min=16, max=4096, step=16), diff --git a/comfy_extras/nodes_webcam.py b/comfy_extras/nodes_webcam.py index 5bf80b4c6..6349ac017 100644 --- a/comfy_extras/nodes_webcam.py +++ b/comfy_extras/nodes_webcam.py @@ -5,6 +5,7 @@ MAX_RESOLUTION = nodes.MAX_RESOLUTION class WebcamCapture(nodes.LoadImage): + SEARCH_ALIASES = ["camera input", "live capture", "camera feed", "snapshot"] @classmethod def INPUT_TYPES(s): return { diff --git a/comfyui_version.py b/comfyui_version.py index 952d413db..b1ebaa115 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.10.0" +__version__ = "0.11.1" diff --git a/execution.py b/execution.py index 648f204ec..4b4f63c80 100644 --- a/execution.py +++ b/execution.py @@ -175,7 +175,7 @@ def get_input_data(inputs, class_def, unique_id, execution_list=None, dynprompt= continue obj = cached.outputs[output_index] input_data_all[x] = obj - elif input_category is not None: + elif input_category is not None or (is_v3 and class_def.ACCEPT_ALL_INPUTS): input_data_all[x] = [input_data] if is_v3: diff --git a/latent_preview.py b/latent_preview.py index d52e3f7a1..a9d777661 100644 --- a/latent_preview.py +++ b/latent_preview.py @@ -11,7 +11,7 @@ import logging default_preview_method = args.preview_method MAX_PREVIEW_RESOLUTION = args.preview_size -VIDEO_TAES = ["taehv", "lighttaew2_2", "lighttaew2_1", "lighttaehy1_5"] +VIDEO_TAES = ["taehv", "lighttaew2_2", "lighttaew2_1", "lighttaehy1_5", "taeltx_2"] def preview_to_image(latent_image, do_scale=True): if do_scale: diff --git a/manager_requirements.txt b/manager_requirements.txt index bea6d4927..c420cc48e 100644 --- a/manager_requirements.txt +++ b/manager_requirements.txt @@ -1 +1 @@ -comfyui_manager==4.0.5 +comfyui_manager==4.1b1 diff --git a/nodes.py b/nodes.py index 67b61dcfe..1cb43d9e2 100644 --- a/nodes.py +++ b/nodes.py @@ -93,6 +93,8 @@ class ConditioningCombine: return (conditioning_1 + conditioning_2, ) class ConditioningAverage : + SEARCH_ALIASES = ["blend prompts", "interpolate conditioning", "mix prompts", "style fusion", "weighted blend"] + @classmethod def INPUT_TYPES(s): return {"required": {"conditioning_to": ("CONDITIONING", ), "conditioning_from": ("CONDITIONING", ), @@ -159,6 +161,8 @@ class ConditioningConcat: return (out, ) class ConditioningSetArea: + SEARCH_ALIASES = ["regional prompt", "area prompt", "spatial conditioning", "localized prompt"] + @classmethod def INPUT_TYPES(s): return {"required": {"conditioning": ("CONDITIONING", ), @@ -217,6 +221,8 @@ class ConditioningSetAreaStrength: class ConditioningSetMask: + SEARCH_ALIASES = ["masked prompt", "regional inpaint conditioning", "mask conditioning"] + @classmethod def INPUT_TYPES(s): return {"required": {"conditioning": ("CONDITIONING", ), @@ -242,6 +248,8 @@ class ConditioningSetMask: return (c, ) class ConditioningZeroOut: + SEARCH_ALIASES = ["null conditioning", "clear conditioning"] + @classmethod def INPUT_TYPES(s): return {"required": {"conditioning": ("CONDITIONING", )}} @@ -467,6 +475,8 @@ class InpaintModelConditioning: class SaveLatent: + SEARCH_ALIASES = ["export latent"] + def __init__(self): self.output_dir = folder_paths.get_output_directory() @@ -518,6 +528,8 @@ class SaveLatent: class LoadLatent: + SEARCH_ALIASES = ["import latent", "open latent"] + @classmethod def INPUT_TYPES(s): input_dir = folder_paths.get_input_directory() @@ -554,6 +566,8 @@ class LoadLatent: class CheckpointLoader: + SEARCH_ALIASES = ["load model", "model loader"] + @classmethod def INPUT_TYPES(s): return {"required": { "config_name": (folder_paths.get_filename_list("configs"), ), @@ -593,6 +607,8 @@ class CheckpointLoaderSimple: return out[:3] class DiffusersLoader: + SEARCH_ALIASES = ["load diffusers model"] + @classmethod def INPUT_TYPES(cls): paths = [] @@ -707,7 +723,7 @@ class LoraLoaderModelOnly(LoraLoader): return (self.load_lora(model, None, lora_name, strength_model, 0)[0],) class VAELoader: - video_taes = ["taehv", "lighttaew2_2", "lighttaew2_1", "lighttaehy1_5"] + video_taes = ["taehv", "lighttaew2_2", "lighttaew2_1", "lighttaehy1_5", "taeltx_2"] image_taes = ["taesd", "taesdxl", "taesd3", "taef1"] @staticmethod def vae_list(s): @@ -1063,6 +1079,8 @@ class StyleModelLoader: class StyleModelApply: + SEARCH_ALIASES = ["style transfer"] + @classmethod def INPUT_TYPES(s): return {"required": {"conditioning": ("CONDITIONING", ), @@ -1212,10 +1230,12 @@ class EmptyLatentImage: def generate(self, width, height, batch_size=1): latent = torch.zeros([batch_size, 4, height // 8, width // 8], device=self.device) - return ({"samples":latent}, ) + return ({"samples": latent, "downscale_ratio_spacial": 8}, ) class LatentFromBatch: + SEARCH_ALIASES = ["select from batch", "pick latent", "batch subset"] + @classmethod def INPUT_TYPES(s): return {"required": { "samples": ("LATENT",), @@ -1248,6 +1268,8 @@ class LatentFromBatch: return (s,) class RepeatLatentBatch: + SEARCH_ALIASES = ["duplicate latent", "clone latent"] + @classmethod def INPUT_TYPES(s): return {"required": { "samples": ("LATENT",), @@ -1274,6 +1296,8 @@ class RepeatLatentBatch: return (s,) class LatentUpscale: + SEARCH_ALIASES = ["enlarge latent", "resize latent"] + upscale_methods = ["nearest-exact", "bilinear", "area", "bicubic", "bislerp"] crop_methods = ["disabled", "center"] @@ -1308,6 +1332,8 @@ class LatentUpscale: return (s,) class LatentUpscaleBy: + SEARCH_ALIASES = ["enlarge latent", "resize latent", "scale latent"] + upscale_methods = ["nearest-exact", "bilinear", "area", "bicubic", "bislerp"] @classmethod @@ -1351,6 +1377,8 @@ class LatentRotate: return (s,) class LatentFlip: + SEARCH_ALIASES = ["mirror latent"] + @classmethod def INPUT_TYPES(s): return {"required": { "samples": ("LATENT",), @@ -1371,6 +1399,8 @@ class LatentFlip: return (s,) class LatentComposite: + SEARCH_ALIASES = ["overlay latent", "layer latent", "paste latent"] + @classmethod def INPUT_TYPES(s): return {"required": { "samples_to": ("LATENT",), @@ -1413,6 +1443,8 @@ class LatentComposite: return (samples_out,) class LatentBlend: + SEARCH_ALIASES = ["mix latents", "interpolate latents"] + @classmethod def INPUT_TYPES(s): return {"required": { @@ -1454,6 +1486,8 @@ class LatentBlend: raise ValueError(f"Unsupported blend mode: {mode}") class LatentCrop: + SEARCH_ALIASES = ["trim latent", "cut latent"] + @classmethod def INPUT_TYPES(s): return {"required": { "samples": ("LATENT",), @@ -1504,7 +1538,7 @@ class SetLatentNoiseMask: def common_ksampler(model, seed, steps, cfg, sampler_name, scheduler, positive, negative, latent, denoise=1.0, disable_noise=False, start_step=None, last_step=None, force_full_denoise=False): latent_image = latent["samples"] - latent_image = comfy.sample.fix_empty_latent_channels(model, latent_image) + latent_image = comfy.sample.fix_empty_latent_channels(model, latent_image, latent.get("downscale_ratio_spacial", None)) if disable_noise: noise = torch.zeros(latent_image.size(), dtype=latent_image.dtype, layout=latent_image.layout, device="cpu") @@ -1522,6 +1556,7 @@ def common_ksampler(model, seed, steps, cfg, sampler_name, scheduler, positive, denoise=denoise, disable_noise=disable_noise, start_step=start_step, last_step=last_step, force_full_denoise=force_full_denoise, noise_mask=noise_mask, callback=callback, disable_pbar=disable_pbar, seed=seed) out = latent.copy() + out.pop("downscale_ratio_spacial", None) out["samples"] = samples return (out, ) @@ -1739,6 +1774,8 @@ class LoadImage: return True class LoadImageMask: + SEARCH_ALIASES = ["import mask", "alpha mask", "channel mask"] + _color_channels = ["alpha", "red", "green", "blue"] @classmethod def INPUT_TYPES(s): @@ -1789,6 +1826,8 @@ class LoadImageMask: class LoadImageOutput(LoadImage): + SEARCH_ALIASES = ["output image", "previous generation"] + @classmethod def INPUT_TYPES(s): return { @@ -1862,6 +1901,7 @@ class ImageScaleBy: return (s,) class ImageInvert: + SEARCH_ALIASES = ["reverse colors"] @classmethod def INPUT_TYPES(s): @@ -1877,6 +1917,7 @@ class ImageInvert: return (s,) class ImageBatch: + SEARCH_ALIASES = ["combine images", "merge images", "stack images"] @classmethod def INPUT_TYPES(s): @@ -1922,6 +1963,7 @@ class EmptyImage: return (torch.cat((r, g, b), dim=-1), ) class ImagePadForOutpaint: + SEARCH_ALIASES = ["extend canvas", "expand image"] @classmethod def INPUT_TYPES(s): @@ -2063,7 +2105,8 @@ NODE_DISPLAY_NAME_MAPPINGS = { "CheckpointLoader": "Load Checkpoint With Config (DEPRECATED)", "CheckpointLoaderSimple": "Load Checkpoint", "VAELoader": "Load VAE", - "LoraLoader": "Load LoRA", + "LoraLoader": "Load LoRA (Model and CLIP)", + "LoraLoaderModelOnly": "Load LoRA", "CLIPLoader": "Load CLIP", "ControlNetLoader": "Load ControlNet Model", "DiffControlNetLoader": "Load ControlNet Model (diff)", @@ -2389,6 +2432,8 @@ async def init_builtin_extra_nodes(): "nodes_wanmove.py", "nodes_image_compare.py", "nodes_zimage.py", + "nodes_lora_debug.py", + "nodes_color.py" ] import_failed = [] diff --git a/pyproject.toml b/pyproject.toml index 120b6c751..042f124e4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.10.0" +version = "0.11.1" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.10" diff --git a/requirements.txt b/requirements.txt index 3ac898de4..83960cd7c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.37.11 -comfyui-workflow-templates==0.8.15 +comfyui-workflow-templates==0.8.27 comfyui-embedded-docs==0.4.0 torch>=2.4.0 torchsde @@ -22,6 +22,7 @@ alembic SQLAlchemy>=2.0.0 av>=14.2.0 comfy-kitchen>=0.2.7 +requests #non essential dependencies: kornia>=0.7.1 diff --git a/server.py b/server.py index 1888745b7..2300393b2 100644 --- a/server.py +++ b/server.py @@ -656,6 +656,7 @@ class PromptServer(): info = {} info['input'] = obj_class.INPUT_TYPES() info['input_order'] = {key: list(value.keys()) for (key, value) in obj_class.INPUT_TYPES().items()} + info['is_input_list'] = getattr(obj_class, "INPUT_IS_LIST", False) info['output'] = obj_class.RETURN_TYPES info['output_is_list'] = obj_class.OUTPUT_IS_LIST if hasattr(obj_class, 'OUTPUT_IS_LIST') else [False] * len(obj_class.RETURN_TYPES) info['output_name'] = obj_class.RETURN_NAMES if hasattr(obj_class, 'RETURN_NAMES') else info['output'] @@ -679,6 +680,8 @@ class PromptServer(): info['deprecated'] = True if getattr(obj_class, "EXPERIMENTAL", False): info['experimental'] = True + if getattr(obj_class, "DEV_ONLY", False): + info['dev_only'] = True if hasattr(obj_class, 'API_NODE'): info['api_node'] = obj_class.API_NODE diff --git a/tests-unit/assets_test/conftest.py b/tests-unit/assets_test/conftest.py new file mode 100644 index 000000000..0a57dd7b5 --- /dev/null +++ b/tests-unit/assets_test/conftest.py @@ -0,0 +1,271 @@ +import contextlib +import json +import os +import socket +import subprocess +import sys +import tempfile +import time +from pathlib import Path +from typing import Callable, Iterator, Optional + +import pytest +import requests + + +def pytest_addoption(parser: pytest.Parser) -> None: + """ + Allow overriding the database URL used by the spawned ComfyUI process. + Priority: + 1) --db-url command line option + 2) ASSETS_TEST_DB_URL environment variable (used by CI) + 3) default: None (will use file-backed sqlite in temp dir) + """ + parser.addoption( + "--db-url", + action="store", + default=os.environ.get("ASSETS_TEST_DB_URL"), + help="SQLAlchemy DB URL (e.g. sqlite:///path/to/db.sqlite3)", + ) + + +def _free_port() -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("127.0.0.1", 0)) + return s.getsockname()[1] + + +def _make_base_dirs(root: Path) -> None: + for sub in ("models", "custom_nodes", "input", "output", "temp", "user"): + (root / sub).mkdir(parents=True, exist_ok=True) + + +def _wait_http_ready(base: str, session: requests.Session, timeout: float = 90.0) -> None: + start = time.time() + last_err = None + while time.time() - start < timeout: + try: + r = session.get(base + "/api/assets", timeout=5) + if r.status_code in (200, 400): + return + except Exception as e: + last_err = e + time.sleep(0.25) + raise RuntimeError(f"ComfyUI HTTP did not become ready: {last_err}") + + +@pytest.fixture(scope="session") +def comfy_tmp_base_dir() -> Path: + env_base = os.environ.get("ASSETS_TEST_BASE_DIR") + created_by_fixture = False + if env_base: + tmp = Path(env_base) + tmp.mkdir(parents=True, exist_ok=True) + else: + tmp = Path(tempfile.mkdtemp(prefix="comfyui-assets-tests-")) + created_by_fixture = True + _make_base_dirs(tmp) + yield tmp + if created_by_fixture: + with contextlib.suppress(Exception): + for p in sorted(tmp.rglob("*"), reverse=True): + if p.is_file() or p.is_symlink(): + p.unlink(missing_ok=True) + for p in sorted(tmp.glob("**/*"), reverse=True): + with contextlib.suppress(Exception): + p.rmdir() + tmp.rmdir() + + +@pytest.fixture(scope="session") +def comfy_url_and_proc(comfy_tmp_base_dir: Path, request: pytest.FixtureRequest): + """ + Boot ComfyUI subprocess with: + - sandbox base dir + - file-backed sqlite DB in temp dir + - autoscan disabled + Returns (base_url, process, port) + """ + port = _free_port() + db_url = request.config.getoption("--db-url") + if not db_url: + # Use a file-backed sqlite database in the temp directory + db_path = comfy_tmp_base_dir / "assets-test.sqlite3" + db_url = f"sqlite:///{db_path}" + + logs_dir = comfy_tmp_base_dir / "logs" + logs_dir.mkdir(exist_ok=True) + out_log = open(logs_dir / "stdout.log", "w", buffering=1) + err_log = open(logs_dir / "stderr.log", "w", buffering=1) + + comfy_root = Path(__file__).resolve().parent.parent.parent + if not (comfy_root / "main.py").is_file(): + raise FileNotFoundError(f"main.py not found under {comfy_root}") + + proc = subprocess.Popen( + args=[ + sys.executable, + "main.py", + f"--base-directory={str(comfy_tmp_base_dir)}", + f"--database-url={db_url}", + "--disable-assets-autoscan", + "--listen", + "127.0.0.1", + "--port", + str(port), + "--cpu", + ], + stdout=out_log, + stderr=err_log, + cwd=str(comfy_root), + env={**os.environ}, + ) + + for _ in range(50): + if proc.poll() is not None: + out_log.flush() + err_log.flush() + raise RuntimeError(f"ComfyUI exited early with code {proc.returncode}") + time.sleep(0.1) + + base_url = f"http://127.0.0.1:{port}" + try: + with requests.Session() as s: + _wait_http_ready(base_url, s, timeout=90.0) + yield base_url, proc, port + except Exception as e: + with contextlib.suppress(Exception): + proc.terminate() + proc.wait(timeout=10) + with contextlib.suppress(Exception): + out_log.flush() + err_log.flush() + raise RuntimeError(f"ComfyUI did not become ready: {e}") + + if proc and proc.poll() is None: + with contextlib.suppress(Exception): + proc.terminate() + proc.wait(timeout=15) + out_log.close() + err_log.close() + + +@pytest.fixture +def http() -> Iterator[requests.Session]: + with requests.Session() as s: + s.timeout = 120 + yield s + + +@pytest.fixture +def api_base(comfy_url_and_proc) -> str: + base_url, _proc, _port = comfy_url_and_proc + return base_url + + +def _post_multipart_asset( + session: requests.Session, + base: str, + *, + name: str, + tags: list[str], + meta: dict, + data: bytes, + extra_fields: Optional[dict] = None, +) -> tuple[int, dict]: + files = {"file": (name, data, "application/octet-stream")} + form_data = { + "tags": json.dumps(tags), + "name": name, + "user_metadata": json.dumps(meta), + } + if extra_fields: + for k, v in extra_fields.items(): + form_data[k] = v + r = session.post(base + "/api/assets", files=files, data=form_data, timeout=120) + return r.status_code, r.json() + + +@pytest.fixture +def make_asset_bytes() -> Callable[[str, int], bytes]: + def _make(name: str, size: int = 8192) -> bytes: + seed = sum(ord(c) for c in name) % 251 + return bytes((i * 31 + seed) % 256 for i in range(size)) + return _make + + +@pytest.fixture +def asset_factory(http: requests.Session, api_base: str): + """ + Returns create(name, tags, meta, data) -> response dict + Tracks created ids and deletes them after the test. + """ + created: list[str] = [] + + def create(name: str, tags: list[str], meta: dict, data: bytes) -> dict: + status, body = _post_multipart_asset(http, api_base, name=name, tags=tags, meta=meta, data=data) + assert status in (200, 201), body + created.append(body["id"]) + return body + + yield create + + for aid in created: + with contextlib.suppress(Exception): + http.delete(f"{api_base}/api/assets/{aid}", timeout=30) + + +@pytest.fixture +def seeded_asset(request: pytest.FixtureRequest, http: requests.Session, api_base: str) -> dict: + """ + Upload one asset with ".safetensors" extension into models/checkpoints/unit-tests/. + Returns response dict with id, asset_hash, tags, etc. + """ + name = "unit_1_example.safetensors" + p = getattr(request, "param", {}) or {} + tags: Optional[list[str]] = p.get("tags") + if tags is None: + tags = ["models", "checkpoints", "unit-tests", "alpha"] + meta = {"purpose": "test", "epoch": 1, "flags": ["x", "y"], "nullable": None} + files = {"file": (name, b"A" * 4096, "application/octet-stream")} + form_data = { + "tags": json.dumps(tags), + "name": name, + "user_metadata": json.dumps(meta), + } + r = http.post(api_base + "/api/assets", files=files, data=form_data, timeout=120) + body = r.json() + assert r.status_code == 201, body + return body + + +@pytest.fixture(autouse=True) +def autoclean_unit_test_assets(http: requests.Session, api_base: str): + """Ensure isolation by removing all AssetInfo rows tagged with 'unit-tests' after each test.""" + yield + + while True: + r = http.get( + api_base + "/api/assets", + params={"include_tags": "unit-tests", "limit": "500", "sort": "name"}, + timeout=30, + ) + if r.status_code != 200: + break + body = r.json() + ids = [a["id"] for a in body.get("assets", [])] + if not ids: + break + for aid in ids: + with contextlib.suppress(Exception): + http.delete(f"{api_base}/api/assets/{aid}", timeout=30) + + +def trigger_sync_seed_assets(session: requests.Session, base_url: str) -> None: + """Force a fast sync/seed pass by calling the seed endpoint.""" + session.post(base_url + "/api/assets/seed", json={"roots": ["models", "input", "output"]}, timeout=30) + time.sleep(0.2) + + +def get_asset_filename(asset_hash: str, extension: str) -> str: + return asset_hash.removeprefix("blake3:") + extension diff --git a/tests-unit/assets_test/test_assets_missing_sync.py b/tests-unit/assets_test/test_assets_missing_sync.py new file mode 100644 index 000000000..78fa7b404 --- /dev/null +++ b/tests-unit/assets_test/test_assets_missing_sync.py @@ -0,0 +1,348 @@ +import os +import uuid +from pathlib import Path + +import pytest +import requests +from conftest import get_asset_filename, trigger_sync_seed_assets + + + + +@pytest.mark.parametrize("root", ["input", "output"]) +def test_seed_asset_removed_when_file_is_deleted( + root: str, + http: requests.Session, + api_base: str, + comfy_tmp_base_dir: Path, +): + """Asset without hash (seed) whose file disappears: + after triggering sync_seed_assets, Asset + AssetInfo disappear. + """ + # Create a file directly under input/unit-tests/ so tags include "unit-tests" + case_dir = comfy_tmp_base_dir / root / "unit-tests" / "syncseed" + case_dir.mkdir(parents=True, exist_ok=True) + name = f"seed_{uuid.uuid4().hex[:8]}.bin" + fp = case_dir / name + fp.write_bytes(b"Z" * 2048) + + # Trigger a seed sync so DB sees this path (seed asset => hash is NULL) + trigger_sync_seed_assets(http, api_base) + + # Verify it is visible via API and carries no hash (seed) + r1 = http.get( + api_base + "/api/assets", + params={"include_tags": "unit-tests,syncseed", "name_contains": name}, + timeout=120, + ) + body1 = r1.json() + assert r1.status_code == 200 + # there should be exactly one with that name + matches = [a for a in body1.get("assets", []) if a.get("name") == name] + assert matches + assert matches[0].get("asset_hash") is None + asset_info_id = matches[0]["id"] + + # Remove the underlying file and sync again + if fp.exists(): + fp.unlink() + + trigger_sync_seed_assets(http, api_base) + + # It should disappear (AssetInfo and seed Asset gone) + r2 = http.get( + api_base + "/api/assets", + params={"include_tags": "unit-tests,syncseed", "name_contains": name}, + timeout=120, + ) + body2 = r2.json() + assert r2.status_code == 200 + matches2 = [a for a in body2.get("assets", []) if a.get("name") == name] + assert not matches2, f"Seed asset {asset_info_id} should be gone after sync" + + +@pytest.mark.skip(reason="Requires computing hashes of files in directories to verify and clear missing tags") +def test_hashed_asset_missing_tag_added_then_removed_after_scan( + http: requests.Session, + api_base: str, + comfy_tmp_base_dir: Path, + asset_factory, + make_asset_bytes, +): + """Hashed asset with a single cache_state: + 1. delete its file -> sync adds 'missing' + 2. restore file -> sync removes 'missing' + """ + name = "missing_tag_test.png" + tags = ["input", "unit-tests", "msync2"] + data = make_asset_bytes(name, 4096) + a = asset_factory(name, tags, {}, data) + + # Compute its on-disk path and remove it + dest = comfy_tmp_base_dir / "input" / "unit-tests" / "msync2" / get_asset_filename(a["asset_hash"], ".png") + assert dest.exists(), f"Expected asset file at {dest}" + dest.unlink() + + # Fast sync should add 'missing' to the AssetInfo + trigger_sync_seed_assets(http, api_base) + + g1 = http.get(f"{api_base}/api/assets/{a['id']}", timeout=120) + d1 = g1.json() + assert g1.status_code == 200, d1 + assert "missing" in set(d1.get("tags", [])), "Expected 'missing' tag after deletion" + + # Restore the file with the exact same content and sync again + dest.parent.mkdir(parents=True, exist_ok=True) + dest.write_bytes(data) + + trigger_sync_seed_assets(http, api_base) + + g2 = http.get(f"{api_base}/api/assets/{a['id']}", timeout=120) + d2 = g2.json() + assert g2.status_code == 200, d2 + assert "missing" not in set(d2.get("tags", [])), "Missing tag should be cleared after verify" + + +def test_hashed_asset_two_asset_infos_both_get_missing( + http: requests.Session, + api_base: str, + comfy_tmp_base_dir: Path, + asset_factory, +): + """Hashed asset with a single cache_state, but two AssetInfo rows: + deleting the single file then syncing should add 'missing' to both infos. + """ + # Upload one hashed asset + name = "two_infos_one_path.png" + base_tags = ["input", "unit-tests", "multiinfo"] + created = asset_factory(name, base_tags, {}, b"A" * 2048) + + # Create second AssetInfo for the same Asset via from-hash + payload = { + "hash": created["asset_hash"], + "name": "two_infos_one_path_copy.png", + "tags": base_tags, # keep it in our unit-tests scope for cleanup + "user_metadata": {"k": "v"}, + } + r2 = http.post(api_base + "/api/assets/from-hash", json=payload, timeout=120) + b2 = r2.json() + assert r2.status_code == 201, b2 + second_id = b2["id"] + + # Remove the single underlying file + p = comfy_tmp_base_dir / "input" / "unit-tests" / "multiinfo" / get_asset_filename(b2["asset_hash"], ".png") + assert p.exists() + p.unlink() + + r0 = http.get(api_base + "/api/tags", params={"limit": "1000", "include_zero": "false"}, timeout=120) + tags0 = r0.json() + assert r0.status_code == 200, tags0 + byname0 = {t["name"]: t for t in tags0.get("tags", [])} + old_missing = int(byname0.get("missing", {}).get("count", 0)) + + # Sync -> both AssetInfos for this asset must receive 'missing' + trigger_sync_seed_assets(http, api_base) + + ga = http.get(f"{api_base}/api/assets/{created['id']}", timeout=120) + da = ga.json() + assert ga.status_code == 200, da + assert "missing" in set(da.get("tags", [])) + + gb = http.get(f"{api_base}/api/assets/{second_id}", timeout=120) + db = gb.json() + assert gb.status_code == 200, db + assert "missing" in set(db.get("tags", [])) + + # Tag usage for 'missing' increased by exactly 2 (two AssetInfos) + r1 = http.get(api_base + "/api/tags", params={"limit": "1000", "include_zero": "false"}, timeout=120) + tags1 = r1.json() + assert r1.status_code == 200, tags1 + byname1 = {t["name"]: t for t in tags1.get("tags", [])} + new_missing = int(byname1.get("missing", {}).get("count", 0)) + assert new_missing == old_missing + 2 + + +@pytest.mark.skip(reason="Requires computing hashes of files in directories to deduplicate into multiple cache states") +def test_hashed_asset_two_cache_states_partial_delete_then_full_delete( + http: requests.Session, + api_base: str, + comfy_tmp_base_dir: Path, + asset_factory, + make_asset_bytes, + run_scan_and_wait, +): + """Hashed asset with two cache_state rows: + 1. delete one file -> sync should NOT add 'missing' + 2. delete second file -> sync should add 'missing' + """ + name = "two_cache_states_partial_delete.png" + tags = ["input", "unit-tests", "dual"] + data = make_asset_bytes(name, 3072) + + created = asset_factory(name, tags, {}, data) + path1 = comfy_tmp_base_dir / "input" / "unit-tests" / "dual" / get_asset_filename(created["asset_hash"], ".png") + assert path1.exists() + + # Create a second on-disk copy under the same root but different subfolder + path2 = comfy_tmp_base_dir / "input" / "unit-tests" / "dual_copy" / name + path2.parent.mkdir(parents=True, exist_ok=True) + path2.write_bytes(data) + + # Fast seed so the second path appears (as a seed initially) + trigger_sync_seed_assets(http, api_base) + + # Deduplication of AssetInfo-s will not happen as first AssetInfo has owner='default' and second has empty owner. + run_scan_and_wait("input") + + # Remove only one file and sync -> asset should still be healthy (no 'missing') + path1.unlink() + trigger_sync_seed_assets(http, api_base) + + g1 = http.get(f"{api_base}/api/assets/{created['id']}", timeout=120) + d1 = g1.json() + assert g1.status_code == 200, d1 + assert "missing" not in set(d1.get("tags", [])), "Should not be missing while one valid path remains" + + # Baseline 'missing' usage count just before last file removal + r0 = http.get(api_base + "/api/tags", params={"limit": "1000", "include_zero": "false"}, timeout=120) + tags0 = r0.json() + assert r0.status_code == 200, tags0 + old_missing = int({t["name"]: t for t in tags0.get("tags", [])}.get("missing", {}).get("count", 0)) + + # Remove the second (last) file and sync -> now we expect 'missing' on this AssetInfo + path2.unlink() + trigger_sync_seed_assets(http, api_base) + + g2 = http.get(f"{api_base}/api/assets/{created['id']}", timeout=120) + d2 = g2.json() + assert g2.status_code == 200, d2 + assert "missing" in set(d2.get("tags", [])), "Missing must be set once no valid paths remain" + + # Tag usage for 'missing' increased by exactly 2 (two AssetInfo for one Asset) + r1 = http.get(api_base + "/api/tags", params={"limit": "1000", "include_zero": "false"}, timeout=120) + tags1 = r1.json() + assert r1.status_code == 200, tags1 + new_missing = int({t["name"]: t for t in tags1.get("tags", [])}.get("missing", {}).get("count", 0)) + assert new_missing == old_missing + 2 + + +@pytest.mark.parametrize("root", ["input", "output"]) +def test_missing_tag_clears_on_fastpass_when_mtime_and_size_match( + root: str, + http: requests.Session, + api_base: str, + comfy_tmp_base_dir: Path, + asset_factory, + make_asset_bytes, +): + """ + Fast pass alone clears 'missing' when size and mtime match exactly: + 1) upload (hashed), record original mtime_ns + 2) delete -> fast pass adds 'missing' + 3) restore same bytes and set mtime back to the original value + 4) run fast pass again -> 'missing' is removed (no slow scan) + """ + scope = f"fastclear-{uuid.uuid4().hex[:6]}" + name = "fastpass_clear.bin" + data = make_asset_bytes(name, 3072) + + a = asset_factory(name, [root, "unit-tests", scope], {}, data) + aid = a["id"] + base = comfy_tmp_base_dir / root / "unit-tests" / scope + p = base / get_asset_filename(a["asset_hash"], ".bin") + st0 = p.stat() + orig_mtime_ns = getattr(st0, "st_mtime_ns", int(st0.st_mtime * 1_000_000_000)) + + # Delete -> fast pass adds 'missing' + p.unlink() + trigger_sync_seed_assets(http, api_base) + g1 = http.get(f"{api_base}/api/assets/{aid}", timeout=120) + d1 = g1.json() + assert g1.status_code == 200, d1 + assert "missing" in set(d1.get("tags", [])) + + # Restore same bytes and revert mtime to the original value + p.parent.mkdir(parents=True, exist_ok=True) + p.write_bytes(data) + # set both atime and mtime in ns to ensure exact match + os.utime(p, ns=(orig_mtime_ns, orig_mtime_ns)) + + # Fast pass should clear 'missing' without a scan + trigger_sync_seed_assets(http, api_base) + g2 = http.get(f"{api_base}/api/assets/{aid}", timeout=120) + d2 = g2.json() + assert g2.status_code == 200, d2 + assert "missing" not in set(d2.get("tags", [])), "Fast pass should clear 'missing' when size+mtime match" + + +@pytest.mark.skip(reason="Requires computing hashes of files in directories to deduplicate into multiple cache states") +@pytest.mark.parametrize("root", ["input", "output"]) +def test_fastpass_removes_stale_state_row_no_missing( + root: str, + http: requests.Session, + api_base: str, + comfy_tmp_base_dir: Path, + asset_factory, + make_asset_bytes, + run_scan_and_wait, +): + """ + Hashed asset with two states: + - delete one file + - run fast pass only + Expect: + - asset stays healthy (no 'missing') + - stale AssetCacheState row for the deleted path is removed. + We verify this behaviorally by recreating the deleted path and running fast pass again: + a new *seed* AssetInfo is created, which proves the old state row was not reused. + """ + scope = f"stale-{uuid.uuid4().hex[:6]}" + name = "two_states.bin" + data = make_asset_bytes(name, 2048) + + # Upload hashed asset at path1 + a = asset_factory(name, [root, "unit-tests", scope], {}, data) + base = comfy_tmp_base_dir / root / "unit-tests" / scope + a1_filename = get_asset_filename(a["asset_hash"], ".bin") + p1 = base / a1_filename + assert p1.exists() + + aid = a["id"] + h = a["asset_hash"] + + # Create second state path2, seed+scan to dedupe into the same Asset + p2 = base / "copy" / name + p2.parent.mkdir(parents=True, exist_ok=True) + p2.write_bytes(data) + trigger_sync_seed_assets(http, api_base) + run_scan_and_wait(root) + + # Delete path1 and run fast pass -> no 'missing' and stale state row should be removed + p1.unlink() + trigger_sync_seed_assets(http, api_base) + g1 = http.get(f"{api_base}/api/assets/{aid}", timeout=120) + d1 = g1.json() + assert g1.status_code == 200, d1 + assert "missing" not in set(d1.get("tags", [])) + + # Recreate path1 and run fast pass again. + # If the stale state row was removed, a NEW seed AssetInfo will appear for this path. + p1.write_bytes(data) + trigger_sync_seed_assets(http, api_base) + + rl = http.get( + api_base + "/api/assets", + params={"include_tags": f"unit-tests,{scope}"}, + timeout=120, + ) + bl = rl.json() + assert rl.status_code == 200, bl + items = bl.get("assets", []) + # one hashed AssetInfo (asset_hash == h) + one seed AssetInfo (asset_hash == null) + hashes = [it.get("asset_hash") for it in items if it.get("name") in (name, a1_filename)] + assert h in hashes + assert any(x is None for x in hashes), "Expected a new seed AssetInfo for the recreated path" + + # Asset identity still healthy + rh = http.head(f"{api_base}/api/assets/hash/{h}", timeout=120) + assert rh.status_code == 200 diff --git a/tests-unit/assets_test/test_crud.py b/tests-unit/assets_test/test_crud.py new file mode 100644 index 000000000..d2b69f475 --- /dev/null +++ b/tests-unit/assets_test/test_crud.py @@ -0,0 +1,306 @@ +import uuid +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path + +import pytest +import requests +from conftest import get_asset_filename, trigger_sync_seed_assets + + +def test_create_from_hash_success( + http: requests.Session, api_base: str, seeded_asset: dict +): + h = seeded_asset["asset_hash"] + payload = { + "hash": h, + "name": "from_hash_ok.safetensors", + "tags": ["models", "checkpoints", "unit-tests", "from-hash"], + "user_metadata": {"k": "v"}, + } + r1 = http.post(f"{api_base}/api/assets/from-hash", json=payload, timeout=120) + b1 = r1.json() + assert r1.status_code == 201, b1 + assert b1["asset_hash"] == h + assert b1["created_new"] is False + aid = b1["id"] + + # Calling again with the same name should return the same AssetInfo id + r2 = http.post(f"{api_base}/api/assets/from-hash", json=payload, timeout=120) + b2 = r2.json() + assert r2.status_code == 201, b2 + assert b2["id"] == aid + + +def test_get_and_delete_asset(http: requests.Session, api_base: str, seeded_asset: dict): + aid = seeded_asset["id"] + + # GET detail + rg = http.get(f"{api_base}/api/assets/{aid}", timeout=120) + detail = rg.json() + assert rg.status_code == 200, detail + assert detail["id"] == aid + assert "user_metadata" in detail + assert "filename" in detail["user_metadata"] + + # DELETE + rd = http.delete(f"{api_base}/api/assets/{aid}", timeout=120) + assert rd.status_code == 204 + + # GET again -> 404 + rg2 = http.get(f"{api_base}/api/assets/{aid}", timeout=120) + body = rg2.json() + assert rg2.status_code == 404 + assert body["error"]["code"] == "ASSET_NOT_FOUND" + + +def test_delete_upon_reference_count( + http: requests.Session, api_base: str, seeded_asset: dict +): + # Create a second reference to the same asset via from-hash + src_hash = seeded_asset["asset_hash"] + payload = { + "hash": src_hash, + "name": "unit_ref_copy.safetensors", + "tags": ["models", "checkpoints", "unit-tests", "del-flow"], + "user_metadata": {"note": "copy"}, + } + r2 = http.post(f"{api_base}/api/assets/from-hash", json=payload, timeout=120) + copy = r2.json() + assert r2.status_code == 201, copy + assert copy["asset_hash"] == src_hash + assert copy["created_new"] is False + + # Delete original reference -> asset identity must remain + aid1 = seeded_asset["id"] + rd1 = http.delete(f"{api_base}/api/assets/{aid1}", timeout=120) + assert rd1.status_code == 204 + + rh1 = http.head(f"{api_base}/api/assets/hash/{src_hash}", timeout=120) + assert rh1.status_code == 200 # identity still present + + # Delete the last reference with default semantics -> identity and cached files removed + aid2 = copy["id"] + rd2 = http.delete(f"{api_base}/api/assets/{aid2}", timeout=120) + assert rd2.status_code == 204 + + rh2 = http.head(f"{api_base}/api/assets/hash/{src_hash}", timeout=120) + assert rh2.status_code == 404 # orphan content removed + + +def test_update_asset_fields(http: requests.Session, api_base: str, seeded_asset: dict): + aid = seeded_asset["id"] + original_tags = seeded_asset["tags"] + + payload = { + "name": "unit_1_renamed.safetensors", + "user_metadata": {"purpose": "updated", "epoch": 2}, + } + ru = http.put(f"{api_base}/api/assets/{aid}", json=payload, timeout=120) + body = ru.json() + assert ru.status_code == 200, body + assert body["name"] == payload["name"] + assert body["tags"] == original_tags # tags unchanged + assert body["user_metadata"]["purpose"] == "updated" + # filename should still be present and normalized by server + assert "filename" in body["user_metadata"] + + +def test_head_asset_by_hash(http: requests.Session, api_base: str, seeded_asset: dict): + h = seeded_asset["asset_hash"] + + # Existing + rh1 = http.head(f"{api_base}/api/assets/hash/{h}", timeout=120) + assert rh1.status_code == 200 + + # Non-existent + rh2 = http.head(f"{api_base}/api/assets/hash/blake3:{'0'*64}", timeout=120) + assert rh2.status_code == 404 + + +def test_head_asset_bad_hash_returns_400_and_no_body(http: requests.Session, api_base: str): + # Invalid format; handler returns a JSON error, but HEAD responses must not carry a payload. + # requests exposes an empty body for HEAD, so validate status and that there is no payload. + rh = http.head(f"{api_base}/api/assets/hash/not_a_hash", timeout=120) + assert rh.status_code == 400 + body = rh.content + assert body == b"" + + +def test_delete_nonexistent_returns_404(http: requests.Session, api_base: str): + bogus = str(uuid.uuid4()) + r = http.delete(f"{api_base}/api/assets/{bogus}", timeout=120) + body = r.json() + assert r.status_code == 404 + assert body["error"]["code"] == "ASSET_NOT_FOUND" + + +def test_create_from_hash_invalids(http: requests.Session, api_base: str): + # Bad hash algorithm + bad = { + "hash": "sha256:" + "0" * 64, + "name": "x.bin", + "tags": ["models", "checkpoints", "unit-tests"], + } + r1 = http.post(f"{api_base}/api/assets/from-hash", json=bad, timeout=120) + b1 = r1.json() + assert r1.status_code == 400 + assert b1["error"]["code"] == "INVALID_BODY" + + # Invalid JSON body + r2 = http.post(f"{api_base}/api/assets/from-hash", data=b"{not json}", timeout=120) + b2 = r2.json() + assert r2.status_code == 400 + assert b2["error"]["code"] == "INVALID_JSON" + + +def test_get_update_download_bad_ids(http: requests.Session, api_base: str): + # All endpoints should be not found, as we UUID regex directly in the route definition. + bad_id = "not-a-uuid" + + r1 = http.get(f"{api_base}/api/assets/{bad_id}", timeout=120) + assert r1.status_code == 404 + + r3 = http.get(f"{api_base}/api/assets/{bad_id}/content", timeout=120) + assert r3.status_code == 404 + + +def test_update_requires_at_least_one_field(http: requests.Session, api_base: str, seeded_asset: dict): + aid = seeded_asset["id"] + r = http.put(f"{api_base}/api/assets/{aid}", json={}, timeout=120) + body = r.json() + assert r.status_code == 400 + assert body["error"]["code"] == "INVALID_BODY" + + +@pytest.mark.parametrize("root", ["input", "output"]) +def test_concurrent_delete_same_asset_info_single_204( + root: str, + http: requests.Session, + api_base: str, + asset_factory, + make_asset_bytes, +): + """ + Many concurrent DELETE for the same AssetInfo should result in: + - exactly one 204 No Content (the one that actually deleted) + - all others 404 Not Found (row already gone) + """ + scope = f"conc-del-{uuid.uuid4().hex[:6]}" + name = "to_delete.bin" + data = make_asset_bytes(name, 1536) + + created = asset_factory(name, [root, "unit-tests", scope], {}, data) + aid = created["id"] + + # Hit the same endpoint N times in parallel. + n_tests = 4 + url = f"{api_base}/api/assets/{aid}?delete_content=false" + + def _do_delete(delete_url): + with requests.Session() as s: + return s.delete(delete_url, timeout=120).status_code + + with ThreadPoolExecutor(max_workers=n_tests) as ex: + statuses = list(ex.map(_do_delete, [url] * n_tests)) + + # Exactly one actual delete, the rest must be 404 + assert statuses.count(204) == 1, f"Expected exactly one 204; got: {statuses}" + assert statuses.count(404) == n_tests - 1, f"Expected {n_tests-1} 404; got: {statuses}" + + # The resource must be gone. + rg = http.get(f"{api_base}/api/assets/{aid}", timeout=120) + assert rg.status_code == 404 + + +@pytest.mark.parametrize("root", ["input", "output"]) +def test_metadata_filename_is_set_for_seed_asset_without_hash( + root: str, + http: requests.Session, + api_base: str, + comfy_tmp_base_dir: Path, +): + """Seed ingest (no hash yet) must compute user_metadata['filename'] immediately.""" + scope = f"seedmeta-{uuid.uuid4().hex[:6]}" + name = "seed_filename.bin" + + base = comfy_tmp_base_dir / root / "unit-tests" / scope / "a" / "b" + base.mkdir(parents=True, exist_ok=True) + fp = base / name + fp.write_bytes(b"Z" * 2048) + + trigger_sync_seed_assets(http, api_base) + + r1 = http.get( + api_base + "/api/assets", + params={"include_tags": f"unit-tests,{scope}", "name_contains": name}, + timeout=120, + ) + body = r1.json() + assert r1.status_code == 200, body + matches = [a for a in body.get("assets", []) if a.get("name") == name] + assert matches, "Seed asset should be visible after sync" + assert matches[0].get("asset_hash") is None # still a seed + aid = matches[0]["id"] + + r2 = http.get(f"{api_base}/api/assets/{aid}", timeout=120) + detail = r2.json() + assert r2.status_code == 200, detail + filename = (detail.get("user_metadata") or {}).get("filename") + expected = str(fp.relative_to(comfy_tmp_base_dir / root)).replace("\\", "/") + assert filename == expected, f"expected filename={expected}, got {filename!r}" + + +@pytest.mark.skip(reason="Requires computing hashes of files in directories to retarget cache states") +@pytest.mark.parametrize("root", ["input", "output"]) +def test_metadata_filename_computed_and_updated_on_retarget( + root: str, + http: requests.Session, + api_base: str, + comfy_tmp_base_dir: Path, + asset_factory, + make_asset_bytes, + run_scan_and_wait, +): + """ + 1) Ingest under {root}/unit-tests//a/b/ -> filename reflects relative path. + 2) Retarget by copying to {root}/unit-tests//x/, remove old file, + run fast pass + scan -> filename updates to new relative path. + """ + scope = f"meta-fn-{uuid.uuid4().hex[:6]}" + name1 = "compute_metadata_filename.png" + name2 = "compute_changed_metadata_filename.png" + data = make_asset_bytes(name1, 2100) + + # Upload into nested path a/b + a = asset_factory(name1, [root, "unit-tests", scope, "a", "b"], {}, data) + aid = a["id"] + + root_base = comfy_tmp_base_dir / root + p1 = (root_base / "unit-tests" / scope / "a" / "b" / get_asset_filename(a["asset_hash"], ".png")) + assert p1.exists() + + # filename at ingest should be the path relative to root + rel1 = str(p1.relative_to(root_base)).replace("\\", "/") + g1 = http.get(f"{api_base}/api/assets/{aid}", timeout=120) + d1 = g1.json() + assert g1.status_code == 200, d1 + fn1 = d1["user_metadata"].get("filename") + assert fn1 == rel1 + + # Retarget: copy to x/, remove old, then sync+scan + p2 = root_base / "unit-tests" / scope / "x" / name2 + p2.parent.mkdir(parents=True, exist_ok=True) + p2.write_bytes(data) + if p1.exists(): + p1.unlink() + + trigger_sync_seed_assets(http, api_base) # seed the new path + run_scan_and_wait(root) # verify/hash and reconcile + + # filename should now point at x/ + rel2 = str(p2.relative_to(root_base)).replace("\\", "/") + g2 = http.get(f"{api_base}/api/assets/{aid}", timeout=120) + d2 = g2.json() + assert g2.status_code == 200, d2 + fn2 = d2["user_metadata"].get("filename") + assert fn2 == rel2 diff --git a/tests-unit/assets_test/test_downloads.py b/tests-unit/assets_test/test_downloads.py new file mode 100644 index 000000000..cdebf9082 --- /dev/null +++ b/tests-unit/assets_test/test_downloads.py @@ -0,0 +1,166 @@ +import time +import uuid +from datetime import datetime +from pathlib import Path +from typing import Optional + +import pytest +import requests +from conftest import get_asset_filename, trigger_sync_seed_assets + + +def test_download_attachment_and_inline(http: requests.Session, api_base: str, seeded_asset: dict): + aid = seeded_asset["id"] + + # default attachment + r1 = http.get(f"{api_base}/api/assets/{aid}/content", timeout=120) + data = r1.content + assert r1.status_code == 200 + cd = r1.headers.get("Content-Disposition", "") + assert "attachment" in cd + assert data and len(data) == 4096 + + # inline requested + r2 = http.get(f"{api_base}/api/assets/{aid}/content?disposition=inline", timeout=120) + r2.content + assert r2.status_code == 200 + cd2 = r2.headers.get("Content-Disposition", "") + assert "inline" in cd2 + + +@pytest.mark.skip(reason="Requires computing hashes of files in directories to deduplicate into multiple cache states") +@pytest.mark.parametrize("root", ["input", "output"]) +def test_download_chooses_existing_state_and_updates_access_time( + root: str, + http: requests.Session, + api_base: str, + comfy_tmp_base_dir: Path, + asset_factory, + make_asset_bytes, + run_scan_and_wait, +): + """ + Hashed asset with two state paths: if the first one disappears, + GET /content still serves from the remaining path and bumps last_access_time. + """ + scope = f"dl-first-{uuid.uuid4().hex[:6]}" + name = "first_existing_state.bin" + data = make_asset_bytes(name, 3072) + + # Upload -> path1 + a = asset_factory(name, [root, "unit-tests", scope], {}, data) + aid = a["id"] + + base = comfy_tmp_base_dir / root / "unit-tests" / scope + path1 = base / get_asset_filename(a["asset_hash"], ".bin") + assert path1.exists() + + # Seed path2 by copying, then scan to dedupe into a second state + path2 = base / "alt" / name + path2.parent.mkdir(parents=True, exist_ok=True) + path2.write_bytes(data) + trigger_sync_seed_assets(http, api_base) + run_scan_and_wait(root) + + # Remove path1 so server must fall back to path2 + path1.unlink() + + # last_access_time before + rg0 = http.get(f"{api_base}/api/assets/{aid}", timeout=120) + d0 = rg0.json() + assert rg0.status_code == 200, d0 + ts0 = d0.get("last_access_time") + + time.sleep(0.05) + r = http.get(f"{api_base}/api/assets/{aid}/content", timeout=120) + blob = r.content + assert r.status_code == 200 + assert blob == data # must serve from the surviving state (same bytes) + + rg1 = http.get(f"{api_base}/api/assets/{aid}", timeout=120) + d1 = rg1.json() + assert rg1.status_code == 200, d1 + ts1 = d1.get("last_access_time") + + def _parse_iso8601(s: Optional[str]) -> Optional[float]: + if not s: + return None + s = s[:-1] if s.endswith("Z") else s + return datetime.fromisoformat(s).timestamp() + + t0 = _parse_iso8601(ts0) + t1 = _parse_iso8601(ts1) + assert t1 is not None + if t0 is not None: + assert t1 > t0 + + +@pytest.mark.parametrize("seeded_asset", [{"tags": ["models", "checkpoints"]}], indirect=True) +def test_download_missing_file_returns_404( + http: requests.Session, api_base: str, comfy_tmp_base_dir: Path, seeded_asset: dict +): + # Remove the underlying file then attempt download. + # We initialize fixture without additional tags to know exactly the asset file path. + try: + aid = seeded_asset["id"] + rg = http.get(f"{api_base}/api/assets/{aid}", timeout=120) + detail = rg.json() + assert rg.status_code == 200 + asset_filename = get_asset_filename(detail["asset_hash"], ".safetensors") + abs_path = comfy_tmp_base_dir / "models" / "checkpoints" / asset_filename + assert abs_path.exists() + abs_path.unlink() + + r2 = http.get(f"{api_base}/api/assets/{aid}/content", timeout=120) + assert r2.status_code == 404 + body = r2.json() + assert body["error"]["code"] == "FILE_NOT_FOUND" + finally: + # We created asset without the "unit-tests" tag(see `autoclean_unit_test_assets`), we need to clear it manually. + dr = http.delete(f"{api_base}/api/assets/{aid}", timeout=120) + dr.content + + +@pytest.mark.skip(reason="Requires computing hashes of files in directories to deduplicate into multiple cache states") +@pytest.mark.parametrize("root", ["input", "output"]) +def test_download_404_if_all_states_missing( + root: str, + http: requests.Session, + api_base: str, + comfy_tmp_base_dir: Path, + asset_factory, + make_asset_bytes, + run_scan_and_wait, +): + """Multi-state asset: after the last remaining on-disk file is removed, download must return 404.""" + scope = f"dl-404-{uuid.uuid4().hex[:6]}" + name = "missing_all_states.bin" + data = make_asset_bytes(name, 2048) + + # Upload -> path1 + a = asset_factory(name, [root, "unit-tests", scope], {}, data) + aid = a["id"] + + base = comfy_tmp_base_dir / root / "unit-tests" / scope + p1 = base / get_asset_filename(a["asset_hash"], ".bin") + assert p1.exists() + + # Seed a second state and dedupe + p2 = base / "copy" / name + p2.parent.mkdir(parents=True, exist_ok=True) + p2.write_bytes(data) + trigger_sync_seed_assets(http, api_base) + run_scan_and_wait(root) + + # Remove first file -> download should still work via the second state + p1.unlink() + ok1 = http.get(f"{api_base}/api/assets/{aid}/content", timeout=120) + b1 = ok1.content + assert ok1.status_code == 200 and b1 == data + + # Remove the last file -> download must 404 + p2.unlink() + r2 = http.get(f"{api_base}/api/assets/{aid}/content", timeout=120) + body = r2.json() + assert r2.status_code == 404 + assert body["error"]["code"] == "FILE_NOT_FOUND" diff --git a/tests-unit/assets_test/test_list_filter.py b/tests-unit/assets_test/test_list_filter.py new file mode 100644 index 000000000..82e109832 --- /dev/null +++ b/tests-unit/assets_test/test_list_filter.py @@ -0,0 +1,342 @@ +import time +import uuid + +import requests + + +def test_list_assets_paging_and_sort(http: requests.Session, api_base: str, asset_factory, make_asset_bytes): + names = ["a1_u.safetensors", "a2_u.safetensors", "a3_u.safetensors"] + for n in names: + asset_factory( + n, + ["models", "checkpoints", "unit-tests", "paging"], + {"epoch": 1}, + make_asset_bytes(n, size=2048), + ) + + # name ascending for stable order + r1 = http.get( + api_base + "/api/assets", + params={"include_tags": "unit-tests,paging", "sort": "name", "order": "asc", "limit": "2", "offset": "0"}, + timeout=120, + ) + b1 = r1.json() + assert r1.status_code == 200 + got1 = [a["name"] for a in b1["assets"]] + assert got1 == sorted(names)[:2] + assert b1["has_more"] is True + + r2 = http.get( + api_base + "/api/assets", + params={"include_tags": "unit-tests,paging", "sort": "name", "order": "asc", "limit": "2", "offset": "2"}, + timeout=120, + ) + b2 = r2.json() + assert r2.status_code == 200 + got2 = [a["name"] for a in b2["assets"]] + assert got2 == sorted(names)[2:] + assert b2["has_more"] is False + + +def test_list_assets_include_exclude_and_name_contains(http: requests.Session, api_base: str, asset_factory): + a = asset_factory("inc_a.safetensors", ["models", "checkpoints", "unit-tests", "alpha"], {}, b"X" * 1024) + b = asset_factory("inc_b.safetensors", ["models", "checkpoints", "unit-tests", "beta"], {}, b"Y" * 1024) + + r = http.get( + api_base + "/api/assets", + params={"include_tags": "unit-tests,alpha", "exclude_tags": "beta", "limit": "50"}, + timeout=120, + ) + body = r.json() + assert r.status_code == 200 + names = [x["name"] for x in body["assets"]] + assert a["name"] in names + assert b["name"] not in names + + r2 = http.get( + api_base + "/api/assets", + params={"include_tags": "unit-tests", "name_contains": "inc_"}, + timeout=120, + ) + body2 = r2.json() + assert r2.status_code == 200 + names2 = [x["name"] for x in body2["assets"]] + assert a["name"] in names2 + assert b["name"] in names2 + + r2 = http.get( + api_base + "/api/assets", + params={"include_tags": "non-existing-tag"}, + timeout=120, + ) + body3 = r2.json() + assert r2.status_code == 200 + assert not body3["assets"] + + +def test_list_assets_sort_by_size_both_orders(http, api_base, asset_factory, make_asset_bytes): + t = ["models", "checkpoints", "unit-tests", "lf-size"] + n1, n2, n3 = "sz1.safetensors", "sz2.safetensors", "sz3.safetensors" + asset_factory(n1, t, {}, make_asset_bytes(n1, 1024)) + asset_factory(n2, t, {}, make_asset_bytes(n2, 2048)) + asset_factory(n3, t, {}, make_asset_bytes(n3, 3072)) + + r1 = http.get( + api_base + "/api/assets", + params={"include_tags": "unit-tests,lf-size", "sort": "size", "order": "asc"}, + timeout=120, + ) + b1 = r1.json() + names = [a["name"] for a in b1["assets"]] + assert names[:3] == [n1, n2, n3] + + r2 = http.get( + api_base + "/api/assets", + params={"include_tags": "unit-tests,lf-size", "sort": "size", "order": "desc"}, + timeout=120, + ) + b2 = r2.json() + names2 = [a["name"] for a in b2["assets"]] + assert names2[:3] == [n3, n2, n1] + + + +def test_list_assets_sort_by_updated_at_desc(http, api_base, asset_factory, make_asset_bytes): + t = ["models", "checkpoints", "unit-tests", "lf-upd"] + a1 = asset_factory("upd_a.safetensors", t, {}, make_asset_bytes("upd_a", 1200)) + a2 = asset_factory("upd_b.safetensors", t, {}, make_asset_bytes("upd_b", 1200)) + + # Rename the second asset to bump updated_at + rp = http.put(f"{api_base}/api/assets/{a2['id']}", json={"name": "upd_b_renamed.safetensors"}, timeout=120) + upd = rp.json() + assert rp.status_code == 200, upd + + r = http.get( + api_base + "/api/assets", + params={"include_tags": "unit-tests,lf-upd", "sort": "updated_at", "order": "desc"}, + timeout=120, + ) + body = r.json() + assert r.status_code == 200 + names = [x["name"] for x in body["assets"]] + assert names[0] == "upd_b_renamed.safetensors" + assert a1["name"] in names + + + +def test_list_assets_sort_by_last_access_time_desc(http, api_base, asset_factory, make_asset_bytes): + t = ["models", "checkpoints", "unit-tests", "lf-access"] + asset_factory("acc_a.safetensors", t, {}, make_asset_bytes("acc_a", 1100)) + time.sleep(0.02) + a2 = asset_factory("acc_b.safetensors", t, {}, make_asset_bytes("acc_b", 1100)) + + # Touch last_access_time of b by downloading its content + time.sleep(0.02) + dl = http.get(f"{api_base}/api/assets/{a2['id']}/content", timeout=120) + assert dl.status_code == 200 + dl.content + + r = http.get( + api_base + "/api/assets", + params={"include_tags": "unit-tests,lf-access", "sort": "last_access_time", "order": "desc"}, + timeout=120, + ) + body = r.json() + assert r.status_code == 200 + names = [x["name"] for x in body["assets"]] + assert names[0] == a2["name"] + + +def test_list_assets_include_tags_variants_and_case(http, api_base, asset_factory, make_asset_bytes): + t = ["models", "checkpoints", "unit-tests", "lf-include"] + a = asset_factory("incvar_alpha.safetensors", [*t, "alpha"], {}, make_asset_bytes("iva")) + asset_factory("incvar_beta.safetensors", [*t, "beta"], {}, make_asset_bytes("ivb")) + + # CSV + case-insensitive + r1 = http.get( + api_base + "/api/assets", + params={"include_tags": "UNIT-TESTS,LF-INCLUDE,alpha"}, + timeout=120, + ) + b1 = r1.json() + assert r1.status_code == 200 + names1 = [x["name"] for x in b1["assets"]] + assert a["name"] in names1 + assert not any("beta" in x for x in names1) + + # Repeated query params for include_tags + params_multi = [ + ("include_tags", "unit-tests"), + ("include_tags", "lf-include"), + ("include_tags", "alpha"), + ] + r2 = http.get(api_base + "/api/assets", params=params_multi, timeout=120) + b2 = r2.json() + assert r2.status_code == 200 + names2 = [x["name"] for x in b2["assets"]] + assert a["name"] in names2 + assert not any("beta" in x for x in names2) + + # Duplicates and spaces in CSV + r3 = http.get( + api_base + "/api/assets", + params={"include_tags": " unit-tests , lf-include , alpha , alpha "}, + timeout=120, + ) + b3 = r3.json() + assert r3.status_code == 200 + names3 = [x["name"] for x in b3["assets"]] + assert a["name"] in names3 + + +def test_list_assets_exclude_tags_dedup_and_case(http, api_base, asset_factory, make_asset_bytes): + t = ["models", "checkpoints", "unit-tests", "lf-exclude"] + a = asset_factory("ex_a_alpha.safetensors", [*t, "alpha"], {}, make_asset_bytes("exa", 900)) + asset_factory("ex_b_beta.safetensors", [*t, "beta"], {}, make_asset_bytes("exb", 900)) + + # Exclude uppercase should work + r1 = http.get( + api_base + "/api/assets", + params={"include_tags": "unit-tests,lf-exclude", "exclude_tags": "BETA"}, + timeout=120, + ) + b1 = r1.json() + assert r1.status_code == 200 + names1 = [x["name"] for x in b1["assets"]] + assert a["name"] in names1 + # Repeated excludes with duplicates + params_multi = [ + ("include_tags", "unit-tests"), + ("include_tags", "lf-exclude"), + ("exclude_tags", "beta"), + ("exclude_tags", "beta"), + ] + r2 = http.get(api_base + "/api/assets", params=params_multi, timeout=120) + b2 = r2.json() + assert r2.status_code == 200 + names2 = [x["name"] for x in b2["assets"]] + assert all("beta" not in x for x in names2) + + +def test_list_assets_name_contains_case_and_specials(http, api_base, asset_factory, make_asset_bytes): + t = ["models", "checkpoints", "unit-tests", "lf-name"] + a1 = asset_factory("CaseMix.SAFE", t, {}, make_asset_bytes("cm", 800)) + a2 = asset_factory("case-other.safetensors", t, {}, make_asset_bytes("co", 800)) + + r1 = http.get( + api_base + "/api/assets", + params={"include_tags": "unit-tests,lf-name", "name_contains": "casemix"}, + timeout=120, + ) + b1 = r1.json() + assert r1.status_code == 200 + names1 = [x["name"] for x in b1["assets"]] + assert a1["name"] in names1 + + r2 = http.get( + api_base + "/api/assets", + params={"include_tags": "unit-tests,lf-name", "name_contains": ".SAFE"}, + timeout=120, + ) + b2 = r2.json() + assert r2.status_code == 200 + names2 = [x["name"] for x in b2["assets"]] + assert a1["name"] in names2 + + r3 = http.get( + api_base + "/api/assets", + params={"include_tags": "unit-tests,lf-name", "name_contains": "case-"}, + timeout=120, + ) + b3 = r3.json() + assert r3.status_code == 200 + names3 = [x["name"] for x in b3["assets"]] + assert a2["name"] in names3 + + +def test_list_assets_offset_beyond_total_and_limit_boundary(http, api_base, asset_factory, make_asset_bytes): + t = ["models", "checkpoints", "unit-tests", "lf-pagelimits"] + asset_factory("pl1.safetensors", t, {}, make_asset_bytes("pl1", 600)) + asset_factory("pl2.safetensors", t, {}, make_asset_bytes("pl2", 600)) + asset_factory("pl3.safetensors", t, {}, make_asset_bytes("pl3", 600)) + + # Offset far beyond total + r1 = http.get( + api_base + "/api/assets", + params={"include_tags": "unit-tests,lf-pagelimits", "limit": "2", "offset": "10"}, + timeout=120, + ) + b1 = r1.json() + assert r1.status_code == 200 + assert not b1["assets"] + assert b1["has_more"] is False + + # Boundary large limit (<=500 is valid) + r2 = http.get( + api_base + "/api/assets", + params={"include_tags": "unit-tests,lf-pagelimits", "limit": "500"}, + timeout=120, + ) + b2 = r2.json() + assert r2.status_code == 200 + assert len(b2["assets"]) == 3 + assert b2["has_more"] is False + + +def test_list_assets_offset_negative_and_limit_nonint_rejected(http, api_base): + r1 = http.get(api_base + "/api/assets", params={"offset": "-1"}, timeout=120) + b1 = r1.json() + assert r1.status_code == 400 + assert b1["error"]["code"] == "INVALID_QUERY" + + r2 = http.get(api_base + "/api/assets", params={"limit": "abc"}, timeout=120) + b2 = r2.json() + assert r2.status_code == 400 + assert b2["error"]["code"] == "INVALID_QUERY" + + +def test_list_assets_invalid_query_rejected(http: requests.Session, api_base: str): + # limit too small + r1 = http.get(api_base + "/api/assets", params={"limit": "0"}, timeout=120) + b1 = r1.json() + assert r1.status_code == 400 + assert b1["error"]["code"] == "INVALID_QUERY" + + # bad metadata JSON + r2 = http.get(api_base + "/api/assets", params={"metadata_filter": "{not json"}, timeout=120) + b2 = r2.json() + assert r2.status_code == 400 + assert b2["error"]["code"] == "INVALID_QUERY" + + +def test_list_assets_name_contains_literal_underscore( + http, + api_base, + asset_factory, + make_asset_bytes, +): + """'name_contains' must treat '_' literally, not as a SQL wildcard. + We create: + - foo_bar.safetensors (should match) + - fooxbar.safetensors (must NOT match if '_' is escaped) + - foobar.safetensors (must NOT match) + """ + scope = f"lf-underscore-{uuid.uuid4().hex[:6]}" + tags = ["models", "checkpoints", "unit-tests", scope] + + a = asset_factory("foo_bar.safetensors", tags, {}, make_asset_bytes("a", 700)) + b = asset_factory("fooxbar.safetensors", tags, {}, make_asset_bytes("b", 700)) + c = asset_factory("foobar.safetensors", tags, {}, make_asset_bytes("c", 700)) + + r = http.get( + api_base + "/api/assets", + params={"include_tags": f"unit-tests,{scope}", "name_contains": "foo_bar"}, + timeout=120, + ) + body = r.json() + assert r.status_code == 200, body + names = [x["name"] for x in body["assets"]] + assert a["name"] in names, f"Expected literal underscore match to include {a['name']}" + assert b["name"] not in names, "Underscore must be escaped — should not match 'fooxbar'" + assert c["name"] not in names, "Underscore must be escaped — should not match 'foobar'" + assert body["total"] == 1 diff --git a/tests-unit/assets_test/test_metadata_filters.py b/tests-unit/assets_test/test_metadata_filters.py new file mode 100644 index 000000000..20285a3b3 --- /dev/null +++ b/tests-unit/assets_test/test_metadata_filters.py @@ -0,0 +1,395 @@ +import json + + +def test_meta_and_across_keys_and_types( + http, api_base: str, asset_factory, make_asset_bytes +): + name = "mf_and_mix.safetensors" + tags = ["models", "checkpoints", "unit-tests", "mf-and"] + meta = {"purpose": "mix", "epoch": 1, "active": True, "score": 1.23} + asset_factory(name, tags, meta, make_asset_bytes(name, 4096)) + + # All keys must match (AND semantics) + f_ok = {"purpose": "mix", "epoch": 1, "active": True, "score": 1.23} + r1 = http.get( + api_base + "/api/assets", + params={ + "include_tags": "unit-tests,mf-and", + "metadata_filter": json.dumps(f_ok), + }, + timeout=120, + ) + b1 = r1.json() + assert r1.status_code == 200 + names = [a["name"] for a in b1["assets"]] + assert name in names + + # One key mismatched -> no result + f_bad = {"purpose": "mix", "epoch": 2, "active": True} + r2 = http.get( + api_base + "/api/assets", + params={ + "include_tags": "unit-tests,mf-and", + "metadata_filter": json.dumps(f_bad), + }, + timeout=120, + ) + b2 = r2.json() + assert r2.status_code == 200 + assert not b2["assets"] + + +def test_meta_type_strictness_int_vs_str_and_bool(http, api_base, asset_factory, make_asset_bytes): + name = "mf_types.safetensors" + tags = ["models", "checkpoints", "unit-tests", "mf-types"] + meta = {"epoch": 1, "active": True} + asset_factory(name, tags, meta, make_asset_bytes(name)) + + # int filter matches numeric + r1 = http.get( + api_base + "/api/assets", + params={ + "include_tags": "unit-tests,mf-types", + "metadata_filter": json.dumps({"epoch": 1}), + }, + timeout=120, + ) + b1 = r1.json() + assert r1.status_code == 200 and any(a["name"] == name for a in b1["assets"]) + + # string "1" must NOT match numeric 1 + r2 = http.get( + api_base + "/api/assets", + params={ + "include_tags": "unit-tests,mf-types", + "metadata_filter": json.dumps({"epoch": "1"}), + }, + timeout=120, + ) + b2 = r2.json() + assert r2.status_code == 200 and not b2["assets"] + + # bool True matches, string "true" must NOT match + r3 = http.get( + api_base + "/api/assets", + params={ + "include_tags": "unit-tests,mf-types", + "metadata_filter": json.dumps({"active": True}), + }, + timeout=120, + ) + b3 = r3.json() + assert r3.status_code == 200 and any(a["name"] == name for a in b3["assets"]) + + r4 = http.get( + api_base + "/api/assets", + params={ + "include_tags": "unit-tests,mf-types", + "metadata_filter": json.dumps({"active": "true"}), + }, + timeout=120, + ) + b4 = r4.json() + assert r4.status_code == 200 and not b4["assets"] + + +def test_meta_any_of_list_of_scalars(http, api_base, asset_factory, make_asset_bytes): + name = "mf_list_scalars.safetensors" + tags = ["models", "checkpoints", "unit-tests", "mf-list"] + meta = {"flags": ["red", "green"]} + asset_factory(name, tags, meta, make_asset_bytes(name, 3000)) + + # Any-of should match because "green" is present + filt_ok = {"flags": ["blue", "green"]} + r1 = http.get( + api_base + "/api/assets", + params={"include_tags": "unit-tests,mf-list", "metadata_filter": json.dumps(filt_ok)}, + timeout=120, + ) + b1 = r1.json() + assert r1.status_code == 200 and any(a["name"] == name for a in b1["assets"]) + + # None of provided flags present -> no match + filt_miss = {"flags": ["blue", "yellow"]} + r2 = http.get( + api_base + "/api/assets", + params={"include_tags": "unit-tests,mf-list", "metadata_filter": json.dumps(filt_miss)}, + timeout=120, + ) + b2 = r2.json() + assert r2.status_code == 200 and not b2["assets"] + + # Duplicates in list should not break matching + filt_dup = {"flags": ["green", "green", "green"]} + r3 = http.get( + api_base + "/api/assets", + params={"include_tags": "unit-tests,mf-list", "metadata_filter": json.dumps(filt_dup)}, + timeout=120, + ) + b3 = r3.json() + assert r3.status_code == 200 and any(a["name"] == name for a in b3["assets"]) + + +def test_meta_none_semantics_missing_or_null_and_any_of_with_none( + http, api_base, asset_factory, make_asset_bytes +): + # a1: key missing; a2: explicit null; a3: concrete value + t = ["models", "checkpoints", "unit-tests", "mf-none"] + a1 = asset_factory("mf_none_missing.safetensors", t, {"x": 1}, make_asset_bytes("a1")) + a2 = asset_factory("mf_none_null.safetensors", t, {"maybe": None}, make_asset_bytes("a2")) + a3 = asset_factory("mf_none_value.safetensors", t, {"maybe": "x"}, make_asset_bytes("a3")) + + # Filter {maybe: None} must match a1 and a2, not a3 + filt = {"maybe": None} + r1 = http.get( + api_base + "/api/assets", + params={"include_tags": "unit-tests,mf-none", "metadata_filter": json.dumps(filt), "sort": "name"}, + timeout=120, + ) + b1 = r1.json() + assert r1.status_code == 200 + got = [a["name"] for a in b1["assets"]] + assert a1["name"] in got and a2["name"] in got and a3["name"] not in got + + # Any-of with None should include missing/null plus value matches + filt_any = {"maybe": [None, "x"]} + r2 = http.get( + api_base + "/api/assets", + params={"include_tags": "unit-tests,mf-none", "metadata_filter": json.dumps(filt_any), "sort": "name"}, + timeout=120, + ) + b2 = r2.json() + assert r2.status_code == 200 + got2 = [a["name"] for a in b2["assets"]] + assert a1["name"] in got2 and a2["name"] in got2 and a3["name"] in got2 + + +def test_meta_nested_json_object_equality(http, api_base, asset_factory, make_asset_bytes): + name = "mf_nested_json.safetensors" + tags = ["models", "checkpoints", "unit-tests", "mf-nested"] + cfg = {"optimizer": "adam", "lr": 0.001, "schedule": {"type": "cosine", "warmup": 100}} + asset_factory(name, tags, {"config": cfg}, make_asset_bytes(name, 2200)) + + # Exact JSON object equality (same structure) + r1 = http.get( + api_base + "/api/assets", + params={ + "include_tags": "unit-tests,mf-nested", + "metadata_filter": json.dumps({"config": cfg}), + }, + timeout=120, + ) + b1 = r1.json() + assert r1.status_code == 200 and any(a["name"] == name for a in b1["assets"]) + + # Different JSON object should not match + r2 = http.get( + api_base + "/api/assets", + params={ + "include_tags": "unit-tests,mf-nested", + "metadata_filter": json.dumps({"config": {"optimizer": "sgd"}}), + }, + timeout=120, + ) + b2 = r2.json() + assert r2.status_code == 200 and not b2["assets"] + + +def test_meta_list_of_objects_any_of(http, api_base, asset_factory, make_asset_bytes): + name = "mf_list_objects.safetensors" + tags = ["models", "checkpoints", "unit-tests", "mf-objlist"] + transforms = [{"type": "crop", "size": 128}, {"type": "flip", "p": 0.5}] + asset_factory(name, tags, {"transforms": transforms}, make_asset_bytes(name, 2048)) + + # Any-of for list of objects should match when one element equals the filter object + r1 = http.get( + api_base + "/api/assets", + params={ + "include_tags": "unit-tests,mf-objlist", + "metadata_filter": json.dumps({"transforms": {"type": "flip", "p": 0.5}}), + }, + timeout=120, + ) + b1 = r1.json() + assert r1.status_code == 200 and any(a["name"] == name for a in b1["assets"]) + + # Non-matching object -> no match + r2 = http.get( + api_base + "/api/assets", + params={ + "include_tags": "unit-tests,mf-objlist", + "metadata_filter": json.dumps({"transforms": {"type": "rotate", "deg": 90}}), + }, + timeout=120, + ) + b2 = r2.json() + assert r2.status_code == 200 and not b2["assets"] + + +def test_meta_with_special_and_unicode_keys(http, api_base, asset_factory, make_asset_bytes): + name = "mf_keys_unicode.safetensors" + tags = ["models", "checkpoints", "unit-tests", "mf-keys"] + meta = { + "weird.key": "v1", + "path/like": 7, + "with:colon": True, + "ключ": "значение", + "emoji": "🐍", + } + asset_factory(name, tags, meta, make_asset_bytes(name, 1500)) + + # Match all the special keys + filt = {"weird.key": "v1", "path/like": 7, "with:colon": True, "emoji": "🐍"} + r1 = http.get( + api_base + "/api/assets", + params={"include_tags": "unit-tests,mf-keys", "metadata_filter": json.dumps(filt)}, + timeout=120, + ) + b1 = r1.json() + assert r1.status_code == 200 and any(a["name"] == name for a in b1["assets"]) + + # Unicode key match + r2 = http.get( + api_base + "/api/assets", + params={"include_tags": "unit-tests,mf-keys", "metadata_filter": json.dumps({"ключ": "значение"})}, + timeout=120, + ) + b2 = r2.json() + assert r2.status_code == 200 and any(a["name"] == name for a in b2["assets"]) + + +def test_meta_with_zero_and_boolean_lists(http, api_base, asset_factory, make_asset_bytes): + t = ["models", "checkpoints", "unit-tests", "mf-zero-bool"] + a0 = asset_factory("mf_zero_count.safetensors", t, {"count": 0}, make_asset_bytes("z", 1025)) + a1 = asset_factory("mf_bool_list.safetensors", t, {"choices": [True, False]}, make_asset_bytes("b", 1026)) + + # count == 0 must match only a0 + r1 = http.get( + api_base + "/api/assets", + params={"include_tags": "unit-tests,mf-zero-bool", "metadata_filter": json.dumps({"count": 0})}, + timeout=120, + ) + b1 = r1.json() + assert r1.status_code == 200 + names1 = [a["name"] for a in b1["assets"]] + assert a0["name"] in names1 and a1["name"] not in names1 + + # Any-of list of booleans: True matches second asset + r2 = http.get( + api_base + "/api/assets", + params={"include_tags": "unit-tests,mf-zero-bool", "metadata_filter": json.dumps({"choices": True})}, + timeout=120, + ) + b2 = r2.json() + assert r2.status_code == 200 and any(a["name"] == a1["name"] for a in b2["assets"]) + + +def test_meta_mixed_list_types_and_strictness(http, api_base, asset_factory, make_asset_bytes): + name = "mf_mixed_list.safetensors" + tags = ["models", "checkpoints", "unit-tests", "mf-mixed"] + meta = {"mix": ["1", 1, True, None]} + asset_factory(name, tags, meta, make_asset_bytes(name, 1999)) + + # Should match because 1 is present + r1 = http.get( + api_base + "/api/assets", + params={"include_tags": "unit-tests,mf-mixed", "metadata_filter": json.dumps({"mix": [2, 1]})}, + timeout=120, + ) + b1 = r1.json() + assert r1.status_code == 200 and any(a["name"] == name for a in b1["assets"]) + + # Should NOT match for False + r2 = http.get( + api_base + "/api/assets", + params={"include_tags": "unit-tests,mf-mixed", "metadata_filter": json.dumps({"mix": False})}, + timeout=120, + ) + b2 = r2.json() + assert r2.status_code == 200 and not b2["assets"] + + +def test_meta_unknown_key_and_none_behavior_with_scope_tags(http, api_base, asset_factory, make_asset_bytes): + # Use a unique scope tag to avoid interference + t = ["models", "checkpoints", "unit-tests", "mf-unknown-scope"] + x = asset_factory("mf_unknown_a.safetensors", t, {"k1": 1}, make_asset_bytes("ua")) + y = asset_factory("mf_unknown_b.safetensors", t, {"k2": 2}, make_asset_bytes("ub")) + + # Filtering by unknown key with None should return both (missing key OR null) + r1 = http.get( + api_base + "/api/assets", + params={"include_tags": "unit-tests,mf-unknown-scope", "metadata_filter": json.dumps({"unknown": None})}, + timeout=120, + ) + b1 = r1.json() + assert r1.status_code == 200 + names = {a["name"] for a in b1["assets"]} + assert x["name"] in names and y["name"] in names + + # Filtering by unknown key with concrete value should return none + r2 = http.get( + api_base + "/api/assets", + params={"include_tags": "unit-tests,mf-unknown-scope", "metadata_filter": json.dumps({"unknown": "x"})}, + timeout=120, + ) + b2 = r2.json() + assert r2.status_code == 200 and not b2["assets"] + + +def test_meta_with_tags_include_exclude_and_name_contains(http, api_base, asset_factory, make_asset_bytes): + # alpha matches epoch=1; beta has epoch=2 + a = asset_factory( + "mf_tag_alpha.safetensors", + ["models", "checkpoints", "unit-tests", "mf-tag", "alpha"], + {"epoch": 1}, + make_asset_bytes("alpha"), + ) + b = asset_factory( + "mf_tag_beta.safetensors", + ["models", "checkpoints", "unit-tests", "mf-tag", "beta"], + {"epoch": 2}, + make_asset_bytes("beta"), + ) + + params = { + "include_tags": "unit-tests,mf-tag,alpha", + "exclude_tags": "beta", + "name_contains": "mf_tag_", + "metadata_filter": json.dumps({"epoch": 1}), + } + r = http.get(api_base + "/api/assets", params=params, timeout=120) + body = r.json() + assert r.status_code == 200 + names = [x["name"] for x in body["assets"]] + assert a["name"] in names + assert b["name"] not in names + + +def test_meta_sort_and_paging_under_filter(http, api_base, asset_factory, make_asset_bytes): + # Three assets in same scope with different sizes and a common filter key + t = ["models", "checkpoints", "unit-tests", "mf-sort"] + n1, n2, n3 = "mf_sort_1.safetensors", "mf_sort_2.safetensors", "mf_sort_3.safetensors" + asset_factory(n1, t, {"group": "g"}, make_asset_bytes(n1, 1024)) + asset_factory(n2, t, {"group": "g"}, make_asset_bytes(n2, 2048)) + asset_factory(n3, t, {"group": "g"}, make_asset_bytes(n3, 3072)) + + # Sort by size ascending with paging + q = { + "include_tags": "unit-tests,mf-sort", + "metadata_filter": json.dumps({"group": "g"}), + "sort": "size", "order": "asc", "limit": "2", + } + r1 = http.get(api_base + "/api/assets", params=q, timeout=120) + b1 = r1.json() + assert r1.status_code == 200 + got1 = [a["name"] for a in b1["assets"]] + assert got1 == [n1, n2] + assert b1["has_more"] is True + + q2 = {**q, "offset": "2"} + r2 = http.get(api_base + "/api/assets", params=q2, timeout=120) + b2 = r2.json() + assert r2.status_code == 200 + got2 = [a["name"] for a in b2["assets"]] + assert got2 == [n3] + assert b2["has_more"] is False diff --git a/tests-unit/assets_test/test_prune_orphaned_assets.py b/tests-unit/assets_test/test_prune_orphaned_assets.py new file mode 100644 index 000000000..f602e5a77 --- /dev/null +++ b/tests-unit/assets_test/test_prune_orphaned_assets.py @@ -0,0 +1,141 @@ +import uuid +from pathlib import Path + +import pytest +import requests +from conftest import get_asset_filename, trigger_sync_seed_assets + + +@pytest.fixture +def create_seed_file(comfy_tmp_base_dir: Path): + """Create a file on disk that will become a seed asset after sync.""" + created: list[Path] = [] + + def _create(root: str, scope: str, name: str | None = None, data: bytes = b"TEST") -> Path: + name = name or f"seed_{uuid.uuid4().hex[:8]}.bin" + path = comfy_tmp_base_dir / root / "unit-tests" / scope / name + path.parent.mkdir(parents=True, exist_ok=True) + path.write_bytes(data) + created.append(path) + return path + + yield _create + + for p in created: + p.unlink(missing_ok=True) + + +@pytest.fixture +def find_asset(http: requests.Session, api_base: str): + """Query API for assets matching scope and optional name.""" + def _find(scope: str, name: str | None = None) -> list[dict]: + params = {"include_tags": f"unit-tests,{scope}"} + if name: + params["name_contains"] = name + r = http.get(f"{api_base}/api/assets", params=params, timeout=120) + assert r.status_code == 200 + assets = r.json().get("assets", []) + if name: + return [a for a in assets if a.get("name") == name] + return assets + + return _find + + +@pytest.mark.parametrize("root", ["input", "output"]) +def test_orphaned_seed_asset_is_pruned( + root: str, + create_seed_file, + find_asset, + http: requests.Session, + api_base: str, +): + """Seed asset with deleted file is removed; with file present, it survives.""" + scope = f"prune-{uuid.uuid4().hex[:6]}" + fp = create_seed_file(root, scope) + name = fp.name + + trigger_sync_seed_assets(http, api_base) + assert find_asset(scope, name), "Seed asset should exist" + + fp.unlink() + trigger_sync_seed_assets(http, api_base) + assert not find_asset(scope, name), "Orphaned seed should be pruned" + + +def test_seed_asset_with_file_survives_prune( + create_seed_file, + find_asset, + http: requests.Session, + api_base: str, +): + """Seed asset with file still on disk is NOT pruned.""" + scope = f"keep-{uuid.uuid4().hex[:6]}" + fp = create_seed_file("input", scope) + + trigger_sync_seed_assets(http, api_base) + trigger_sync_seed_assets(http, api_base) + + assert find_asset(scope, fp.name), "Seed with valid file should survive" + + +def test_hashed_asset_not_pruned_when_file_missing( + http: requests.Session, + api_base: str, + comfy_tmp_base_dir: Path, + asset_factory, + make_asset_bytes, +): + """Hashed assets are never deleted by prune, even without file.""" + scope = f"hashed-{uuid.uuid4().hex[:6]}" + data = make_asset_bytes("test", 2048) + a = asset_factory("test.bin", ["input", "unit-tests", scope], {}, data) + + path = comfy_tmp_base_dir / "input" / "unit-tests" / scope / get_asset_filename(a["asset_hash"], ".bin") + path.unlink() + + trigger_sync_seed_assets(http, api_base) + + r = http.get(f"{api_base}/api/assets/{a['id']}", timeout=120) + assert r.status_code == 200, "Hashed asset should NOT be pruned" + + +def test_prune_across_multiple_roots( + create_seed_file, + find_asset, + http: requests.Session, + api_base: str, +): + """Prune correctly handles assets across input and output roots.""" + scope = f"multi-{uuid.uuid4().hex[:6]}" + input_fp = create_seed_file("input", scope, "input.bin") + create_seed_file("output", scope, "output.bin") + + trigger_sync_seed_assets(http, api_base) + assert len(find_asset(scope)) == 2 + + input_fp.unlink() + trigger_sync_seed_assets(http, api_base) + + remaining = find_asset(scope) + assert len(remaining) == 1 + assert remaining[0]["name"] == "output.bin" + + +@pytest.mark.parametrize("dirname", ["100%_done", "my_folder_name", "has spaces"]) +def test_special_chars_in_path_escaped_correctly( + dirname: str, + create_seed_file, + find_asset, + http: requests.Session, + api_base: str, + comfy_tmp_base_dir: Path, +): + """SQL LIKE wildcards (%, _) and spaces in paths don't cause false matches.""" + scope = f"special-{uuid.uuid4().hex[:6]}/{dirname}" + fp = create_seed_file("input", scope) + + trigger_sync_seed_assets(http, api_base) + trigger_sync_seed_assets(http, api_base) + + assert find_asset(scope.split("/")[0], fp.name), "Asset with special chars should survive" diff --git a/tests-unit/assets_test/test_tags.py b/tests-unit/assets_test/test_tags.py new file mode 100644 index 000000000..6b1047802 --- /dev/null +++ b/tests-unit/assets_test/test_tags.py @@ -0,0 +1,225 @@ +import json +import uuid + +import requests + + +def test_tags_present(http: requests.Session, api_base: str, seeded_asset: dict): + # Include zero-usage tags by default + r1 = http.get(api_base + "/api/tags", params={"limit": "50"}, timeout=120) + body1 = r1.json() + assert r1.status_code == 200 + names = [t["name"] for t in body1["tags"]] + # A few system tags from migration should exist: + assert "models" in names + assert "checkpoints" in names + + # Only used tags before we add anything new from this test cycle + r2 = http.get(api_base + "/api/tags", params={"include_zero": "false"}, timeout=120) + body2 = r2.json() + assert r2.status_code == 200 + # We already seeded one asset via fixture, so used tags must be non-empty + used_names = [t["name"] for t in body2["tags"]] + assert "models" in used_names + assert "checkpoints" in used_names + + # Prefix filter should refine the list + r3 = http.get(api_base + "/api/tags", params={"include_zero": "false", "prefix": "uni"}, timeout=120) + b3 = r3.json() + assert r3.status_code == 200 + names3 = [t["name"] for t in b3["tags"]] + assert "unit-tests" in names3 + assert "models" not in names3 # filtered out by prefix + + # Order by name ascending should be stable + r4 = http.get(api_base + "/api/tags", params={"include_zero": "false", "order": "name_asc"}, timeout=120) + b4 = r4.json() + assert r4.status_code == 200 + names4 = [t["name"] for t in b4["tags"]] + assert names4 == sorted(names4) + + +def test_tags_empty_usage(http: requests.Session, api_base: str, asset_factory, make_asset_bytes): + # Baseline: system tags exist when include_zero (default) is true + r1 = http.get(api_base + "/api/tags", params={"limit": "500"}, timeout=120) + body1 = r1.json() + assert r1.status_code == 200 + names = [t["name"] for t in body1["tags"]] + assert "models" in names and "checkpoints" in names + + # Create a short-lived asset under input with a unique custom tag + scope = f"tags-empty-usage-{uuid.uuid4().hex[:6]}" + custom_tag = f"temp-{uuid.uuid4().hex[:8]}" + name = "tag_seed.bin" + _asset = asset_factory( + name, + ["input", "unit-tests", scope, custom_tag], + {}, + make_asset_bytes(name, 512), + ) + + # While the asset exists, the custom tag must appear when include_zero=false + r2 = http.get( + api_base + "/api/tags", + params={"include_zero": "false", "prefix": custom_tag, "limit": "50"}, + timeout=120, + ) + body2 = r2.json() + assert r2.status_code == 200 + used_names = [t["name"] for t in body2["tags"]] + assert custom_tag in used_names + + # Delete the asset so the tag usage drops to zero + rd = http.delete(f"{api_base}/api/assets/{_asset['id']}", timeout=120) + assert rd.status_code == 204 + + # Now the custom tag must not be returned when include_zero=false + r3 = http.get( + api_base + "/api/tags", + params={"include_zero": "false", "prefix": custom_tag, "limit": "50"}, + timeout=120, + ) + body3 = r3.json() + assert r3.status_code == 200 + names_after = [t["name"] for t in body3["tags"]] + assert custom_tag not in names_after + assert not names_after # filtered view should be empty now + + +def test_add_and_remove_tags(http: requests.Session, api_base: str, seeded_asset: dict): + aid = seeded_asset["id"] + + # Add tags with duplicates and mixed case + payload_add = {"tags": ["NewTag", "unit-tests", "newtag", "BETA"]} + r1 = http.post(f"{api_base}/api/assets/{aid}/tags", json=payload_add, timeout=120) + b1 = r1.json() + assert r1.status_code == 200, b1 + # normalized, deduplicated; 'unit-tests' was already present from the seed + assert set(b1["added"]) == {"newtag", "beta"} + assert set(b1["already_present"]) == {"unit-tests"} + assert "newtag" in b1["total_tags"] and "beta" in b1["total_tags"] + + rg = http.get(f"{api_base}/api/assets/{aid}", timeout=120) + g = rg.json() + assert rg.status_code == 200 + tags_now = set(g["tags"]) + assert {"newtag", "beta"}.issubset(tags_now) + + # Remove a tag and a non-existent tag + payload_del = {"tags": ["newtag", "does-not-exist"]} + r2 = http.delete(f"{api_base}/api/assets/{aid}/tags", json=payload_del, timeout=120) + b2 = r2.json() + assert r2.status_code == 200 + assert set(b2["removed"]) == {"newtag"} + assert set(b2["not_present"]) == {"does-not-exist"} + + # Verify remaining tags after deletion + rg2 = http.get(f"{api_base}/api/assets/{aid}", timeout=120) + g2 = rg2.json() + assert rg2.status_code == 200 + tags_later = set(g2["tags"]) + assert "newtag" not in tags_later + assert "beta" in tags_later # still present + + +def test_tags_list_order_and_prefix(http: requests.Session, api_base: str, seeded_asset: dict): + aid = seeded_asset["id"] + h = seeded_asset["asset_hash"] + + # Add both tags to the seeded asset (usage: orderaaa=1, orderbbb=1) + r_add = http.post(f"{api_base}/api/assets/{aid}/tags", json={"tags": ["orderaaa", "orderbbb"]}, timeout=120) + add_body = r_add.json() + assert r_add.status_code == 200, add_body + + # Create another AssetInfo from the same content but tagged ONLY with 'orderbbb'. + payload = { + "hash": h, + "name": "order_only_bbb.safetensors", + "tags": ["input", "unit-tests", "orderbbb"], + "user_metadata": {}, + } + r_copy = http.post(f"{api_base}/api/assets/from-hash", json=payload, timeout=120) + copy_body = r_copy.json() + assert r_copy.status_code == 201, copy_body + + # 1) Default order (count_desc): 'orderbbb' should come before 'orderaaa' + # because it has higher usage (2 vs 1). + r1 = http.get(api_base + "/api/tags", params={"prefix": "order", "include_zero": "false"}, timeout=120) + b1 = r1.json() + assert r1.status_code == 200, b1 + names1 = [t["name"] for t in b1["tags"]] + counts1 = {t["name"]: t["count"] for t in b1["tags"]} + # Both must be present within the prefix subset + assert "orderaaa" in names1 and "orderbbb" in names1 + # Usage of 'orderbbb' must be >= 'orderaaa'; in our setup it's 2 vs 1 + assert counts1["orderbbb"] >= counts1["orderaaa"] + # And with count_desc, 'orderbbb' appears earlier than 'orderaaa' + assert names1.index("orderbbb") < names1.index("orderaaa") + + # 2) name_asc: lexical order should flip the relative order + r2 = http.get( + api_base + "/api/tags", + params={"prefix": "order", "include_zero": "false", "order": "name_asc"}, + timeout=120, + ) + b2 = r2.json() + assert r2.status_code == 200, b2 + names2 = [t["name"] for t in b2["tags"]] + assert "orderaaa" in names2 and "orderbbb" in names2 + assert names2.index("orderaaa") < names2.index("orderbbb") + + # 3) invalid limit rejected (existing negative case retained) + r3 = http.get(api_base + "/api/tags", params={"limit": "1001"}, timeout=120) + b3 = r3.json() + assert r3.status_code == 400 + assert b3["error"]["code"] == "INVALID_QUERY" + + +def test_tags_endpoints_invalid_bodies(http: requests.Session, api_base: str, seeded_asset: dict): + aid = seeded_asset["id"] + + # Add with empty list + r1 = http.post(f"{api_base}/api/assets/{aid}/tags", json={"tags": []}, timeout=120) + b1 = r1.json() + assert r1.status_code == 400 + assert b1["error"]["code"] == "INVALID_BODY" + + # Remove with wrong type + r2 = http.delete(f"{api_base}/api/assets/{aid}/tags", json={"tags": [123]}, timeout=120) + b2 = r2.json() + assert r2.status_code == 400 + assert b2["error"]["code"] == "INVALID_BODY" + + # metadata_filter provided as JSON array should be rejected (must be object) + r3 = http.get( + api_base + "/api/assets", + params={"metadata_filter": json.dumps([{"x": 1}])}, + timeout=120, + ) + b3 = r3.json() + assert r3.status_code == 400 + assert b3["error"]["code"] == "INVALID_QUERY" + + +def test_tags_prefix_treats_underscore_literal( + http, + api_base, + asset_factory, + make_asset_bytes, +): + """'prefix' for /api/tags must treat '_' literally, not as a wildcard.""" + base = f"pref_{uuid.uuid4().hex[:6]}" + tag_ok = f"{base}_ok" # should match prefix=f"{base}_" + tag_bad = f"{base}xok" # must NOT match if '_' is escaped + scope = f"tags-underscore-{uuid.uuid4().hex[:6]}" + + asset_factory("t1.bin", ["input", "unit-tests", scope, tag_ok], {}, make_asset_bytes("t1", 512)) + asset_factory("t2.bin", ["input", "unit-tests", scope, tag_bad], {}, make_asset_bytes("t2", 512)) + + r = http.get(api_base + "/api/tags", params={"include_zero": "false", "prefix": f"{base}_"}, timeout=120) + body = r.json() + assert r.status_code == 200, body + names = [t["name"] for t in body["tags"]] + assert tag_ok in names, f"Expected {tag_ok} to be returned for prefix '{base}_'" + assert tag_bad not in names, f"'{tag_bad}' must not match — '_' is not a wildcard" + assert body["total"] == 1 diff --git a/tests-unit/assets_test/test_uploads.py b/tests-unit/assets_test/test_uploads.py new file mode 100644 index 000000000..137d7391a --- /dev/null +++ b/tests-unit/assets_test/test_uploads.py @@ -0,0 +1,281 @@ +import json +import uuid +from concurrent.futures import ThreadPoolExecutor + +import requests +import pytest + + +def test_upload_ok_duplicate_reference(http: requests.Session, api_base: str, make_asset_bytes): + name = "dup_a.safetensors" + tags = ["models", "checkpoints", "unit-tests", "alpha"] + meta = {"purpose": "dup"} + data = make_asset_bytes(name) + files = {"file": (name, data, "application/octet-stream")} + form = {"tags": json.dumps(tags), "name": name, "user_metadata": json.dumps(meta)} + r1 = http.post(api_base + "/api/assets", data=form, files=files, timeout=120) + a1 = r1.json() + assert r1.status_code == 201, a1 + assert a1["created_new"] is True + + # Second upload with the same data and name should return created_new == False and the same asset + files = {"file": (name, data, "application/octet-stream")} + form = {"tags": json.dumps(tags), "name": name, "user_metadata": json.dumps(meta)} + r2 = http.post(api_base + "/api/assets", data=form, files=files, timeout=120) + a2 = r2.json() + assert r2.status_code == 200, a2 + assert a2["created_new"] is False + assert a2["asset_hash"] == a1["asset_hash"] + assert a2["id"] == a1["id"] # old reference + + # Third upload with the same data but new name should return created_new == False and the new AssetReference + files = {"file": (name, data, "application/octet-stream")} + form = {"tags": json.dumps(tags), "name": name + "_d", "user_metadata": json.dumps(meta)} + r2 = http.post(api_base + "/api/assets", data=form, files=files, timeout=120) + a3 = r2.json() + assert r2.status_code == 200, a3 + assert a3["created_new"] is False + assert a3["asset_hash"] == a1["asset_hash"] + assert a3["id"] != a1["id"] # old reference + + +def test_upload_fastpath_from_existing_hash_no_file(http: requests.Session, api_base: str): + # Seed a small file first + name = "fastpath_seed.safetensors" + tags = ["models", "checkpoints", "unit-tests"] + meta = {} + files = {"file": (name, b"B" * 1024, "application/octet-stream")} + form = {"tags": json.dumps(tags), "name": name, "user_metadata": json.dumps(meta)} + r1 = http.post(api_base + "/api/assets", data=form, files=files, timeout=120) + b1 = r1.json() + assert r1.status_code == 201, b1 + h = b1["asset_hash"] + + # Now POST /api/assets with only hash and no file + files = [ + ("hash", (None, h)), + ("tags", (None, json.dumps(tags))), + ("name", (None, "fastpath_copy.safetensors")), + ("user_metadata", (None, json.dumps({"purpose": "copy"}))), + ] + r2 = http.post(api_base + "/api/assets", files=files, timeout=120) + b2 = r2.json() + assert r2.status_code == 200, b2 # fast path returns 200 with created_new == False + assert b2["created_new"] is False + assert b2["asset_hash"] == h + + +def test_upload_fastpath_with_known_hash_and_file( + http: requests.Session, api_base: str +): + # Seed + files = {"file": ("seed.safetensors", b"C" * 128, "application/octet-stream")} + form = {"tags": json.dumps(["models", "checkpoints", "unit-tests", "fp"]), "name": "seed.safetensors", "user_metadata": json.dumps({})} + r1 = http.post(api_base + "/api/assets", data=form, files=files, timeout=120) + b1 = r1.json() + assert r1.status_code == 201, b1 + h = b1["asset_hash"] + + # Send both file and hash of existing content -> server must drain file and create from hash (200) + files = {"file": ("ignored.bin", b"ignored" * 10, "application/octet-stream")} + form = {"hash": h, "tags": json.dumps(["models", "checkpoints", "unit-tests", "fp"]), "name": "copy_from_hash.safetensors", "user_metadata": json.dumps({})} + r2 = http.post(api_base + "/api/assets", data=form, files=files, timeout=120) + b2 = r2.json() + assert r2.status_code == 200, b2 + assert b2["created_new"] is False + assert b2["asset_hash"] == h + + +def test_upload_multiple_tags_fields_are_merged(http: requests.Session, api_base: str): + data = [ + ("tags", "models,checkpoints"), + ("tags", json.dumps(["unit-tests", "alpha"])), + ("name", "merge.safetensors"), + ("user_metadata", json.dumps({"u": 1})), + ] + files = {"file": ("merge.safetensors", b"B" * 256, "application/octet-stream")} + r1 = http.post(api_base + "/api/assets", data=data, files=files, timeout=120) + created = r1.json() + assert r1.status_code in (200, 201), created + aid = created["id"] + + # Verify all tags are present on the resource + rg = http.get(f"{api_base}/api/assets/{aid}", timeout=120) + detail = rg.json() + assert rg.status_code == 200, detail + tags = set(detail["tags"]) + assert {"models", "checkpoints", "unit-tests", "alpha"}.issubset(tags) + + +@pytest.mark.parametrize("root", ["input", "output"]) +def test_concurrent_upload_identical_bytes_different_names( + root: str, + http: requests.Session, + api_base: str, + make_asset_bytes, +): + """ + Two concurrent uploads of identical bytes but different names. + Expect a single Asset (same hash), two AssetInfo rows, and exactly one created_new=True. + """ + scope = f"concupload-{uuid.uuid4().hex[:6]}" + name1, name2 = "cu_a.bin", "cu_b.bin" + data = make_asset_bytes("concurrent", 4096) + tags = [root, "unit-tests", scope] + + def _do_upload(args): + url, form_data, files_data = args + with requests.Session() as s: + return s.post(url, data=form_data, files=files_data, timeout=120) + + url = api_base + "/api/assets" + form1 = {"tags": json.dumps(tags), "name": name1, "user_metadata": json.dumps({})} + files1 = {"file": (name1, data, "application/octet-stream")} + form2 = {"tags": json.dumps(tags), "name": name2, "user_metadata": json.dumps({})} + files2 = {"file": (name2, data, "application/octet-stream")} + + with ThreadPoolExecutor(max_workers=2) as executor: + futures = list(executor.map(_do_upload, [(url, form1, files1), (url, form2, files2)])) + r1, r2 = futures + + b1, b2 = r1.json(), r2.json() + assert r1.status_code in (200, 201), b1 + assert r2.status_code in (200, 201), b2 + assert b1["asset_hash"] == b2["asset_hash"] + assert b1["id"] != b2["id"] + + created_flags = sorted([bool(b1.get("created_new")), bool(b2.get("created_new"))]) + assert created_flags == [False, True] + + rl = http.get( + api_base + "/api/assets", + params={"include_tags": f"unit-tests,{scope}", "sort": "name"}, + timeout=120, + ) + bl = rl.json() + assert rl.status_code == 200, bl + names = [a["name"] for a in bl.get("assets", [])] + assert set([name1, name2]).issubset(names) + + +def test_create_from_hash_endpoint_404(http: requests.Session, api_base: str): + payload = { + "hash": "blake3:" + "0" * 64, + "name": "nonexistent.bin", + "tags": ["models", "checkpoints", "unit-tests"], + } + r = http.post(api_base + "/api/assets/from-hash", json=payload, timeout=120) + body = r.json() + assert r.status_code == 404 + assert body["error"]["code"] == "ASSET_NOT_FOUND" + + +def test_upload_zero_byte_rejected(http: requests.Session, api_base: str): + files = {"file": ("empty.safetensors", b"", "application/octet-stream")} + form = {"tags": json.dumps(["models", "checkpoints", "unit-tests", "edge"]), "name": "empty.safetensors", "user_metadata": json.dumps({})} + r = http.post(api_base + "/api/assets", data=form, files=files, timeout=120) + body = r.json() + assert r.status_code == 400 + assert body["error"]["code"] == "EMPTY_UPLOAD" + + +def test_upload_invalid_root_tag_rejected(http: requests.Session, api_base: str): + files = {"file": ("badroot.bin", b"A" * 64, "application/octet-stream")} + form = {"tags": json.dumps(["not-a-root", "whatever"]), "name": "badroot.bin", "user_metadata": json.dumps({})} + r = http.post(api_base + "/api/assets", data=form, files=files, timeout=120) + body = r.json() + assert r.status_code == 400 + assert body["error"]["code"] == "INVALID_BODY" + + +def test_upload_user_metadata_must_be_json(http: requests.Session, api_base: str): + files = {"file": ("badmeta.bin", b"A" * 128, "application/octet-stream")} + form = {"tags": json.dumps(["models", "checkpoints", "unit-tests", "edge"]), "name": "badmeta.bin", "user_metadata": "{not json}"} + r = http.post(api_base + "/api/assets", data=form, files=files, timeout=120) + body = r.json() + assert r.status_code == 400 + assert body["error"]["code"] == "INVALID_BODY" + + +def test_upload_requires_multipart(http: requests.Session, api_base: str): + r = http.post(api_base + "/api/assets", json={"foo": "bar"}, timeout=120) + body = r.json() + assert r.status_code == 415 + assert body["error"]["code"] == "UNSUPPORTED_MEDIA_TYPE" + + +def test_upload_missing_file_and_hash(http: requests.Session, api_base: str): + files = [ + ("tags", (None, json.dumps(["models", "checkpoints", "unit-tests"]))), + ("name", (None, "x.safetensors")), + ] + r = http.post(api_base + "/api/assets", files=files, timeout=120) + body = r.json() + assert r.status_code == 400 + assert body["error"]["code"] == "MISSING_FILE" + + +def test_upload_models_unknown_category(http: requests.Session, api_base: str): + files = {"file": ("m.safetensors", b"A" * 128, "application/octet-stream")} + form = {"tags": json.dumps(["models", "no_such_category", "unit-tests"]), "name": "m.safetensors"} + r = http.post(api_base + "/api/assets", data=form, files=files, timeout=120) + body = r.json() + assert r.status_code == 400 + assert body["error"]["code"] == "INVALID_BODY" + assert body["error"]["message"].startswith("unknown models category") + + +def test_upload_models_requires_category(http: requests.Session, api_base: str): + files = {"file": ("nocat.safetensors", b"A" * 64, "application/octet-stream")} + form = {"tags": json.dumps(["models"]), "name": "nocat.safetensors", "user_metadata": json.dumps({})} + r = http.post(api_base + "/api/assets", data=form, files=files, timeout=120) + body = r.json() + assert r.status_code == 400 + assert body["error"]["code"] == "INVALID_BODY" + + +def test_upload_tags_traversal_guard(http: requests.Session, api_base: str): + files = {"file": ("evil.safetensors", b"A" * 256, "application/octet-stream")} + form = {"tags": json.dumps(["models", "checkpoints", "unit-tests", "..", "zzz"]), "name": "evil.safetensors"} + r = http.post(api_base + "/api/assets", data=form, files=files, timeout=120) + body = r.json() + assert r.status_code == 400 + assert body["error"]["code"] in ("BAD_REQUEST", "INVALID_BODY") + + +@pytest.mark.parametrize("root", ["input", "output"]) +def test_duplicate_upload_same_display_name_does_not_clobber( + root: str, + http: requests.Session, + api_base: str, + asset_factory, + make_asset_bytes, +): + """ + Two uploads use the same tags and the same display name but different bytes. + With hash-based filenames, they must NOT overwrite each other. Both assets + remain accessible and serve their original content. + """ + scope = f"dup-path-{uuid.uuid4().hex[:6]}" + display_name = "same_display.bin" + + d1 = make_asset_bytes(scope + "-v1", 1536) + d2 = make_asset_bytes(scope + "-v2", 2048) + tags = [root, "unit-tests", scope] + + first = asset_factory(display_name, tags, {}, d1) + second = asset_factory(display_name, tags, {}, d2) + + assert first["id"] != second["id"] + assert first["asset_hash"] != second["asset_hash"] # different content + assert first["name"] == second["name"] == display_name + + # Both must be independently retrievable + r1 = http.get(f"{api_base}/api/assets/{first['id']}/content", timeout=120) + b1 = r1.content + assert r1.status_code == 200 + assert b1 == d1 + r2 = http.get(f"{api_base}/api/assets/{second['id']}/content", timeout=120) + b2 = r2.content + assert r2.status_code == 200 + assert b2 == d2 diff --git a/tests-unit/requirements.txt b/tests-unit/requirements.txt index 3a6790ee0..2355b8000 100644 --- a/tests-unit/requirements.txt +++ b/tests-unit/requirements.txt @@ -2,3 +2,4 @@ pytest>=7.8.0 pytest-aiohttp pytest-asyncio websocket-client +blake3