From ee615ac26916f15290b63a20d4feb07ea702d5da Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 5 Jul 2025 11:34:57 -0700 Subject: [PATCH 001/243] Add warning when loading file unsafely. (#8800) --- comfy/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/comfy/utils.py b/comfy/utils.py index 1f8d71292..47981d8f6 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -77,6 +77,7 @@ def load_torch_file(ckpt, safe_load=False, device=None, return_metadata=False): if safe_load or ALWAYS_SAFE_LOAD: pl_sd = torch.load(ckpt, map_location=device, weights_only=True, **torch_args) else: + logging.warning("WARNING: loading {} unsafely, upgrade your pytorch to 2.4 or newer to load this file safely.".format(ckpt)) pl_sd = torch.load(ckpt, map_location=device, pickle_module=comfy.checkpoint_pickle) if "state_dict" in pl_sd: sd = pl_sd["state_dict"] From 75d327abd5ee8fb6a1ec992ffdb8e43926574a63 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 6 Jul 2025 04:07:39 -0700 Subject: [PATCH 002/243] Remove some useless code. (#8812) --- comfy/gligen.py | 47 +---------------------------------------------- 1 file changed, 1 insertion(+), 46 deletions(-) diff --git a/comfy/gligen.py b/comfy/gligen.py index 161d8a5e5..1d7b6c2f4 100644 --- a/comfy/gligen.py +++ b/comfy/gligen.py @@ -1,55 +1,10 @@ import math import torch from torch import nn -from .ldm.modules.attention import CrossAttention -from inspect import isfunction +from .ldm.modules.attention import CrossAttention, FeedForward import comfy.ops ops = comfy.ops.manual_cast -def exists(val): - return val is not None - - -def uniq(arr): - return{el: True for el in arr}.keys() - - -def default(val, d): - if exists(val): - return val - return d() if isfunction(d) else d - - -# feedforward -class GEGLU(nn.Module): - def __init__(self, dim_in, dim_out): - super().__init__() - self.proj = ops.Linear(dim_in, dim_out * 2) - - def forward(self, x): - x, gate = self.proj(x).chunk(2, dim=-1) - return x * torch.nn.functional.gelu(gate) - - -class FeedForward(nn.Module): - def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.): - super().__init__() - inner_dim = int(dim * mult) - dim_out = default(dim_out, dim) - project_in = nn.Sequential( - ops.Linear(dim, inner_dim), - nn.GELU() - ) if not glu else GEGLU(dim, inner_dim) - - self.net = nn.Sequential( - project_in, - nn.Dropout(dropout), - ops.Linear(inner_dim, dim_out) - ) - - def forward(self, x): - return self.net(x) - class GatedCrossAttentionDense(nn.Module): def __init__(self, query_dim, context_dim, n_heads, d_head): From 7eab7d29447429645c4c27a7e6c85d2be8ed79ff Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 6 Jul 2025 11:01:32 -0700 Subject: [PATCH 003/243] Remove dependency on deprecated torchaudio.save function (#8815) --- comfy_extras/nodes_audio.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py index 49af1eae4..aca09e4cc 100644 --- a/comfy_extras/nodes_audio.py +++ b/comfy_extras/nodes_audio.py @@ -133,14 +133,6 @@ def save_audio(self, audio, filename_prefix="ComfyUI", format="flac", prompt=Non if sample_rate != audio["sample_rate"]: waveform = torchaudio.functional.resample(waveform, audio["sample_rate"], sample_rate) - # Create in-memory WAV buffer - wav_buffer = io.BytesIO() - torchaudio.save(wav_buffer, waveform, 
sample_rate, format="WAV") - wav_buffer.seek(0) # Rewind for reading - - # Use PyAV to convert and add metadata - input_container = av.open(wav_buffer) - # Create output with specified format output_buffer = io.BytesIO() output_container = av.open(output_buffer, mode='w', format=format) @@ -150,7 +142,6 @@ def save_audio(self, audio, filename_prefix="ComfyUI", format="flac", prompt=Non output_container.metadata[key] = value # Set up the output stream with appropriate properties - input_container.streams.audio[0] if format == "opus": out_stream = output_container.add_stream("libopus", rate=sample_rate) if quality == "64k": @@ -175,18 +166,15 @@ def save_audio(self, audio, filename_prefix="ComfyUI", format="flac", prompt=Non else: #format == "flac": out_stream = output_container.add_stream("flac", rate=sample_rate) - - # Copy frames from input to output - for frame in input_container.decode(audio=0): - frame.pts = None # Let PyAV handle timestamps - output_container.mux(out_stream.encode(frame)) + frame = av.AudioFrame.from_ndarray(waveform.movedim(0, 1).reshape(1, -1).float().numpy(), format='flt', layout='mono' if waveform.shape[0] == 1 else 'stereo') + frame.sample_rate = sample_rate + output_container.mux(out_stream.encode(frame)) # Flush encoder output_container.mux(out_stream.encode(None)) # Close containers output_container.close() - input_container.close() # Write the output to file output_buffer.seek(0) From e740dfd8061e696c9701c917e05649eca9b4f631 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 7 Jul 2025 00:16:00 -0700 Subject: [PATCH 004/243] Fix warning in audio save nodes. (#8818) --- comfy_extras/nodes_audio.py | 1 + 1 file changed, 1 insertion(+) diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py index aca09e4cc..8cd647846 100644 --- a/comfy_extras/nodes_audio.py +++ b/comfy_extras/nodes_audio.py @@ -168,6 +168,7 @@ def save_audio(self, audio, filename_prefix="ComfyUI", format="flac", prompt=Non frame = av.AudioFrame.from_ndarray(waveform.movedim(0, 1).reshape(1, -1).float().numpy(), format='flt', layout='mono' if waveform.shape[0] == 1 else 'stereo') frame.sample_rate = sample_rate + frame.pts = 0 output_container.mux(out_stream.encode(frame)) # Flush encoder From 059cd38aa21aa6b91ca117a92218532fcd6e9c75 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Tue, 8 Jul 2025 08:43:56 +0800 Subject: [PATCH 005/243] Update template and node docs package version (#8825) --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 27d385389..03e29ab44 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.32 -comfyui-embedded-docs==0.2.3 +comfyui-workflow-templates==0.1.33 +comfyui-embedded-docs==0.2.4 torch torchsde torchvision From 1359c969e4962c2373eaccdf6fdcc9214c6957c2 Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Mon, 7 Jul 2025 20:35:41 -0700 Subject: [PATCH 006/243] Update template to 0.1.34 (#8829) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 03e29ab44..eb51479f6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.33 +comfyui-workflow-templates==0.1.34 comfyui-embedded-docs==0.2.4 torch torchsde From b5e97db9ac69637dc0e29d3e6259bb2a333a0b27 Mon Sep 17 00:00:00 2001 From: 
ComfyUI Wiki Date: Tue, 8 Jul 2025 20:52:02 +0800 Subject: [PATCH 007/243] Update template to 0.1.35 (#8831) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index eb51479f6..19a40ca0e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.34 +comfyui-workflow-templates==0.1.35 comfyui-embedded-docs==0.2.4 torch torchsde From 9fd0cd7cf7534d44624457e37b0902b2cd6cb04f Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Tue, 8 Jul 2025 05:54:30 -0700 Subject: [PATCH 008/243] Add Moonvalley nodes (#8832) --- comfy_api/input/video_types.py | 19 +- comfy_api/input_impl/video_types.py | 9 + comfy_api_nodes/apis/__init__.py | 171 +++++++- comfy_api_nodes/nodes_moonvalley.py | 639 ++++++++++++++++++++++++++++ nodes.py | 1 + 5 files changed, 837 insertions(+), 2 deletions(-) create mode 100644 comfy_api_nodes/nodes_moonvalley.py diff --git a/comfy_api/input/video_types.py b/comfy_api/input/video_types.py index dc22d34ff..bb936e0a4 100644 --- a/comfy_api/input/video_types.py +++ b/comfy_api/input/video_types.py @@ -1,6 +1,7 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import Optional +from typing import Optional, Union +import io from comfy_api.util import VideoContainer, VideoCodec, VideoComponents class VideoInput(ABC): @@ -31,6 +32,22 @@ class VideoInput(ABC): """ pass + def get_stream_source(self) -> Union[str, io.BytesIO]: + """ + Get a streamable source for the video. This allows processing without + loading the entire video into memory. + + Returns: + Either a file path (str) or a BytesIO object that can be opened with av. + + Default implementation creates a BytesIO buffer, but subclasses should + override this for better performance when possible. + """ + buffer = io.BytesIO() + self.save_to(buffer) + buffer.seek(0) + return buffer + # Provide a default implementation, but subclasses can provide optimized versions # if possible. def get_dimensions(self) -> tuple[int, int]: diff --git a/comfy_api/input_impl/video_types.py b/comfy_api/input_impl/video_types.py index 197f6558c..9ae818f4e 100644 --- a/comfy_api/input_impl/video_types.py +++ b/comfy_api/input_impl/video_types.py @@ -64,6 +64,15 @@ class VideoFromFile(VideoInput): """ self.__file = file + def get_stream_source(self) -> str | io.BytesIO: + """ + Return the underlying file source for efficient streaming. + This avoids unnecessary memory copies when the source is already a file path. + """ + if isinstance(self.__file, io.BytesIO): + self.__file.seek(0) + return self.__file + def get_dimensions(self) -> tuple[int, int]: """ Returns the dimensions of the video input. 
diff --git a/comfy_api_nodes/apis/__init__.py b/comfy_api_nodes/apis/__init__.py index e38d38cc9..086028abe 100644 --- a/comfy_api_nodes/apis/__init__.py +++ b/comfy_api_nodes/apis/__init__.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: filtered-openapi.yaml -# timestamp: 2025-05-19T21:38:55+00:00 +# timestamp: 2025-07-06T09:47:31+00:00 from __future__ import annotations @@ -1355,6 +1355,158 @@ class ModelResponseProperties(BaseModel): ) +class Keyframes(BaseModel): + image_url: Optional[str] = None + + +class MoonvalleyPromptResponse(BaseModel): + error: Optional[Dict[str, Any]] = None + frame_conditioning: Optional[Dict[str, Any]] = None + id: Optional[str] = None + inference_params: Optional[Dict[str, Any]] = None + meta: Optional[Dict[str, Any]] = None + model_params: Optional[Dict[str, Any]] = None + output_url: Optional[str] = None + prompt_text: Optional[str] = None + status: Optional[str] = None + + +class MoonvalleyTextToVideoInferenceParams(BaseModel): + add_quality_guidance: Optional[bool] = Field( + True, description='Whether to add quality guidance' + ) + caching_coefficient: Optional[float] = Field( + 0.3, description='Caching coefficient for optimization' + ) + caching_cooldown: Optional[int] = Field( + 3, description='Number of caching cooldown steps' + ) + caching_warmup: Optional[int] = Field( + 3, description='Number of caching warmup steps' + ) + clip_value: Optional[float] = Field( + 3, description='CLIP value for generation control' + ) + conditioning_frame_index: Optional[int] = Field( + 0, description='Index of the conditioning frame' + ) + cooldown_steps: Optional[int] = Field( + None, description='Number of cooldown steps (calculated based on num_frames)' + ) + fps: Optional[int] = Field( + 24, description='Frames per second of the generated video' + ) + guidance_scale: Optional[float] = Field( + 12.5, description='Guidance scale for generation control' + ) + height: Optional[int] = Field( + 1080, description='Height of the generated video in pixels' + ) + negative_prompt: Optional[str] = Field(None, description='Negative prompt text') + num_frames: Optional[int] = Field(64, description='Number of frames to generate') + seed: Optional[int] = Field( + None, description='Random seed for generation (default: random)' + ) + shift_value: Optional[float] = Field( + 3, description='Shift value for generation control' + ) + steps: Optional[int] = Field(80, description='Number of denoising steps') + use_guidance_schedule: Optional[bool] = Field( + True, description='Whether to use guidance scheduling' + ) + use_negative_prompts: Optional[bool] = Field( + False, description='Whether to use negative prompts' + ) + use_timestep_transform: Optional[bool] = Field( + True, description='Whether to use timestep transformation' + ) + warmup_steps: Optional[int] = Field( + None, description='Number of warmup steps (calculated based on num_frames)' + ) + width: Optional[int] = Field( + 1920, description='Width of the generated video in pixels' + ) + + +class MoonvalleyTextToVideoRequest(BaseModel): + image_url: Optional[str] = None + inference_params: Optional[MoonvalleyTextToVideoInferenceParams] = None + prompt_text: Optional[str] = None + webhook_url: Optional[str] = None + + +class MoonvalleyUploadFileRequest(BaseModel): + file: Optional[StrictBytes] = None + + +class MoonvalleyUploadFileResponse(BaseModel): + access_url: Optional[str] = None + + +class MoonvalleyVideoToVideoInferenceParams(BaseModel): + add_quality_guidance: Optional[bool] = Field( + True, 
description='Whether to add quality guidance' + ) + caching_coefficient: Optional[float] = Field( + 0.3, description='Caching coefficient for optimization' + ) + caching_cooldown: Optional[int] = Field( + 3, description='Number of caching cooldown steps' + ) + caching_warmup: Optional[int] = Field( + 3, description='Number of caching warmup steps' + ) + clip_value: Optional[float] = Field( + 3, description='CLIP value for generation control' + ) + conditioning_frame_index: Optional[int] = Field( + 0, description='Index of the conditioning frame' + ) + cooldown_steps: Optional[int] = Field( + None, description='Number of cooldown steps (calculated based on num_frames)' + ) + guidance_scale: Optional[float] = Field( + 12.5, description='Guidance scale for generation control' + ) + negative_prompt: Optional[str] = Field(None, description='Negative prompt text') + seed: Optional[int] = Field( + None, description='Random seed for generation (default: random)' + ) + shift_value: Optional[float] = Field( + 3, description='Shift value for generation control' + ) + steps: Optional[int] = Field(80, description='Number of denoising steps') + use_guidance_schedule: Optional[bool] = Field( + True, description='Whether to use guidance scheduling' + ) + use_negative_prompts: Optional[bool] = Field( + False, description='Whether to use negative prompts' + ) + use_timestep_transform: Optional[bool] = Field( + True, description='Whether to use timestep transformation' + ) + warmup_steps: Optional[int] = Field( + None, description='Number of warmup steps (calculated based on num_frames)' + ) + + +class ControlType(str, Enum): + motion_control = 'motion_control' + pose_control = 'pose_control' + + +class MoonvalleyVideoToVideoRequest(BaseModel): + control_type: ControlType = Field( + ..., description='Supported types for video control' + ) + inference_params: Optional[MoonvalleyVideoToVideoInferenceParams] = None + prompt_text: str = Field(..., description='Describes the video to generate') + video_url: str = Field(..., description='Url to control video') + webhook_url: Optional[str] = Field( + None, description='Optional webhook URL for notifications' + ) + + class Moderation(str, Enum): low = 'low' auto = 'auto' @@ -3107,6 +3259,23 @@ class LumaUpscaleVideoGenerationRequest(BaseModel): resolution: Optional[LumaVideoModelOutputResolution] = None +class MoonvalleyImageToVideoRequest(MoonvalleyTextToVideoRequest): + keyframes: Optional[Dict[str, Keyframes]] = None + + +class MoonvalleyResizeVideoRequest(MoonvalleyVideoToVideoRequest): + frame_position: Optional[List[int]] = Field(None, max_length=2, min_length=2) + frame_resolution: Optional[List[int]] = Field(None, max_length=2, min_length=2) + scale: Optional[List[int]] = Field(None, max_length=2, min_length=2) + + +class MoonvalleyTextToImageRequest(BaseModel): + image_url: Optional[str] = None + inference_params: Optional[MoonvalleyTextToVideoInferenceParams] = None + prompt_text: Optional[str] = None + webhook_url: Optional[str] = None + + class OutputContent(RootModel[Union[OutputTextContent, OutputAudioContent]]): root: Union[OutputTextContent, OutputAudioContent] diff --git a/comfy_api_nodes/nodes_moonvalley.py b/comfy_api_nodes/nodes_moonvalley.py new file mode 100644 index 000000000..6e937411c --- /dev/null +++ b/comfy_api_nodes/nodes_moonvalley.py @@ -0,0 +1,639 @@ +import logging +from typing import Any, Callable, Optional, TypeVar +import random +import torch +from comfy_api_nodes.util.validation_utils import get_image_dimensions, 
validate_image_dimensions, validate_video_dimensions + + +from comfy_api_nodes.apis import ( + MoonvalleyTextToVideoRequest, + MoonvalleyTextToVideoInferenceParams, + MoonvalleyVideoToVideoInferenceParams, + MoonvalleyVideoToVideoRequest, + MoonvalleyPromptResponse +) +from comfy_api_nodes.apis.client import ( + ApiEndpoint, + HttpMethod, + SynchronousOperation, + PollingOperation, + EmptyRequest, +) +from comfy_api_nodes.apinode_utils import ( + download_url_to_video_output, + upload_images_to_comfyapi, + upload_video_to_comfyapi, +) +from comfy_api_nodes.mapper_utils import model_field_to_node_input + +from comfy_api.input.video_types import VideoInput +from comfy.comfy_types.node_typing import IO +from comfy_api.input_impl import VideoFromFile +import av +import io + +API_UPLOADS_ENDPOINT = "/proxy/moonvalley/uploads" +API_PROMPTS_ENDPOINT = "/proxy/moonvalley/prompts" +API_VIDEO2VIDEO_ENDPOINT = "/proxy/moonvalley/prompts/video-to-video" +API_TXT2VIDEO_ENDPOINT = "/proxy/moonvalley/prompts/text-to-video" +API_IMG2VIDEO_ENDPOINT = "/proxy/moonvalley/prompts/image-to-video" + +MIN_WIDTH = 300 +MIN_HEIGHT = 300 + +MAX_WIDTH = 10000 +MAX_HEIGHT = 10000 + +MIN_VID_WIDTH = 300 +MIN_VID_HEIGHT = 300 + +MAX_VID_WIDTH = 10000 +MAX_VID_HEIGHT = 10000 + +MAX_VIDEO_SIZE = 1024 * 1024 * 1024 # 1 GB max for in-memory video processing + +MOONVALLEY_MAREY_MAX_PROMPT_LENGTH = 5000 +R = TypeVar("R") +class MoonvalleyApiError(Exception): + """Base exception for Moonvalley API errors.""" + pass + +def is_valid_task_creation_response(response: MoonvalleyPromptResponse) -> bool: + """Verifies that the initial response contains a task ID.""" + return bool(response.id) + +def validate_task_creation_response(response) -> None: + if not is_valid_task_creation_response(response): + error_msg = f"Moonvalley Marey API: Initial request failed. Code: {response.code}, Message: {response.message}, Data: {response}" + logging.error(error_msg) + raise MoonvalleyApiError(error_msg) + +def get_video_from_response(response): + video = response.output_url + logging.info( + "Moonvalley Marey API: Task %s succeeded. Video URL: %s", response.id, video + ) + return video + + +def get_video_url_from_response(response) -> Optional[str]: + """Returns the first video url from the Moonvalley video generation task result. + Will not raise an error if the response is not valid. 
+ """ + if response: + return str(get_video_from_response(response)) + else: + return None + + +def poll_until_finished( + auth_kwargs: dict[str, str], + api_endpoint: ApiEndpoint[Any, R], + result_url_extractor: Optional[Callable[[R], str]] = None, + node_id: Optional[str] = None, +) -> R: + """Polls the Moonvalley API endpoint until the task reaches a terminal state, then returns the response.""" + return PollingOperation( + poll_endpoint=api_endpoint, + completed_statuses=[ + "completed", + ], + max_poll_attempts=240, # 64 minutes with 16s interval + poll_interval=16.0, + failed_statuses=["error"], + status_extractor=lambda response: ( + response.status + if response and response.status + else None + ), + auth_kwargs=auth_kwargs, + result_url_extractor=result_url_extractor, + node_id=node_id, + ).execute() + +def validate_prompts(prompt:str, negative_prompt: str, max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH): + """Verifies that the prompt isn't empty and that neither prompt is too long.""" + if not prompt: + raise ValueError("Positive prompt is empty") + if len(prompt) > max_length: + raise ValueError(f"Positive prompt is too long: {len(prompt)} characters") + if negative_prompt and len(negative_prompt) > max_length: + raise ValueError( + f"Negative prompt is too long: {len(negative_prompt)} characters" + ) + return True + +def validate_input_media(width, height, with_frame_conditioning, num_frames_in=None): + # inference validation + # T = num_frames + # in all cases, the following must be true: T divisible by 16 and H,W by 8. in addition... + # with image conditioning: H*W must be divisible by 8192 + # without image conditioning: T divisible by 32 + if num_frames_in and not num_frames_in % 16 == 0 : + return False, ( + "The input video total frame count must be divisible by 16!" + ) + + if height % 8 != 0 or width % 8 != 0: + return False, ( + f"Height ({height}) and width ({width}) must be " "divisible by 8" + ) + + if with_frame_conditioning: + if (height * width) % 8192 != 0: + return False, ( + f"Height * width ({height * width}) must be " + "divisible by 8192 for frame conditioning" + ) + else: + if num_frames_in and not num_frames_in % 32 == 0 : + return False, ( + "The input video total frame count must be divisible by 32!" 
+ ) + + +def validate_input_image(image: torch.Tensor, with_frame_conditioning: bool=False) -> None: + """ + Validates the input image adheres to the expectations of the API: + - The image resolution should not be less than 300*300px + - The aspect ratio of the image should be between 1:2.5 ~ 2.5:1 + + """ + height, width = get_image_dimensions(image) + validate_input_media(width, height, with_frame_conditioning ) + validate_image_dimensions(image, min_width=300, min_height=300, max_height=MAX_HEIGHT, max_width=MAX_WIDTH) + +def validate_input_video(video: VideoInput, num_frames_out: int, with_frame_conditioning: bool=False): + try: + width, height = video.get_dimensions() + except Exception as e: + logging.error("Error getting dimensions of video: %s", e) + raise ValueError(f"Cannot get video dimensions: {e}") from e + + validate_input_media(width, height, with_frame_conditioning) + validate_video_dimensions(video, min_width=MIN_VID_WIDTH, min_height=MIN_VID_HEIGHT, max_width=MAX_VID_WIDTH, max_height=MAX_VID_HEIGHT) + + trimmed_video = validate_input_video_length(video, num_frames_out) + return trimmed_video + + +def validate_input_video_length(video: VideoInput, num_frames: int): + + if video.get_duration() > 60: + raise MoonvalleyApiError("Input Video lenth should be less than 1min. Please trim.") + + if num_frames == 128: + if video.get_duration() < 5: + raise MoonvalleyApiError("Input Video length is less than 5s. Please use a video longer than or equal to 5s.") + if video.get_duration() > 5: + # trim video to 5s + video = trim_video(video, 5) + if num_frames == 256: + if video.get_duration() < 10: + raise MoonvalleyApiError("Input Video length is less than 10s. Please use a video longer than or equal to 10s.") + if video.get_duration() > 10: + # trim video to 10s + video = trim_video(video, 10) + return video + +def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: + """ + Returns a new VideoInput object trimmed from the beginning to the specified duration, + using av to avoid loading entire video into memory. 
+ + Args: + video: Input video to trim + duration_sec: Duration in seconds to keep from the beginning + + Returns: + VideoFromFile object that owns the output buffer + """ + output_buffer = io.BytesIO() + + input_container = None + output_container = None + + try: + # Get the stream source - this avoids loading entire video into memory + # when the source is already a file path + input_source = video.get_stream_source() + + # Open containers + input_container = av.open(input_source, mode='r') + output_container = av.open(output_buffer, mode='w', format='mp4') + + # Set up output streams for re-encoding + video_stream = None + audio_stream = None + + for stream in input_container.streams: + logging.info(f"Found stream: type={stream.type}, class={type(stream)}") + if isinstance(stream, av.VideoStream): + # Create output video stream with same parameters + video_stream = output_container.add_stream('h264', rate=stream.average_rate) + video_stream.width = stream.width + video_stream.height = stream.height + video_stream.pix_fmt = 'yuv420p' + logging.info(f"Added video stream: {stream.width}x{stream.height} @ {stream.average_rate}fps") + elif isinstance(stream, av.AudioStream): + # Create output audio stream with same parameters + audio_stream = output_container.add_stream('aac', rate=stream.sample_rate) + audio_stream.sample_rate = stream.sample_rate + audio_stream.layout = stream.layout + logging.info(f"Added audio stream: {stream.sample_rate}Hz, {stream.channels} channels") + + # Calculate target frame count that's divisible by 32 + fps = input_container.streams.video[0].average_rate + estimated_frames = int(duration_sec * fps) + target_frames = (estimated_frames // 32) * 32 # Round down to nearest multiple of 32 + + if target_frames == 0: + raise ValueError("Video too short: need at least 32 frames for Moonvalley") + + frame_count = 0 + audio_frame_count = 0 + + # Decode and re-encode video frames + if video_stream: + for frame in input_container.decode(video=0): + if frame_count >= target_frames: + break + + # Re-encode frame + for packet in video_stream.encode(frame): + output_container.mux(packet) + frame_count += 1 + + # Flush encoder + for packet in video_stream.encode(): + output_container.mux(packet) + + logging.info(f"Encoded {frame_count} video frames (target: {target_frames})") + + # Decode and re-encode audio frames + if audio_stream: + input_container.seek(0) # Reset to beginning for audio + for frame in input_container.decode(audio=0): + if frame.time >= duration_sec: + break + + # Re-encode frame + for packet in audio_stream.encode(frame): + output_container.mux(packet) + audio_frame_count += 1 + + # Flush encoder + for packet in audio_stream.encode(): + output_container.mux(packet) + + logging.info(f"Encoded {audio_frame_count} audio frames") + + # Close containers + output_container.close() + input_container.close() + + + # Return as VideoFromFile using the buffer + output_buffer.seek(0) + return VideoFromFile(output_buffer) + + except Exception as e: + # Clean up on error + if input_container is not None: + input_container.close() + if output_container is not None: + output_container.close() + raise RuntimeError(f"Failed to trim video: {str(e)}") from e + +# --- BaseMoonvalleyVideoNode --- +class BaseMoonvalleyVideoNode: + def parseWidthHeightFromRes(self, resolution: str): + # Accepts a string like "16:9 (1920 x 1080)" and returns width, height as a dict + res_map = { + "16:9 (1920 x 1080)": {"width": 1920, "height": 1080}, + "9:16 (1080 x 1920)": {"width": 1080, "height": 
1920}, + "1:1 (1152 x 1152)": {"width": 1152, "height": 1152}, + "4:3 (1440 x 1080)": {"width": 1440, "height": 1080}, + "3:4 (1080 x 1440)": {"width": 1080, "height": 1440}, + "21:9 (2560 x 1080)": {"width": 2560, "height": 1080}, + } + if resolution in res_map: + return res_map[resolution] + else: + # Default to 1920x1080 if unknown + return {"width": 1920, "height": 1080} + + def parseControlParameter(self, value): + control_map = { + "Motion Transfer": "motion_control", + "Canny": "canny_control", + "Pose Transfer": "pose_control", + "Depth": "depth_control" + } + if value in control_map: + return control_map[value] + else: + return control_map["Motion Transfer"] + + def get_response( + self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None + ) -> MoonvalleyPromptResponse: + return poll_until_finished( + auth_kwargs, + ApiEndpoint( + path=f"{API_PROMPTS_ENDPOINT}/{task_id}", + method=HttpMethod.GET, + request_model=EmptyRequest, + response_model=MoonvalleyPromptResponse, + ), + result_url_extractor=get_video_url_from_response, + node_id=node_id, + ) + + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "prompt": model_field_to_node_input( + IO.STRING, MoonvalleyTextToVideoRequest, "prompt_text", + multiline=True + ), + "negative_prompt": model_field_to_node_input( + IO.STRING, + MoonvalleyTextToVideoInferenceParams, + "negative_prompt", + multiline=True, + default="gopro, bright, contrast, static, overexposed, bright, vignette, artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, flare, saturation, distorted, warped, wide angle, contrast, saturated, vibrant, glowing, cross dissolve, texture, videogame, saturation, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, blown out, horrible, blurry, worst quality, bad, transition, dissolve, cross-dissolve, melt, fade in, fade out, wobbly, weird, low quality, plastic, stock footage, video camera, boring, static", + ), + + "resolution": (IO.COMBO, { + "options": ["16:9 (1920 x 1080)", + "9:16 (1080 x 1920)", + "1:1 (1152 x 1152)", + "4:3 (1440 x 1080)", + "3:4 (1080 x 1440)", + "21:9 (2560 x 1080)"], + "default": "16:9 (1920 x 1080)", + "tooltip": "Resolution of the output video", + }), + # "length": (IO.COMBO,{"options":['5s','10s'], "default": '5s'}), + "prompt_adherence": model_field_to_node_input(IO.FLOAT,MoonvalleyTextToVideoInferenceParams,"guidance_scale",default=7.0, step=1, min=1, max=20), + "seed": model_field_to_node_input(IO.INT,MoonvalleyTextToVideoInferenceParams, "seed", default=random.randint(0, 2**32 - 1), min=0, max=4294967295, step=1, display="number", tooltip="Random seed value", control_after_generate=True), + "steps": model_field_to_node_input(IO.INT, MoonvalleyTextToVideoInferenceParams, "steps", default=100, min=1, max=100), + }, + "hidden": { + "auth_token": "AUTH_TOKEN_COMFY_ORG", + "comfy_api_key": "API_KEY_COMFY_ORG", + "unique_id": "UNIQUE_ID", + }, + "optional": { + "image": model_field_to_node_input( + IO.IMAGE, + MoonvalleyTextToVideoRequest, + "image_url", + tooltip="The reference image used to generate the video", + ), + } + } + + RETURN_TYPES = ("STRING",) + FUNCTION = "generate" + CATEGORY = "api node/video/Moonvalley Marey" + API_NODE = True + + def generate(self, **kwargs): + return None + +# --- MoonvalleyImg2VideoNode --- +class MoonvalleyImg2VideoNode(BaseMoonvalleyVideoNode): + + @classmethod + def INPUT_TYPES(cls): + return super().INPUT_TYPES() + + RETURN_TYPES = ("VIDEO",) + RETURN_NAMES = ("video",) + 
DESCRIPTION = "Moonvalley Marey Image to Video Node" + + def generate(self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs): + image = kwargs.get("image", None) + if (image is None): + raise MoonvalleyApiError("image is required") + total_frames = get_total_frames_from_length() + + validate_input_image(image,True) + validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) + width_height = self.parseWidthHeightFromRes(kwargs.get("resolution")) + + inference_params=MoonvalleyTextToVideoInferenceParams( + negative_prompt=negative_prompt, + steps=kwargs.get("steps"), + seed=kwargs.get("seed"), + guidance_scale=kwargs.get("prompt_adherence"), + num_frames=total_frames, + width=width_height.get("width"), + height=width_height.get("height"), + use_negative_prompts=True + ) + """Upload image to comfy backend to have a URL available for further processing""" + # Get MIME type from tensor - assuming PNG format for image tensors + mime_type = "image/png" + + image_url = upload_images_to_comfyapi(image, max_images=1, auth_kwargs=kwargs, mime_type=mime_type)[0] + + request = MoonvalleyTextToVideoRequest( + image_url=image_url, + prompt_text=prompt, + inference_params=inference_params + ) + initial_operation = SynchronousOperation( + endpoint=ApiEndpoint(path=API_IMG2VIDEO_ENDPOINT, + method=HttpMethod.POST, + request_model=MoonvalleyTextToVideoRequest, + response_model=MoonvalleyPromptResponse + ), + request=request, + auth_kwargs=kwargs, + ) + task_creation_response = initial_operation.execute() + validate_task_creation_response(task_creation_response) + task_id = task_creation_response.id + + final_response = self.get_response( + task_id, auth_kwargs=kwargs, node_id=unique_id + ) + video = download_url_to_video_output(final_response.output_url) + return (video, ) + +# --- MoonvalleyVid2VidNode --- +class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): + def __init__(self): + super().__init__() + + @classmethod + def INPUT_TYPES(cls): + input_types = super().INPUT_TYPES() + for param in ["resolution", "image"]: + if param in input_types["required"]: + del input_types["required"][param] + if param in input_types["optional"]: + del input_types["optional"][param] + input_types["optional"] = { + "video": (IO.VIDEO, {"default": "", "multiline": False, "tooltip": "The reference video used to generate the output video. Input a 5s video for 128 frames and a 10s video for 256 frames. 
Longer videos will be trimmed automatically."}), + "control_type": ( + ["Motion Transfer", "Pose Transfer"], + {"default": "Motion Transfer"}, + ), + "motion_intensity": ( + "INT", + { + "default": 100, + "step": 1, + "min": 0, + "max": 100, + "tooltip": "Only used if control_type is 'Motion Transfer'", + }, + ) + } + + return input_types + + RETURN_TYPES = ("VIDEO",) + RETURN_NAMES = ("video",) + + def generate(self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs): + video = kwargs.get("video") + num_frames = get_total_frames_from_length() + + if not video : + raise MoonvalleyApiError("video is required") + + + """Validate video input""" + video_url="" + if video: + validated_video = validate_input_video(video, num_frames, False) + video_url = upload_video_to_comfyapi(validated_video, auth_kwargs=kwargs) + + control_type = kwargs.get("control_type") + motion_intensity = kwargs.get("motion_intensity") + + """Validate prompts and inference input""" + validate_prompts(prompt, negative_prompt) + inference_params=MoonvalleyVideoToVideoInferenceParams( + negative_prompt=negative_prompt, + steps=kwargs.get("steps"), + seed=kwargs.get("seed"), + guidance_scale=kwargs.get("prompt_adherence"), + control_params={'motion_intensity': motion_intensity} + ) + + control = self.parseControlParameter(control_type) + + request = MoonvalleyVideoToVideoRequest( + control_type=control, + video_url=video_url, + prompt_text=prompt, + inference_params=inference_params + ) + + initial_operation = SynchronousOperation( + endpoint=ApiEndpoint(path=API_VIDEO2VIDEO_ENDPOINT, + method=HttpMethod.POST, + request_model=MoonvalleyVideoToVideoRequest, + response_model=MoonvalleyPromptResponse + ), + request=request, + auth_kwargs=kwargs, + ) + task_creation_response = initial_operation.execute() + validate_task_creation_response(task_creation_response) + task_id = task_creation_response.id + + final_response = self.get_response( + task_id, auth_kwargs=kwargs, node_id=unique_id + ) + + video = download_url_to_video_output(final_response.output_url) + + return (video, ) + +# --- MoonvalleyTxt2VideoNode --- +class MoonvalleyTxt2VideoNode(BaseMoonvalleyVideoNode): + def __init__(self): + super().__init__() + + RETURN_TYPES = ("VIDEO",) + RETURN_NAMES = ("video",) + + @classmethod + def INPUT_TYPES(cls): + input_types = super().INPUT_TYPES() + # Remove image-specific parameters + for param in ["image"]: + if param in input_types["optional"]: + del input_types["optional"][param] + return input_types + + def generate(self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs): + validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) + width_height = self.parseWidthHeightFromRes(kwargs.get("resolution")) + num_frames = get_total_frames_from_length() + + inference_params=MoonvalleyTextToVideoInferenceParams( + negative_prompt=negative_prompt, + steps=kwargs.get("steps"), + seed=kwargs.get("seed"), + guidance_scale=kwargs.get("prompt_adherence"), + num_frames=num_frames, + width=width_height.get("width"), + height=width_height.get("height"), + ) + request = MoonvalleyTextToVideoRequest( + prompt_text=prompt, + inference_params=inference_params + ) + + initial_operation = SynchronousOperation( + endpoint=ApiEndpoint(path=API_TXT2VIDEO_ENDPOINT, + method=HttpMethod.POST, + request_model=MoonvalleyTextToVideoRequest, + response_model=MoonvalleyPromptResponse + ), + request=request, + auth_kwargs=kwargs, + ) + task_creation_response = initial_operation.execute() + 
validate_task_creation_response(task_creation_response) + task_id = task_creation_response.id + + final_response = self.get_response( + task_id, auth_kwargs=kwargs, node_id=unique_id + ) + + video = download_url_to_video_output(final_response.output_url) + return (video, ) + + + +NODE_CLASS_MAPPINGS = { + "MoonvalleyImg2VideoNode": MoonvalleyImg2VideoNode, + "MoonvalleyTxt2VideoNode": MoonvalleyTxt2VideoNode, + # "MoonvalleyVideo2VideoNode": MoonvalleyVideo2VideoNode, +} + + +NODE_DISPLAY_NAME_MAPPINGS = { + "MoonvalleyImg2VideoNode": "Moonvalley Marey Image to Video", + "MoonvalleyTxt2VideoNode": "Moonvalley Marey Text to Video", + # "MoonvalleyVideo2VideoNode": "Moonvalley Marey Video to Video", +} + +def get_total_frames_from_length(length="5s"): + # if length == '5s': + # return 128 + # elif length == '10s': + # return 256 + return 128 + # else: + # raise MoonvalleyApiError("length is required") diff --git a/nodes.py b/nodes.py index 1b465b9e6..231d4d4de 100644 --- a/nodes.py +++ b/nodes.py @@ -2310,6 +2310,7 @@ def init_builtin_api_nodes(): "nodes_pika.py", "nodes_runway.py", "nodes_tripo.py", + "nodes_moonvalley.py", "nodes_rodin.py", "nodes_gemini.py", ] From c5de4955bb91a2b136027a698aaecb8d19e3d892 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Tue, 8 Jul 2025 08:56:38 -0400 Subject: [PATCH 009/243] ComfyUI version 0.3.44 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index c98c90499..7981fbaca 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.43" +__version__ = "0.3.44" diff --git a/pyproject.toml b/pyproject.toml index 9d0f90032..96ead2157 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.43" +version = "0.3.44" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From 974254218ab873e8b9642b6a467d56842cd228c4 Mon Sep 17 00:00:00 2001 From: josephrocca <1167575+josephrocca@users.noreply.github.com> Date: Wed, 9 Jul 2025 03:56:59 +0800 Subject: [PATCH 010/243] Un-hardcode chroma patch_size (#8840) --- comfy/ldm/chroma/model.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/comfy/ldm/chroma/model.py b/comfy/ldm/chroma/model.py index c75023a31..06021d4f2 100644 --- a/comfy/ldm/chroma/model.py +++ b/comfy/ldm/chroma/model.py @@ -254,13 +254,12 @@ class Chroma(nn.Module): def forward(self, x, timestep, context, guidance, control=None, transformer_options={}, **kwargs): bs, c, h, w = x.shape - patch_size = 2 - x = comfy.ldm.common_dit.pad_to_patch_size(x, (patch_size, patch_size)) + x = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size)) - img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size) + img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=self.patch_size, pw=self.patch_size) - h_len = ((h + (patch_size // 2)) // patch_size) - w_len = ((w + (patch_size // 2)) // patch_size) + h_len = ((h + (self.patch_size // 2)) // self.patch_size) + w_len = ((w + (self.patch_size // 2)) // self.patch_size) img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype) img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1) img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(0, 
w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) @@ -268,4 +267,4 @@ class Chroma(nn.Module): txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype) out = self.forward_orig(img, img_ids, context, txt_ids, timestep, guidance, control, transformer_options, attn_mask=kwargs.get("attention_mask", None)) - return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)[:,:,:h,:w] + return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=self.patch_size, pw=self.patch_size)[:,:,:h,:w] From aac10ad23a8f65243fcfca7afa7a72c1740312d6 Mon Sep 17 00:00:00 2001 From: chaObserv <154517000+chaObserv@users.noreply.github.com> Date: Wed, 9 Jul 2025 04:17:06 +0800 Subject: [PATCH 011/243] Add SA-Solver sampler (#8834) --- comfy/k_diffusion/sa_solver.py | 121 +++++++++++++++++++++++++++ comfy/k_diffusion/sampling.py | 111 ++++++++++++++++++++++++ comfy/samplers.py | 2 +- comfy_extras/nodes_custom_sampler.py | 45 ++++++++++ 4 files changed, 278 insertions(+), 1 deletion(-) create mode 100644 comfy/k_diffusion/sa_solver.py diff --git a/comfy/k_diffusion/sa_solver.py b/comfy/k_diffusion/sa_solver.py new file mode 100644 index 000000000..0c6821b60 --- /dev/null +++ b/comfy/k_diffusion/sa_solver.py @@ -0,0 +1,121 @@ +# SA-Solver: Stochastic Adams Solver (NeurIPS 2023, arXiv:2309.05019) +# Conference: https://proceedings.neurips.cc/paper_files/paper/2023/file/f4a6806490d31216a3ba667eb240c897-Paper-Conference.pdf +# Codebase ref: https://github.com/scxue/SA-Solver + +import math +from typing import Union, Callable +import torch + + +def compute_exponential_coeffs(s: torch.Tensor, t: torch.Tensor, solver_order: int, tau_t: float) -> torch.Tensor: + """Compute (1 + tau^2) * integral of exp((1 + tau^2) * x) * x^p dx from s to t with exp((1 + tau^2) * t) factored out, using integration by parts. + + Integral of exp((1 + tau^2) * x) * x^p dx + = product_terms[p] - (p / (1 + tau^2)) * integral of exp((1 + tau^2) * x) * x^(p-1) dx, + with base case p=0 where integral equals product_terms[0]. + + where + product_terms[p] = x^p * exp((1 + tau^2) * x) / (1 + tau^2). + + Construct a recursive coefficient matrix following the above recursive relation to compute all integral terms up to p = (solver_order - 1). + Return coefficients used by the SA-Solver in data prediction mode. + + Args: + s: Start time s. + t: End time t. + solver_order: Current order of the solver. + tau_t: Stochastic strength parameter in the SDE. + + Returns: + Exponential coefficients used in data prediction, with exp((1 + tau^2) * t) factored out, ordered from p=0 to p=solver_order−1, shape (solver_order,). 
+ """ + tau_mul = 1 + tau_t ** 2 + h = t - s + p = torch.arange(solver_order, dtype=s.dtype, device=s.device) + + # product_terms after factoring out exp((1 + tau^2) * t) + # Includes (1 + tau^2) factor from outside the integral + product_terms_factored = (t ** p - s ** p * (-tau_mul * h).exp()) + + # Lower triangular recursive coefficient matrix + # Accumulates recursive coefficients based on p / (1 + tau^2) + recursive_depth_mat = p.unsqueeze(1) - p.unsqueeze(0) + log_factorial = (p + 1).lgamma() + recursive_coeff_mat = log_factorial.unsqueeze(1) - log_factorial.unsqueeze(0) + if tau_t > 0: + recursive_coeff_mat = recursive_coeff_mat - (recursive_depth_mat * math.log(tau_mul)) + signs = torch.where(recursive_depth_mat % 2 == 0, 1.0, -1.0) + recursive_coeff_mat = (recursive_coeff_mat.exp() * signs).tril() + + return recursive_coeff_mat @ product_terms_factored + + +def compute_simple_stochastic_adams_b_coeffs(sigma_next: torch.Tensor, curr_lambdas: torch.Tensor, lambda_s: torch.Tensor, lambda_t: torch.Tensor, tau_t: float, is_corrector_step: bool = False) -> torch.Tensor: + """Compute simple order-2 b coefficients from SA-Solver paper (Appendix D. Implementation Details).""" + tau_mul = 1 + tau_t ** 2 + h = lambda_t - lambda_s + alpha_t = sigma_next * lambda_t.exp() + if is_corrector_step: + # Simplified 1-step (order-2) corrector + b_1 = alpha_t * (0.5 * tau_mul * h) + b_2 = alpha_t * (-h * tau_mul).expm1().neg() - b_1 + else: + # Simplified 2-step predictor + b_2 = alpha_t * (0.5 * tau_mul * h ** 2) / (curr_lambdas[-2] - lambda_s) + b_1 = alpha_t * (-h * tau_mul).expm1().neg() - b_2 + return torch.stack([b_2, b_1]) + + +def compute_stochastic_adams_b_coeffs(sigma_next: torch.Tensor, curr_lambdas: torch.Tensor, lambda_s: torch.Tensor, lambda_t: torch.Tensor, tau_t: float, simple_order_2: bool = False, is_corrector_step: bool = False) -> torch.Tensor: + """Compute b_i coefficients for the SA-Solver (see eqs. 15 and 18). + + The solver order corresponds to the number of input lambdas (half-logSNR points). + + Args: + sigma_next: Sigma at end time t. + curr_lambdas: Lambda time points used to construct the Lagrange basis, shape (N,). + lambda_s: Lambda at start time s. + lambda_t: Lambda at end time t. + tau_t: Stochastic strength parameter in the SDE. + simple_order_2: Whether to enable the simple order-2 scheme. + is_corrector_step: Flag for corrector step in simple order-2 mode. + + Returns: + b_i coefficients for the SA-Solver, shape (N,), where N is the solver order. + """ + num_timesteps = curr_lambdas.shape[0] + + if simple_order_2 and num_timesteps == 2: + return compute_simple_stochastic_adams_b_coeffs(sigma_next, curr_lambdas, lambda_s, lambda_t, tau_t, is_corrector_step) + + # Compute coefficients by solving a linear system from Lagrange basis interpolation + exp_integral_coeffs = compute_exponential_coeffs(lambda_s, lambda_t, num_timesteps, tau_t) + vandermonde_matrix_T = torch.vander(curr_lambdas, num_timesteps, increasing=True).T + lagrange_integrals = torch.linalg.solve(vandermonde_matrix_T, exp_integral_coeffs) + + # (sigma_t * exp(-tau^2 * lambda_t)) * exp((1 + tau^2) * lambda_t) + # = sigma_t * exp(lambda_t) = alpha_t + # exp((1 + tau^2) * lambda_t) is extracted from the integral + alpha_t = sigma_next * lambda_t.exp() + return alpha_t * lagrange_integrals + + +def get_tau_interval_func(start_sigma: float, end_sigma: float, eta: float = 1.0) -> Callable[[Union[torch.Tensor, float]], float]: + """Return a function that controls the stochasticity of SA-Solver. 
+ + When eta = 0, SA-Solver runs as ODE. The official approach uses + time t to determine the SDE interval, while here we use sigma instead. + + See: + https://github.com/scxue/SA-Solver/blob/main/README.md + """ + + def tau_func(sigma: Union[torch.Tensor, float]) -> float: + if eta <= 0: + return 0.0 # ODE + + if isinstance(sigma, torch.Tensor): + sigma = sigma.item() + return eta if start_sigma >= sigma >= end_sigma else 0.0 + + return tau_func diff --git a/comfy/k_diffusion/sampling.py b/comfy/k_diffusion/sampling.py index 34218337a..2ed415b1f 100644 --- a/comfy/k_diffusion/sampling.py +++ b/comfy/k_diffusion/sampling.py @@ -9,6 +9,7 @@ from tqdm.auto import trange, tqdm from . import utils from . import deis +from . import sa_solver import comfy.model_patcher import comfy.model_sampling @@ -1648,3 +1649,113 @@ def sample_seeds_3(model, x, sigmas, extra_args=None, callback=None, disable=Non if inject_noise: x = x + sigmas[i + 1] * (noise_coeff_3 * noise_1 + noise_coeff_2 * noise_2 + noise_coeff_1 * noise_3) * s_noise return x + + +@torch.no_grad() +def sample_sa_solver(model, x, sigmas, extra_args=None, callback=None, disable=False, tau_func=None, s_noise=1.0, noise_sampler=None, predictor_order=3, corrector_order=4, use_pece=False, simple_order_2=False): + """Stochastic Adams Solver with predictor-corrector method (NeurIPS 2023).""" + if len(sigmas) <= 1: + return x + extra_args = {} if extra_args is None else extra_args + seed = extra_args.get("seed", None) + noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler + s_in = x.new_ones([x.shape[0]]) + + model_sampling = model.inner_model.model_patcher.get_model_object("model_sampling") + sigmas = offset_first_sigma_for_snr(sigmas, model_sampling) + lambdas = sigma_to_half_log_snr(sigmas, model_sampling=model_sampling) + + if tau_func is None: + # Use default interval for stochastic sampling + start_sigma = model_sampling.percent_to_sigma(0.2) + end_sigma = model_sampling.percent_to_sigma(0.8) + tau_func = sa_solver.get_tau_interval_func(start_sigma, end_sigma, eta=1.0) + + max_used_order = max(predictor_order, corrector_order) + x_pred = x # x: current state, x_pred: predicted next state + + h = 0.0 + tau_t = 0.0 + noise = 0.0 + pred_list = [] + + # Lower order near the end to improve stability + lower_order_to_end = sigmas[-1].item() == 0 + + for i in trange(len(sigmas) - 1, disable=disable): + # Evaluation + denoised = model(x_pred, sigmas[i] * s_in, **extra_args) + if callback is not None: + callback({"x": x_pred, "i": i, "sigma": sigmas[i], "sigma_hat": sigmas[i], "denoised": denoised}) + pred_list.append(denoised) + pred_list = pred_list[-max_used_order:] + + predictor_order_used = min(predictor_order, len(pred_list)) + if i == 0 or (sigmas[i + 1] == 0 and not use_pece): + corrector_order_used = 0 + else: + corrector_order_used = min(corrector_order, len(pred_list)) + + if lower_order_to_end: + predictor_order_used = min(predictor_order_used, len(sigmas) - 2 - i) + corrector_order_used = min(corrector_order_used, len(sigmas) - 1 - i) + + # Corrector + if corrector_order_used == 0: + # Update by the predicted state + x = x_pred + else: + curr_lambdas = lambdas[i - corrector_order_used + 1:i + 1] + b_coeffs = sa_solver.compute_stochastic_adams_b_coeffs( + sigmas[i], + curr_lambdas, + lambdas[i - 1], + lambdas[i], + tau_t, + simple_order_2, + is_corrector_step=True, + ) + pred_mat = torch.stack(pred_list[-corrector_order_used:], dim=1) # (B, K, ...) 
+ corr_res = torch.tensordot(pred_mat, b_coeffs, dims=([1], [0])) # (B, ...) + x = sigmas[i] / sigmas[i - 1] * (-(tau_t ** 2) * h).exp() * x + corr_res + + if tau_t > 0 and s_noise > 0: + # The noise from the previous predictor step + x = x + noise + + if use_pece: + # Evaluate the corrected state + denoised = model(x, sigmas[i] * s_in, **extra_args) + pred_list[-1] = denoised + + # Predictor + if sigmas[i + 1] == 0: + # Denoising step + x = denoised + else: + tau_t = tau_func(sigmas[i + 1]) + curr_lambdas = lambdas[i - predictor_order_used + 1:i + 1] + b_coeffs = sa_solver.compute_stochastic_adams_b_coeffs( + sigmas[i + 1], + curr_lambdas, + lambdas[i], + lambdas[i + 1], + tau_t, + simple_order_2, + is_corrector_step=False, + ) + pred_mat = torch.stack(pred_list[-predictor_order_used:], dim=1) # (B, K, ...) + pred_res = torch.tensordot(pred_mat, b_coeffs, dims=([1], [0])) # (B, ...) + h = lambdas[i + 1] - lambdas[i] + x_pred = sigmas[i + 1] / sigmas[i] * (-(tau_t ** 2) * h).exp() * x + pred_res + + if tau_t > 0 and s_noise > 0: + noise = noise_sampler(sigmas[i], sigmas[i + 1]) * sigmas[i + 1] * (-2 * tau_t ** 2 * h).expm1().neg().sqrt() * s_noise + x_pred = x_pred + noise + return x + + +@torch.no_grad() +def sample_sa_solver_pece(model, x, sigmas, extra_args=None, callback=None, disable=False, tau_func=None, s_noise=1.0, noise_sampler=None, predictor_order=3, corrector_order=4, simple_order_2=False): + """Stochastic Adams Solver with PECE (Predict–Evaluate–Correct–Evaluate) mode (NeurIPS 2023).""" + return sample_sa_solver(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, tau_func=tau_func, s_noise=s_noise, noise_sampler=noise_sampler, predictor_order=predictor_order, corrector_order=corrector_order, use_pece=True, simple_order_2=simple_order_2) diff --git a/comfy/samplers.py b/comfy/samplers.py index 25ccaf39f..e93d2a315 100644 --- a/comfy/samplers.py +++ b/comfy/samplers.py @@ -720,7 +720,7 @@ KSAMPLER_NAMES = ["euler", "euler_cfg_pp", "euler_ancestral", "euler_ancestral_c "lms", "dpm_fast", "dpm_adaptive", "dpmpp_2s_ancestral", "dpmpp_2s_ancestral_cfg_pp", "dpmpp_sde", "dpmpp_sde_gpu", "dpmpp_2m", "dpmpp_2m_cfg_pp", "dpmpp_2m_sde", "dpmpp_2m_sde_gpu", "dpmpp_3m_sde", "dpmpp_3m_sde_gpu", "ddpm", "lcm", "ipndm", "ipndm_v", "deis", "res_multistep", "res_multistep_cfg_pp", "res_multistep_ancestral", "res_multistep_ancestral_cfg_pp", - "gradient_estimation", "gradient_estimation_cfg_pp", "er_sde", "seeds_2", "seeds_3"] + "gradient_estimation", "gradient_estimation_cfg_pp", "er_sde", "seeds_2", "seeds_3", "sa_solver", "sa_solver_pece"] class KSAMPLER(Sampler): def __init__(self, sampler_function, extra_options={}, inpaint_options={}): diff --git a/comfy_extras/nodes_custom_sampler.py b/comfy_extras/nodes_custom_sampler.py index b3a772714..33bc41842 100644 --- a/comfy_extras/nodes_custom_sampler.py +++ b/comfy_extras/nodes_custom_sampler.py @@ -2,6 +2,7 @@ import math import comfy.samplers import comfy.sample from comfy.k_diffusion import sampling as k_diffusion_sampling +from comfy.k_diffusion import sa_solver from comfy.comfy_types import IO, ComfyNodeABC, InputTypeDict import latent_preview import torch @@ -521,6 +522,49 @@ class SamplerER_SDE(ComfyNodeABC): return (sampler,) +class SamplerSASolver(ComfyNodeABC): + @classmethod + def INPUT_TYPES(cls) -> InputTypeDict: + return { + "required": { + "model": (IO.MODEL, {}), + "eta": (IO.FLOAT, {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01, "round": False},), + "sde_start_percent": (IO.FLOAT, {"default": 
0.2, "min": 0.0, "max": 1.0, "step": 0.001},), + "sde_end_percent": (IO.FLOAT, {"default": 0.8, "min": 0.0, "max": 1.0, "step": 0.001},), + "s_noise": (IO.FLOAT, {"default": 1.0, "min": 0.0, "max": 100.0, "step": 0.01, "round": False},), + "predictor_order": (IO.INT, {"default": 3, "min": 1, "max": 6}), + "corrector_order": (IO.INT, {"default": 4, "min": 0, "max": 6}), + "use_pece": (IO.BOOLEAN, {}), + "simple_order_2": (IO.BOOLEAN, {}), + } + } + + RETURN_TYPES = (IO.SAMPLER,) + CATEGORY = "sampling/custom_sampling/samplers" + + FUNCTION = "get_sampler" + + def get_sampler(self, model, eta, sde_start_percent, sde_end_percent, s_noise, predictor_order, corrector_order, use_pece, simple_order_2): + model_sampling = model.get_model_object("model_sampling") + start_sigma = model_sampling.percent_to_sigma(sde_start_percent) + end_sigma = model_sampling.percent_to_sigma(sde_end_percent) + tau_func = sa_solver.get_tau_interval_func(start_sigma, end_sigma, eta=eta) + + sampler_name = "sa_solver" + sampler = comfy.samplers.ksampler( + sampler_name, + { + "tau_func": tau_func, + "s_noise": s_noise, + "predictor_order": predictor_order, + "corrector_order": corrector_order, + "use_pece": use_pece, + "simple_order_2": simple_order_2, + }, + ) + return (sampler,) + + class Noise_EmptyNoise: def __init__(self): self.seed = 0 @@ -829,6 +873,7 @@ NODE_CLASS_MAPPINGS = { "SamplerDPMPP_2S_Ancestral": SamplerDPMPP_2S_Ancestral, "SamplerDPMAdaptative": SamplerDPMAdaptative, "SamplerER_SDE": SamplerER_SDE, + "SamplerSASolver": SamplerSASolver, "SplitSigmas": SplitSigmas, "SplitSigmasDenoise": SplitSigmasDenoise, "FlipSigmas": FlipSigmas, From 181a9bf26d4445e160645f6c81dc2ee29e7b6a08 Mon Sep 17 00:00:00 2001 From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com> Date: Wed, 9 Jul 2025 08:18:04 +0800 Subject: [PATCH 012/243] Support Multi Image-Caption dataset in lora training node (#8819) * initial impl of multi img/text dataset * Update nodes_train.py * Support Kohya-ss structure --- comfy_extras/nodes_train.py | 125 +++++++++++++++++++++++++++++++++--- 1 file changed, 115 insertions(+), 10 deletions(-) diff --git a/comfy_extras/nodes_train.py b/comfy_extras/nodes_train.py index fbff01010..17caf5ad5 100644 --- a/comfy_extras/nodes_train.py +++ b/comfy_extras/nodes_train.py @@ -75,7 +75,7 @@ class BiasDiff(torch.nn.Module): return self.passive_memory_usage() -def load_and_process_images(image_files, input_dir, resize_method="None"): +def load_and_process_images(image_files, input_dir, resize_method="None", w=None, h=None): """Utility function to load and process a list of images. Args: @@ -90,7 +90,6 @@ def load_and_process_images(image_files, input_dir, resize_method="None"): raise ValueError("No valid images found in input") output_images = [] - w, h = None, None for file in image_files: image_path = os.path.join(input_dir, file) @@ -206,6 +205,103 @@ class LoadImageSetFromFolderNode: return (output_tensor,) +class LoadImageTextSetFromFolderNode: + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "folder": (folder_paths.get_input_subfolders(), {"tooltip": "The folder to load images from."}), + "clip": (IO.CLIP, {"tooltip": "The CLIP model used for encoding the text."}), + }, + "optional": { + "resize_method": ( + ["None", "Stretch", "Crop", "Pad"], + {"default": "None"}, + ), + "width": ( + IO.INT, + { + "default": -1, + "min": -1, + "max": 10000, + "step": 1, + "tooltip": "The width to resize the images to. 
-1 means use the original width.", + }, + ), + "height": ( + IO.INT, + { + "default": -1, + "min": -1, + "max": 10000, + "step": 1, + "tooltip": "The height to resize the images to. -1 means use the original height.", + }, + ) + }, + } + + RETURN_TYPES = ("IMAGE", IO.CONDITIONING,) + FUNCTION = "load_images" + CATEGORY = "loaders" + EXPERIMENTAL = True + DESCRIPTION = "Loads a batch of images and caption from a directory for training." + + def load_images(self, folder, clip, resize_method, width=None, height=None): + if clip is None: + raise RuntimeError("ERROR: clip input is invalid: None\n\nIf the clip is from a checkpoint loader node your checkpoint does not contain a valid clip or text encoder model.") + + logging.info(f"Loading images from folder: {folder}") + + sub_input_dir = os.path.join(folder_paths.get_input_directory(), folder) + valid_extensions = [".png", ".jpg", ".jpeg", ".webp"] + + image_files = [] + for item in os.listdir(sub_input_dir): + path = os.path.join(sub_input_dir, item) + if any(item.lower().endswith(ext) for ext in valid_extensions): + image_files.append(path) + elif os.path.isdir(path): + # Support kohya-ss/sd-scripts folder structure + repeat = 1 + if item.split("_")[0].isdigit(): + repeat = int(item.split("_")[0]) + image_files.extend([ + os.path.join(path, f) for f in os.listdir(path) if any(f.lower().endswith(ext) for ext in valid_extensions) + ] * repeat) + + caption_file_path = [ + f.replace(os.path.splitext(f)[1], ".txt") + for f in image_files + ] + captions = [] + for caption_file in caption_file_path: + caption_path = os.path.join(sub_input_dir, caption_file) + if os.path.exists(caption_path): + with open(caption_path, "r", encoding="utf-8") as f: + caption = f.read().strip() + captions.append(caption) + else: + captions.append("") + + width = width if width != -1 else None + height = height if height != -1 else None + output_tensor = load_and_process_images(image_files, sub_input_dir, resize_method, width, height) + + logging.info(f"Loaded {len(output_tensor)} images from {sub_input_dir}.") + + logging.info(f"Encoding captions from {sub_input_dir}.") + conditions = [] + empty_cond = clip.encode_from_tokens_scheduled(clip.tokenize("")) + for text in captions: + if text == "": + conditions.append(empty_cond) + tokens = clip.tokenize(text) + conditions.extend(clip.encode_from_tokens_scheduled(tokens)) + logging.info(f"Encoded {len(conditions)} captions from {sub_input_dir}.") + return (output_tensor, conditions) + + def draw_loss_graph(loss_map, steps): width, height = 500, 300 img = Image.new("RGB", (width, height), "white") @@ -381,6 +477,13 @@ class TrainLoraNode: latents = latents["samples"].to(dtype) num_images = latents.shape[0] + logging.info(f"Total Images: {num_images}, Total Captions: {len(positive)}") + if len(positive) == 1 and num_images > 1: + positive = positive * num_images + elif len(positive) != num_images: + raise ValueError( + f"Number of positive conditions ({len(positive)}) does not match number of images ({num_images})." 
+ ) with torch.inference_mode(False): lora_sd = {} @@ -474,6 +577,7 @@ class TrainLoraNode: # setup models for m in find_all_highest_child_module_with_forward(mp.model.diffusion_model): patch(m) + mp.model.requires_grad_(False) comfy.model_management.load_models_gpu([mp], memory_required=1e20, force_full_load=True) # Setup sampler and guider like in test script @@ -486,7 +590,6 @@ class TrainLoraNode: ) guider = comfy_extras.nodes_custom_sampler.Guider_Basic(mp) guider.set_conds(positive) # Set conditioning from input - ss = comfy_extras.nodes_custom_sampler.SamplerCustomAdvanced() # yoland: this currently resize to the first image in the dataset @@ -495,21 +598,21 @@ class TrainLoraNode: try: for step in (pbar:=tqdm.trange(steps, desc="Training LoRA", smoothing=0.01, disable=not comfy.utils.PROGRESS_BAR_ENABLED)): # Generate random sigma - sigma = mp.model.model_sampling.percent_to_sigma( + sigmas = [mp.model.model_sampling.percent_to_sigma( torch.rand((1,)).item() - ) - sigma = torch.tensor([sigma]) + ) for _ in range(min(batch_size, num_images))] + sigmas = torch.tensor(sigmas) noise = comfy_extras.nodes_custom_sampler.Noise_RandomNoise(step * 1000 + seed) indices = torch.randperm(num_images)[:batch_size] - ss.sample( - noise, guider, train_sampler, sigma, {"samples": latents[indices].clone()} - ) + batch_latent = latents[indices].clone() + guider.set_conds([positive[i] for i in indices]) # Set conditioning from input + guider.sample(noise.generate_noise({"samples": batch_latent}), batch_latent, train_sampler, sigmas, seed=noise.seed) finally: for m in mp.model.modules(): unpatch(m) - del ss, train_sampler, optimizer + del train_sampler, optimizer torch.cuda.empty_cache() for adapter in all_weight_adapters: @@ -697,6 +800,7 @@ NODE_CLASS_MAPPINGS = { "SaveLoRANode": SaveLoRA, "LoraModelLoader": LoraModelLoader, "LoadImageSetFromFolderNode": LoadImageSetFromFolderNode, + "LoadImageTextSetFromFolderNode": LoadImageTextSetFromFolderNode, "LossGraphNode": LossGraphNode, } @@ -705,5 +809,6 @@ NODE_DISPLAY_NAME_MAPPINGS = { "SaveLoRANode": "Save LoRA Weights", "LoraModelLoader": "Load LoRA Model", "LoadImageSetFromFolderNode": "Load Image Dataset from Folder", + "LoadImageTextSetFromFolderNode": "Load Image and Text Dataset from Folder", "LossGraphNode": "Plot Loss Graph", } From 5612670ee48ce500aab98e362b3372ab06d1d659 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 9 Jul 2025 00:45:48 -0700 Subject: [PATCH 013/243] Remove unmaintained notebook. (#8845) --- README.md | 4 - notebooks/comfyui_colab.ipynb | 322 ---------------------------------- 2 files changed, 326 deletions(-) delete mode 100644 notebooks/comfyui_colab.ipynb diff --git a/README.md b/README.md index ba8892b17..0e021a687 100644 --- a/README.md +++ b/README.md @@ -178,10 +178,6 @@ If you have trouble extracting it, right click the file -> properties -> unblock See the [Config file](extra_model_paths.yaml.example) to set the search paths for models. In the standalone windows build you can find this file in the ComfyUI directory. Rename this file to extra_model_paths.yaml and edit it with your favorite text editor. 
-## Jupyter Notebook - -To run it on services like paperspace, kaggle or colab you can use my [Jupyter Notebook](notebooks/comfyui_colab.ipynb) - ## [comfy-cli](https://docs.comfy.org/comfy-cli/getting-started) diff --git a/notebooks/comfyui_colab.ipynb b/notebooks/comfyui_colab.ipynb deleted file mode 100644 index 5560b5ff9..000000000 --- a/notebooks/comfyui_colab.ipynb +++ /dev/null @@ -1,322 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "aaaaaaaaaa" - }, - "source": [ - "Git clone the repo and install the requirements. (ignore the pip errors about protobuf)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "bbbbbbbbbb" - }, - "outputs": [], - "source": [ - "#@title Environment Setup\n", - "\n", - "\n", - "OPTIONS = {}\n", - "\n", - "USE_GOOGLE_DRIVE = False #@param {type:\"boolean\"}\n", - "UPDATE_COMFY_UI = True #@param {type:\"boolean\"}\n", - "WORKSPACE = 'ComfyUI'\n", - "OPTIONS['USE_GOOGLE_DRIVE'] = USE_GOOGLE_DRIVE\n", - "OPTIONS['UPDATE_COMFY_UI'] = UPDATE_COMFY_UI\n", - "\n", - "if OPTIONS['USE_GOOGLE_DRIVE']:\n", - " !echo \"Mounting Google Drive...\"\n", - " %cd /\n", - " \n", - " from google.colab import drive\n", - " drive.mount('/content/drive')\n", - "\n", - " WORKSPACE = \"/content/drive/MyDrive/ComfyUI\"\n", - " %cd /content/drive/MyDrive\n", - "\n", - "![ ! -d $WORKSPACE ] && echo -= Initial setup ComfyUI =- && git clone https://github.com/comfyanonymous/ComfyUI\n", - "%cd $WORKSPACE\n", - "\n", - "if OPTIONS['UPDATE_COMFY_UI']:\n", - " !echo -= Updating ComfyUI =-\n", - " !git pull\n", - "\n", - "!echo -= Install dependencies =-\n", - "!pip install xformers!=0.0.18 -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu121 --extra-index-url https://download.pytorch.org/whl/cu118 --extra-index-url https://download.pytorch.org/whl/cu117" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cccccccccc" - }, - "source": [ - "Download some models/checkpoints/vae or custom comfyui nodes (uncomment the commands for the ones you want)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dddddddddd" - }, - "outputs": [], - "source": [ - "# Checkpoints\n", - "\n", - "### SDXL\n", - "### I recommend these workflow examples: https://comfyanonymous.github.io/ComfyUI_examples/sdxl/\n", - "\n", - "#!wget -c https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors -P ./models/checkpoints/\n", - "#!wget -c https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0/resolve/main/sd_xl_refiner_1.0.safetensors -P ./models/checkpoints/\n", - "\n", - "# SDXL ReVision\n", - "#!wget -c https://huggingface.co/comfyanonymous/clip_vision_g/resolve/main/clip_vision_g.safetensors -P ./models/clip_vision/\n", - "\n", - "# SD1.5\n", - "!wget -c https://huggingface.co/Comfy-Org/stable-diffusion-v1-5-archive/resolve/main/v1-5-pruned-emaonly-fp16.safetensors -P ./models/checkpoints/\n", - "\n", - "# SD2\n", - "#!wget -c https://huggingface.co/stabilityai/stable-diffusion-2-1-base/resolve/main/v2-1_512-ema-pruned.safetensors -P ./models/checkpoints/\n", - "#!wget -c https://huggingface.co/stabilityai/stable-diffusion-2-1/resolve/main/v2-1_768-ema-pruned.safetensors -P ./models/checkpoints/\n", - "\n", - "# Some SD1.5 anime style\n", - "#!wget -c https://huggingface.co/WarriorMama777/OrangeMixs/resolve/main/Models/AbyssOrangeMix2/AbyssOrangeMix2_hard.safetensors -P ./models/checkpoints/\n", - "#!wget -c 
https://huggingface.co/WarriorMama777/OrangeMixs/resolve/main/Models/AbyssOrangeMix3/AOM3A1_orangemixs.safetensors -P ./models/checkpoints/\n", - "#!wget -c https://huggingface.co/WarriorMama777/OrangeMixs/resolve/main/Models/AbyssOrangeMix3/AOM3A3_orangemixs.safetensors -P ./models/checkpoints/\n", - "#!wget -c https://huggingface.co/Linaqruf/anything-v3.0/resolve/main/anything-v3-fp16-pruned.safetensors -P ./models/checkpoints/\n", - "\n", - "# Waifu Diffusion 1.5 (anime style SD2.x 768-v)\n", - "#!wget -c https://huggingface.co/waifu-diffusion/wd-1-5-beta3/resolve/main/wd-illusion-fp16.safetensors -P ./models/checkpoints/\n", - "\n", - "\n", - "# unCLIP models\n", - "#!wget -c https://huggingface.co/comfyanonymous/illuminatiDiffusionV1_v11_unCLIP/resolve/main/illuminatiDiffusionV1_v11-unclip-h-fp16.safetensors -P ./models/checkpoints/\n", - "#!wget -c https://huggingface.co/comfyanonymous/wd-1.5-beta2_unCLIP/resolve/main/wd-1-5-beta2-aesthetic-unclip-h-fp16.safetensors -P ./models/checkpoints/\n", - "\n", - "\n", - "# VAE\n", - "!wget -c https://huggingface.co/stabilityai/sd-vae-ft-mse-original/resolve/main/vae-ft-mse-840000-ema-pruned.safetensors -P ./models/vae/\n", - "#!wget -c https://huggingface.co/WarriorMama777/OrangeMixs/resolve/main/VAEs/orangemix.vae.pt -P ./models/vae/\n", - "#!wget -c https://huggingface.co/hakurei/waifu-diffusion-v1-4/resolve/main/vae/kl-f8-anime2.ckpt -P ./models/vae/\n", - "\n", - "\n", - "# Loras\n", - "#!wget -c https://civitai.com/api/download/models/10350 -O ./models/loras/theovercomer8sContrastFix_sd21768.safetensors #theovercomer8sContrastFix SD2.x 768-v\n", - "#!wget -c https://civitai.com/api/download/models/10638 -O ./models/loras/theovercomer8sContrastFix_sd15.safetensors #theovercomer8sContrastFix SD1.x\n", - "#!wget -c https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_offset_example-lora_1.0.safetensors -P ./models/loras/ #SDXL offset noise lora\n", - "\n", - "\n", - "# T2I-Adapter\n", - "#!wget -c https://huggingface.co/TencentARC/T2I-Adapter/resolve/main/models/t2iadapter_depth_sd14v1.pth -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/TencentARC/T2I-Adapter/resolve/main/models/t2iadapter_seg_sd14v1.pth -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/TencentARC/T2I-Adapter/resolve/main/models/t2iadapter_sketch_sd14v1.pth -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/TencentARC/T2I-Adapter/resolve/main/models/t2iadapter_keypose_sd14v1.pth -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/TencentARC/T2I-Adapter/resolve/main/models/t2iadapter_openpose_sd14v1.pth -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/TencentARC/T2I-Adapter/resolve/main/models/t2iadapter_color_sd14v1.pth -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/TencentARC/T2I-Adapter/resolve/main/models/t2iadapter_canny_sd14v1.pth -P ./models/controlnet/\n", - "\n", - "# T2I Styles Model\n", - "#!wget -c https://huggingface.co/TencentARC/T2I-Adapter/resolve/main/models/t2iadapter_style_sd14v1.pth -P ./models/style_models/\n", - "\n", - "# CLIPVision model (needed for styles model)\n", - "#!wget -c https://huggingface.co/openai/clip-vit-large-patch14/resolve/main/pytorch_model.bin -O ./models/clip_vision/clip_vit14.bin\n", - "\n", - "\n", - "# ControlNet\n", - "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11e_sd15_ip2p_fp16.safetensors -P ./models/controlnet/\n", - "#!wget -c 
https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11e_sd15_shuffle_fp16.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11p_sd15_canny_fp16.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11f1p_sd15_depth_fp16.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11p_sd15_inpaint_fp16.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11p_sd15_lineart_fp16.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11p_sd15_mlsd_fp16.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11p_sd15_normalbae_fp16.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11p_sd15_openpose_fp16.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11p_sd15_scribble_fp16.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11p_sd15_seg_fp16.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11p_sd15_softedge_fp16.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11p_sd15s2_lineart_anime_fp16.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11u_sd15_tile_fp16.safetensors -P ./models/controlnet/\n", - "\n", - "# ControlNet SDXL\n", - "#!wget -c https://huggingface.co/stabilityai/control-lora/resolve/main/control-LoRAs-rank256/control-lora-canny-rank256.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/stabilityai/control-lora/resolve/main/control-LoRAs-rank256/control-lora-depth-rank256.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/stabilityai/control-lora/resolve/main/control-LoRAs-rank256/control-lora-recolor-rank256.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/stabilityai/control-lora/resolve/main/control-LoRAs-rank256/control-lora-sketch-rank256.safetensors -P ./models/controlnet/\n", - "\n", - "# Controlnet Preprocessor nodes by Fannovel16\n", - "#!cd custom_nodes && git clone https://github.com/Fannovel16/comfy_controlnet_preprocessors; cd comfy_controlnet_preprocessors && python install.py\n", - "\n", - "\n", - "# GLIGEN\n", - "#!wget -c https://huggingface.co/comfyanonymous/GLIGEN_pruned_safetensors/resolve/main/gligen_sd14_textbox_pruned_fp16.safetensors -P ./models/gligen/\n", - "\n", - "\n", - "# ESRGAN upscale model\n", - "#!wget -c https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth -P ./models/upscale_models/\n", - "#!wget -c https://huggingface.co/sberbank-ai/Real-ESRGAN/resolve/main/RealESRGAN_x2.pth -P 
./models/upscale_models/\n", - "#!wget -c https://huggingface.co/sberbank-ai/Real-ESRGAN/resolve/main/RealESRGAN_x4.pth -P ./models/upscale_models/\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kkkkkkkkkkkkkkk" - }, - "source": [ - "### Run ComfyUI with cloudflared (Recommended Way)\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "jjjjjjjjjjjjjj" - }, - "outputs": [], - "source": [ - "!wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64.deb\n", - "!dpkg -i cloudflared-linux-amd64.deb\n", - "\n", - "import subprocess\n", - "import threading\n", - "import time\n", - "import socket\n", - "import urllib.request\n", - "\n", - "def iframe_thread(port):\n", - " while True:\n", - " time.sleep(0.5)\n", - " sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n", - " result = sock.connect_ex(('127.0.0.1', port))\n", - " if result == 0:\n", - " break\n", - " sock.close()\n", - " print(\"\\nComfyUI finished loading, trying to launch cloudflared (if it gets stuck here cloudflared is having issues)\\n\")\n", - "\n", - " p = subprocess.Popen([\"cloudflared\", \"tunnel\", \"--url\", \"http://127.0.0.1:{}\".format(port)], stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n", - " for line in p.stderr:\n", - " l = line.decode()\n", - " if \"trycloudflare.com \" in l:\n", - " print(\"This is the URL to access ComfyUI:\", l[l.find(\"http\"):], end='')\n", - " #print(l, end='')\n", - "\n", - "\n", - "threading.Thread(target=iframe_thread, daemon=True, args=(8188,)).start()\n", - "\n", - "!python main.py --dont-print-server" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kkkkkkkkkkkkkk" - }, - "source": [ - "### Run ComfyUI with localtunnel\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "jjjjjjjjjjjjj" - }, - "outputs": [], - "source": [ - "!npm install -g localtunnel\n", - "\n", - "import threading\n", - "\n", - "def iframe_thread(port):\n", - " while True:\n", - " time.sleep(0.5)\n", - " sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n", - " result = sock.connect_ex(('127.0.0.1', port))\n", - " if result == 0:\n", - " break\n", - " sock.close()\n", - " print(\"\\nComfyUI finished loading, trying to launch localtunnel (if it gets stuck here localtunnel is having issues)\\n\")\n", - "\n", - " print(\"The password/enpoint ip for localtunnel is:\", urllib.request.urlopen('https://ipv4.icanhazip.com').read().decode('utf8').strip(\"\\n\"))\n", - " p = subprocess.Popen([\"lt\", \"--port\", \"{}\".format(port)], stdout=subprocess.PIPE)\n", - " for line in p.stdout:\n", - " print(line.decode(), end='')\n", - "\n", - "\n", - "threading.Thread(target=iframe_thread, daemon=True, args=(8188,)).start()\n", - "\n", - "!python main.py --dont-print-server" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gggggggggg" - }, - "source": [ - "### Run ComfyUI with colab iframe (use only in case the previous way with localtunnel doesn't work)\n", - "\n", - "You should see the ui appear in an iframe. If you get a 403 error, it's your firefox settings or an extension that's messing things up.\n", - "\n", - "If you want to open it in another window use the link.\n", - "\n", - "Note that some UI features like live image previews won't work because the colab iframe blocks websockets." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "hhhhhhhhhh" - }, - "outputs": [], - "source": [ - "import threading\n", - "def iframe_thread(port):\n", - " while True:\n", - " time.sleep(0.5)\n", - " sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n", - " result = sock.connect_ex(('127.0.0.1', port))\n", - " if result == 0:\n", - " break\n", - " sock.close()\n", - " from google.colab import output\n", - " output.serve_kernel_port_as_iframe(port, height=1024)\n", - " print(\"to open it in a window you can open this link here:\")\n", - " output.serve_kernel_port_as_window(port)\n", - "\n", - "threading.Thread(target=iframe_thread, daemon=True, args=(8188,)).start()\n", - "\n", - "!python main.py --dont-print-server" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "provenance": [] - }, - "gpuClass": "standard", - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} From 1205afc708d963d160f38c1d6613a384ddf6c564 Mon Sep 17 00:00:00 2001 From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com> Date: Wed, 9 Jul 2025 23:41:22 +0800 Subject: [PATCH 014/243] Better training loop implementation (#8820) --- comfy_extras/nodes_train.py | 122 +++++++++++++++++++++++------------- 1 file changed, 80 insertions(+), 42 deletions(-) diff --git a/comfy_extras/nodes_train.py b/comfy_extras/nodes_train.py index 17caf5ad5..3d05fdab5 100644 --- a/comfy_extras/nodes_train.py +++ b/comfy_extras/nodes_train.py @@ -23,38 +23,78 @@ from comfy.comfy_types.node_typing import IO from comfy.weight_adapter import adapters +def make_batch_extra_option_dict(d, indicies, full_size=None): + new_dict = {} + for k, v in d.items(): + newv = v + if isinstance(v, dict): + newv = make_batch_extra_option_dict(v, indicies, full_size=full_size) + elif isinstance(v, torch.Tensor): + if full_size is None or v.size(0) == full_size: + newv = v[indicies] + elif isinstance(v, (list, tuple)) and len(v) == full_size: + newv = [v[i] for i in indicies] + new_dict[k] = newv + return new_dict + + class TrainSampler(comfy.samplers.Sampler): - def __init__(self, loss_fn, optimizer, loss_callback=None): + def __init__(self, loss_fn, optimizer, loss_callback=None, batch_size=1, total_steps=1, seed=0, training_dtype=torch.bfloat16): self.loss_fn = loss_fn self.optimizer = optimizer self.loss_callback = loss_callback + self.batch_size = batch_size + self.total_steps = total_steps + self.seed = seed + self.training_dtype = training_dtype def sample(self, model_wrap, sigmas, extra_args, callback, noise, latent_image=None, denoise_mask=None, disable_pbar=False): - self.optimizer.zero_grad() - noise = model_wrap.inner_model.model_sampling.noise_scaling(sigmas, noise, latent_image, False) - latent = model_wrap.inner_model.model_sampling.noise_scaling( - torch.zeros_like(sigmas), - torch.zeros_like(noise, requires_grad=True), - latent_image, - False - ) + cond = model_wrap.conds["positive"] + dataset_size = sigmas.size(0) + torch.cuda.empty_cache() + for i in (pbar:=tqdm.trange(self.total_steps, desc="Training LoRA", smoothing=0.01, disable=not comfy.utils.PROGRESS_BAR_ENABLED)): + noisegen = comfy_extras.nodes_custom_sampler.Noise_RandomNoise(self.seed + i * 1000) + indicies = torch.randperm(dataset_size)[:self.batch_size].tolist() - # Ensure model is in training mode and computing gradients - # x0 pred - denoised = model_wrap(noise, sigmas, **extra_args) - try: - 
loss = self.loss_fn(denoised, latent.clone()) - except RuntimeError as e: - if "does not require grad and does not have a grad_fn" in str(e): - logging.info("WARNING: This is likely due to the model is loaded in inference mode.") - loss.backward() - if self.loss_callback: - self.loss_callback(loss.item()) + batch_latent = torch.stack([latent_image[i] for i in indicies]) + batch_noise = noisegen.generate_noise({"samples": batch_latent}).to(batch_latent.device) + batch_sigmas = [ + model_wrap.inner_model.model_sampling.percent_to_sigma( + torch.rand((1,)).item() + ) for _ in range(min(self.batch_size, dataset_size)) + ] + batch_sigmas = torch.tensor(batch_sigmas).to(batch_latent.device) - self.optimizer.step() - # torch.cuda.memory._dump_snapshot("trainn.pickle") - # torch.cuda.memory._record_memory_history(enabled=None) + xt = model_wrap.inner_model.model_sampling.noise_scaling( + batch_sigmas, + batch_noise, + batch_latent, + False + ) + x0 = model_wrap.inner_model.model_sampling.noise_scaling( + torch.zeros_like(batch_sigmas), + torch.zeros_like(batch_noise), + batch_latent, + False + ) + + model_wrap.conds["positive"] = [ + cond[i] for i in indicies + ] + batch_extra_args = make_batch_extra_option_dict(extra_args, indicies, full_size=dataset_size) + + with torch.autocast(xt.device.type, dtype=self.training_dtype): + x0_pred = model_wrap(xt, batch_sigmas, **batch_extra_args) + loss = self.loss_fn(x0_pred, x0) + loss.backward() + if self.loss_callback: + self.loss_callback(loss.item()) + pbar.set_postfix({"loss": f"{loss.item():.4f}"}) + + self.optimizer.step() + self.optimizer.zero_grad() + torch.cuda.empty_cache() return torch.zeros_like(latent_image) @@ -584,36 +624,34 @@ class TrainLoraNode: loss_map = {"loss": []} def loss_callback(loss): loss_map["loss"].append(loss) - pbar.set_postfix({"loss": f"{loss:.4f}"}) train_sampler = TrainSampler( - criterion, optimizer, loss_callback=loss_callback + criterion, + optimizer, + loss_callback=loss_callback, + batch_size=batch_size, + total_steps=steps, + seed=seed, + training_dtype=dtype ) guider = comfy_extras.nodes_custom_sampler.Guider_Basic(mp) guider.set_conds(positive) # Set conditioning from input - # yoland: this currently resize to the first image in the dataset - # Training loop - torch.cuda.empty_cache() try: - for step in (pbar:=tqdm.trange(steps, desc="Training LoRA", smoothing=0.01, disable=not comfy.utils.PROGRESS_BAR_ENABLED)): - # Generate random sigma - sigmas = [mp.model.model_sampling.percent_to_sigma( - torch.rand((1,)).item() - ) for _ in range(min(batch_size, num_images))] - sigmas = torch.tensor(sigmas) - - noise = comfy_extras.nodes_custom_sampler.Noise_RandomNoise(step * 1000 + seed) - - indices = torch.randperm(num_images)[:batch_size] - batch_latent = latents[indices].clone() - guider.set_conds([positive[i] for i in indices]) # Set conditioning from input - guider.sample(noise.generate_noise({"samples": batch_latent}), batch_latent, train_sampler, sigmas, seed=noise.seed) + # Generate dummy sigmas and noise + sigmas = torch.tensor(range(num_images)) + noise = comfy_extras.nodes_custom_sampler.Noise_RandomNoise(seed) + guider.sample( + noise.generate_noise({"samples": latents}), + latents, + train_sampler, + sigmas, + seed=noise.seed + ) finally: for m in mp.model.modules(): unpatch(m) del train_sampler, optimizer - torch.cuda.empty_cache() for adapter in all_weight_adapters: adapter.requires_grad_(False) From 1fd306824d35bf2669f6be46fadab37efd7081c4 Mon Sep 17 00:00:00 2001 From: comfyanonymous 
<121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 9 Jul 2025 22:03:27 -0700 Subject: [PATCH 015/243] Add warning to catch torch import mistakes. (#8852) --- main.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/main.py b/main.py index d488c0f4c..0b1987ef4 100644 --- a/main.py +++ b/main.py @@ -127,6 +127,9 @@ if __name__ == "__main__": import cuda_malloc +if 'torch' in sys.modules: + logging.warning("WARNING: Potential Error in code: Torch already imported, torch should never be imported before this point.") + import comfy.utils import execution From 2b653e8c18f18792e7f080df611c8a35f1d0fdf4 Mon Sep 17 00:00:00 2001 From: guill Date: Thu, 10 Jul 2025 11:46:19 -0700 Subject: [PATCH 016/243] Support for async node functions (#8830) * Support for async execution functions This commit adds support for node execution functions defined as async. When a node's execution function is defined as async, we can continue executing other nodes while it is processing. Standard uses of `await` should "just work", but people will still have to be careful if they spawn actual threads. Because torch doesn't really have async/await versions of functions, this won't particularly help with most locally-executing nodes, but it does work for e.g. web requests to other machines. In addition to the execute function, the `VALIDATE_INPUTS` and `check_lazy_status` functions can also be defined as async, though we'll only resolve one node at a time right now for those. * Add the execution model tests to CI * Add a missing file It looks like this got caught by .gitignore? There's probably a better place to put it, but I'm not sure what that is. * Add the websocket library for automated tests * Add additional tests for async error cases Also fixes one bug that was found when an async function throws an error after being scheduled on a task. * Add a feature flags message to reduce bandwidth We now only send 1 preview message of the latest type the client can support. We'll add a console warning when the client fails to send a feature flags message at some point in the future. * Add async tests to CI * Don't actually add new tests in this PR Will do it in a separate PR * Resolve unit test in GPU-less runner * Just remove the tests that GHA can't handle * Change line endings to UNIX-style * Avoid loading model_management.py so early Because model_management.py has a top-level `logging.info`, we have to be careful not to import that file before we call `setup_logging`. If we do, we end up having the default logging handler registered in addition to our custom one. 
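
For illustration only, a custom node whose execution function is declared
`async` might look roughly like the sketch below. The class name, category,
and endpoint URL are placeholders for this example rather than anything
shipped by this change; the relevant part is simply that `FUNCTION` points
at a coroutine.

```python
import aiohttp


class AsyncRemoteCaptionExample:
    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {
                "url": ("STRING", {"default": "http://127.0.0.1:9000/caption"}),
            }
        }

    RETURN_TYPES = ("STRING",)
    FUNCTION = "fetch"
    CATEGORY = "example/async"

    async def fetch(self, url):
        # While this coroutine is awaiting network I/O, the executor can
        # keep running other ready nodes instead of blocking the prompt.
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                text = await resp.text()
        return (text,)
```

`aiohttp` is used in the sketch only because the ComfyUI server already
depends on it; any awaitable I/O gives the executor the same opportunity to
run other nodes while the coroutine is suspended.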
--- comfy/utils.py | 5 +- comfy_api/feature_flags.py | 69 +++ comfy_execution/caching.py | 53 +-- comfy_execution/graph.py | 20 +- comfy_execution/progress.py | 347 +++++++++++++++ comfy_execution/utils.py | 46 ++ execution.py | 135 ++++-- main.py | 32 +- protocol.py | 7 + server.py | 98 ++++- tests-unit/feature_flags_test.py | 98 +++++ tests-unit/requirements.txt | 1 + tests-unit/websocket_feature_flags_test.py | 77 ++++ tests/inference/extra_model_paths.yaml | 2 +- tests/inference/test_async_nodes.py | 410 ++++++++++++++++++ tests/inference/test_execution.py | 65 ++- .../testing_nodes/testing-pack/__init__.py | 49 ++- .../testing-pack/async_test_nodes.py | 343 +++++++++++++++ .../testing-pack/specific_tests.py | 136 ++++++ 19 files changed, 1898 insertions(+), 95 deletions(-) create mode 100644 comfy_api/feature_flags.py create mode 100644 comfy_execution/progress.py create mode 100644 comfy_execution/utils.py create mode 100644 protocol.py create mode 100644 tests-unit/feature_flags_test.py create mode 100644 tests-unit/websocket_feature_flags_test.py create mode 100644 tests/inference/test_async_nodes.py create mode 100644 tests/inference/testing_nodes/testing-pack/async_test_nodes.py diff --git a/comfy/utils.py b/comfy/utils.py index 47981d8f6..f8e01f713 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -998,11 +998,12 @@ def set_progress_bar_global_hook(function): PROGRESS_BAR_HOOK = function class ProgressBar: - def __init__(self, total): + def __init__(self, total, node_id=None): global PROGRESS_BAR_HOOK self.total = total self.current = 0 self.hook = PROGRESS_BAR_HOOK + self.node_id = node_id def update_absolute(self, value, total=None, preview=None): if total is not None: @@ -1011,7 +1012,7 @@ class ProgressBar: value = self.total self.current = value if self.hook is not None: - self.hook(self.current, self.total, preview) + self.hook(self.current, self.total, preview, node_id=self.node_id) def update(self, value): self.update_absolute(self.current + value) diff --git a/comfy_api/feature_flags.py b/comfy_api/feature_flags.py new file mode 100644 index 000000000..0d4389a6e --- /dev/null +++ b/comfy_api/feature_flags.py @@ -0,0 +1,69 @@ +""" +Feature flags module for ComfyUI WebSocket protocol negotiation. + +This module handles capability negotiation between frontend and backend, +allowing graceful protocol evolution while maintaining backward compatibility. +""" + +from typing import Any, Dict + +from comfy.cli_args import args + +# Default server capabilities +SERVER_FEATURE_FLAGS: Dict[str, Any] = { + "supports_preview_metadata": True, + "max_upload_size": args.max_upload_size * 1024 * 1024, # Convert MB to bytes +} + + +def get_connection_feature( + sockets_metadata: Dict[str, Dict[str, Any]], + sid: str, + feature_name: str, + default: Any = False +) -> Any: + """ + Get a feature flag value for a specific connection. + + Args: + sockets_metadata: Dictionary of socket metadata + sid: Session ID of the connection + feature_name: Name of the feature to check + default: Default value if feature not found + + Returns: + Feature value or default if not found + """ + if sid not in sockets_metadata: + return default + + return sockets_metadata[sid].get("feature_flags", {}).get(feature_name, default) + + +def supports_feature( + sockets_metadata: Dict[str, Dict[str, Any]], + sid: str, + feature_name: str +) -> bool: + """ + Check if a connection supports a specific feature. 
+ + Args: + sockets_metadata: Dictionary of socket metadata + sid: Session ID of the connection + feature_name: Name of the feature to check + + Returns: + Boolean indicating if feature is supported + """ + return get_connection_feature(sockets_metadata, sid, feature_name, False) is True + + +def get_server_features() -> Dict[str, Any]: + """ + Get the server's feature flags. + + Returns: + Dictionary of server feature flags + """ + return SERVER_FEATURE_FLAGS.copy() diff --git a/comfy_execution/caching.py b/comfy_execution/caching.py index dbb37b89f..41224ce3b 100644 --- a/comfy_execution/caching.py +++ b/comfy_execution/caching.py @@ -1,6 +1,7 @@ import itertools from typing import Sequence, Mapping, Dict from comfy_execution.graph import DynamicPrompt +from abc import ABC, abstractmethod import nodes @@ -16,12 +17,13 @@ def include_unique_id_in_input(class_type: str) -> bool: NODE_CLASS_CONTAINS_UNIQUE_ID[class_type] = "UNIQUE_ID" in class_def.INPUT_TYPES().get("hidden", {}).values() return NODE_CLASS_CONTAINS_UNIQUE_ID[class_type] -class CacheKeySet: +class CacheKeySet(ABC): def __init__(self, dynprompt, node_ids, is_changed_cache): self.keys = {} self.subcache_keys = {} - def add_keys(self, node_ids): + @abstractmethod + async def add_keys(self, node_ids): raise NotImplementedError() def all_node_ids(self): @@ -60,9 +62,8 @@ class CacheKeySetID(CacheKeySet): def __init__(self, dynprompt, node_ids, is_changed_cache): super().__init__(dynprompt, node_ids, is_changed_cache) self.dynprompt = dynprompt - self.add_keys(node_ids) - def add_keys(self, node_ids): + async def add_keys(self, node_ids): for node_id in node_ids: if node_id in self.keys: continue @@ -77,37 +78,36 @@ class CacheKeySetInputSignature(CacheKeySet): super().__init__(dynprompt, node_ids, is_changed_cache) self.dynprompt = dynprompt self.is_changed_cache = is_changed_cache - self.add_keys(node_ids) def include_node_id_in_input(self) -> bool: return False - def add_keys(self, node_ids): + async def add_keys(self, node_ids): for node_id in node_ids: if node_id in self.keys: continue if not self.dynprompt.has_node(node_id): continue node = self.dynprompt.get_node(node_id) - self.keys[node_id] = self.get_node_signature(self.dynprompt, node_id) + self.keys[node_id] = await self.get_node_signature(self.dynprompt, node_id) self.subcache_keys[node_id] = (node_id, node["class_type"]) - def get_node_signature(self, dynprompt, node_id): + async def get_node_signature(self, dynprompt, node_id): signature = [] ancestors, order_mapping = self.get_ordered_ancestry(dynprompt, node_id) - signature.append(self.get_immediate_node_signature(dynprompt, node_id, order_mapping)) + signature.append(await self.get_immediate_node_signature(dynprompt, node_id, order_mapping)) for ancestor_id in ancestors: - signature.append(self.get_immediate_node_signature(dynprompt, ancestor_id, order_mapping)) + signature.append(await self.get_immediate_node_signature(dynprompt, ancestor_id, order_mapping)) return to_hashable(signature) - def get_immediate_node_signature(self, dynprompt, node_id, ancestor_order_mapping): + async def get_immediate_node_signature(self, dynprompt, node_id, ancestor_order_mapping): if not dynprompt.has_node(node_id): # This node doesn't exist -- we can't cache it. 
return [float("NaN")] node = dynprompt.get_node(node_id) class_type = node["class_type"] class_def = nodes.NODE_CLASS_MAPPINGS[class_type] - signature = [class_type, self.is_changed_cache.get(node_id)] + signature = [class_type, await self.is_changed_cache.get(node_id)] if self.include_node_id_in_input() or (hasattr(class_def, "NOT_IDEMPOTENT") and class_def.NOT_IDEMPOTENT) or include_unique_id_in_input(class_type): signature.append(node_id) inputs = node["inputs"] @@ -150,9 +150,10 @@ class BasicCache: self.cache = {} self.subcaches = {} - def set_prompt(self, dynprompt, node_ids, is_changed_cache): + async def set_prompt(self, dynprompt, node_ids, is_changed_cache): self.dynprompt = dynprompt self.cache_key_set = self.key_class(dynprompt, node_ids, is_changed_cache) + await self.cache_key_set.add_keys(node_ids) self.is_changed_cache = is_changed_cache self.initialized = True @@ -201,13 +202,13 @@ class BasicCache: else: return None - def _ensure_subcache(self, node_id, children_ids): + async def _ensure_subcache(self, node_id, children_ids): subcache_key = self.cache_key_set.get_subcache_key(node_id) subcache = self.subcaches.get(subcache_key, None) if subcache is None: subcache = BasicCache(self.key_class) self.subcaches[subcache_key] = subcache - subcache.set_prompt(self.dynprompt, children_ids, self.is_changed_cache) + await subcache.set_prompt(self.dynprompt, children_ids, self.is_changed_cache) return subcache def _get_subcache(self, node_id): @@ -259,10 +260,10 @@ class HierarchicalCache(BasicCache): assert cache is not None cache._set_immediate(node_id, value) - def ensure_subcache_for(self, node_id, children_ids): + async def ensure_subcache_for(self, node_id, children_ids): cache = self._get_cache_for(node_id) assert cache is not None - return cache._ensure_subcache(node_id, children_ids) + return await cache._ensure_subcache(node_id, children_ids) class LRUCache(BasicCache): def __init__(self, key_class, max_size=100): @@ -273,8 +274,8 @@ class LRUCache(BasicCache): self.used_generation = {} self.children = {} - def set_prompt(self, dynprompt, node_ids, is_changed_cache): - super().set_prompt(dynprompt, node_ids, is_changed_cache) + async def set_prompt(self, dynprompt, node_ids, is_changed_cache): + await super().set_prompt(dynprompt, node_ids, is_changed_cache) self.generation += 1 for node_id in node_ids: self._mark_used(node_id) @@ -303,11 +304,11 @@ class LRUCache(BasicCache): self._mark_used(node_id) return self._set_immediate(node_id, value) - def ensure_subcache_for(self, node_id, children_ids): + async def ensure_subcache_for(self, node_id, children_ids): # Just uses subcaches for tracking 'live' nodes - super()._ensure_subcache(node_id, children_ids) + await super()._ensure_subcache(node_id, children_ids) - self.cache_key_set.add_keys(children_ids) + await self.cache_key_set.add_keys(children_ids) self._mark_used(node_id) cache_key = self.cache_key_set.get_data_key(node_id) self.children[cache_key] = [] @@ -337,7 +338,7 @@ class DependencyAwareCache(BasicCache): self.ancestors = {} # Maps node_id -> set of ancestor node_ids self.executed_nodes = set() # Tracks nodes that have been executed - def set_prompt(self, dynprompt, node_ids, is_changed_cache): + async def set_prompt(self, dynprompt, node_ids, is_changed_cache): """ Clear the entire cache and rebuild the dependency graph. 
@@ -354,7 +355,7 @@ class DependencyAwareCache(BasicCache): self.executed_nodes.clear() # Call the parent method to initialize the cache with the new prompt - super().set_prompt(dynprompt, node_ids, is_changed_cache) + await super().set_prompt(dynprompt, node_ids, is_changed_cache) # Rebuild the dependency graph self._build_dependency_graph(dynprompt, node_ids) @@ -405,7 +406,7 @@ class DependencyAwareCache(BasicCache): """ return self._get_immediate(node_id) - def ensure_subcache_for(self, node_id, children_ids): + async def ensure_subcache_for(self, node_id, children_ids): """ Ensure a subcache exists for a node and update dependencies. @@ -416,7 +417,7 @@ class DependencyAwareCache(BasicCache): Returns: The subcache object for the node. """ - subcache = super()._ensure_subcache(node_id, children_ids) + subcache = await super()._ensure_subcache(node_id, children_ids) for child_id in children_ids: self.descendants[node_id].add(child_id) self.ancestors[child_id].add(node_id) diff --git a/comfy_execution/graph.py b/comfy_execution/graph.py index a2799b52e..c79243e1e 100644 --- a/comfy_execution/graph.py +++ b/comfy_execution/graph.py @@ -2,6 +2,7 @@ from __future__ import annotations from typing import Type, Literal import nodes +import asyncio from comfy_execution.graph_utils import is_link from comfy.comfy_types.node_typing import ComfyNodeABC, InputTypeDict, InputTypeOptions @@ -100,6 +101,8 @@ class TopologicalSort: self.pendingNodes = {} self.blockCount = {} # Number of nodes this node is directly blocked by self.blocking = {} # Which nodes are blocked by this node + self.externalBlocks = 0 + self.unblockedEvent = asyncio.Event() def get_input_info(self, unique_id, input_name): class_type = self.dynprompt.get_node(unique_id)["class_type"] @@ -153,6 +156,16 @@ class TopologicalSort: for link in links: self.add_strong_link(*link) + def add_external_block(self, node_id): + assert node_id in self.blockCount, "Can't add external block to a node that isn't pending" + self.externalBlocks += 1 + self.blockCount[node_id] += 1 + def unblock(): + self.externalBlocks -= 1 + self.blockCount[node_id] -= 1 + self.unblockedEvent.set() + return unblock + def is_cached(self, node_id): return False @@ -181,11 +194,16 @@ class ExecutionList(TopologicalSort): def is_cached(self, node_id): return self.output_cache.get(node_id) is not None - def stage_node_execution(self): + async def stage_node_execution(self): assert self.staged_node_id is None if self.is_empty(): return None, None, None available = self.get_ready_nodes() + while len(available) == 0 and self.externalBlocks > 0: + # Wait for an external block to be released + await self.unblockedEvent.wait() + self.unblockedEvent.clear() + available = self.get_ready_nodes() if len(available) == 0: cycled_nodes = self.get_nodes_in_cycle() # Because cycles composed entirely of static nodes are caught during initial validation, diff --git a/comfy_execution/progress.py b/comfy_execution/progress.py new file mode 100644 index 000000000..5645b3e3c --- /dev/null +++ b/comfy_execution/progress.py @@ -0,0 +1,347 @@ +from typing import TypedDict, Dict, Optional +from typing_extensions import override +from PIL import Image +from enum import Enum +from abc import ABC +from tqdm import tqdm +from typing import TYPE_CHECKING +if TYPE_CHECKING: + from comfy_execution.graph import DynamicPrompt +from protocol import BinaryEventTypes +from comfy_api import feature_flags + + +class NodeState(Enum): + Pending = "pending" + Running = "running" + Finished = "finished" + Error 
= "error" + + +class NodeProgressState(TypedDict): + """ + A class to represent the state of a node's progress. + """ + + state: NodeState + value: float + max: float + + +class ProgressHandler(ABC): + """ + Abstract base class for progress handlers. + Progress handlers receive progress updates and display them in various ways. + """ + + def __init__(self, name: str): + self.name = name + self.enabled = True + + def set_registry(self, registry: "ProgressRegistry"): + pass + + def start_handler(self, node_id: str, state: NodeProgressState, prompt_id: str): + """Called when a node starts processing""" + pass + + def update_handler( + self, + node_id: str, + value: float, + max_value: float, + state: NodeProgressState, + prompt_id: str, + image: Optional[Image.Image] = None, + ): + """Called when a node's progress is updated""" + pass + + def finish_handler(self, node_id: str, state: NodeProgressState, prompt_id: str): + """Called when a node finishes processing""" + pass + + def reset(self): + """Called when the progress registry is reset""" + pass + + def enable(self): + """Enable this handler""" + self.enabled = True + + def disable(self): + """Disable this handler""" + self.enabled = False + + +class CLIProgressHandler(ProgressHandler): + """ + Handler that displays progress using tqdm progress bars in the CLI. + """ + + def __init__(self): + super().__init__("cli") + self.progress_bars: Dict[str, tqdm] = {} + + @override + def start_handler(self, node_id: str, state: NodeProgressState, prompt_id: str): + # Create a new tqdm progress bar + if node_id not in self.progress_bars: + self.progress_bars[node_id] = tqdm( + total=state["max"], + desc=f"Node {node_id}", + unit="steps", + leave=True, + position=len(self.progress_bars), + ) + + @override + def update_handler( + self, + node_id: str, + value: float, + max_value: float, + state: NodeProgressState, + prompt_id: str, + image: Optional[Image.Image] = None, + ): + # Handle case where start_handler wasn't called + if node_id not in self.progress_bars: + self.progress_bars[node_id] = tqdm( + total=max_value, + desc=f"Node {node_id}", + unit="steps", + leave=True, + position=len(self.progress_bars), + ) + self.progress_bars[node_id].update(value) + else: + # Update existing progress bar + if max_value != self.progress_bars[node_id].total: + self.progress_bars[node_id].total = max_value + # Calculate the update amount (difference from current position) + current_position = self.progress_bars[node_id].n + update_amount = value - current_position + if update_amount > 0: + self.progress_bars[node_id].update(update_amount) + + @override + def finish_handler(self, node_id: str, state: NodeProgressState, prompt_id: str): + # Complete and close the progress bar if it exists + if node_id in self.progress_bars: + # Ensure the bar shows 100% completion + remaining = state["max"] - self.progress_bars[node_id].n + if remaining > 0: + self.progress_bars[node_id].update(remaining) + self.progress_bars[node_id].close() + del self.progress_bars[node_id] + + @override + def reset(self): + # Close all progress bars + for bar in self.progress_bars.values(): + bar.close() + self.progress_bars.clear() + + +class WebUIProgressHandler(ProgressHandler): + """ + Handler that sends progress updates to the WebUI via WebSockets. 
+ """ + + def __init__(self, server_instance): + super().__init__("webui") + self.server_instance = server_instance + + def set_registry(self, registry: "ProgressRegistry"): + self.registry = registry + + def _send_progress_state(self, prompt_id: str, nodes: Dict[str, NodeProgressState]): + """Send the current progress state to the client""" + if self.server_instance is None: + return + + # Only send info for non-pending nodes + active_nodes = { + node_id: { + "value": state["value"], + "max": state["max"], + "state": state["state"].value, + "node_id": node_id, + "prompt_id": prompt_id, + "display_node_id": self.registry.dynprompt.get_display_node_id(node_id), + "parent_node_id": self.registry.dynprompt.get_parent_node_id(node_id), + "real_node_id": self.registry.dynprompt.get_real_node_id(node_id), + } + for node_id, state in nodes.items() + if state["state"] != NodeState.Pending + } + + # Send a combined progress_state message with all node states + self.server_instance.send_sync( + "progress_state", {"prompt_id": prompt_id, "nodes": active_nodes} + ) + + @override + def start_handler(self, node_id: str, state: NodeProgressState, prompt_id: str): + # Send progress state of all nodes + if self.registry: + self._send_progress_state(prompt_id, self.registry.nodes) + + @override + def update_handler( + self, + node_id: str, + value: float, + max_value: float, + state: NodeProgressState, + prompt_id: str, + image: Optional[Image.Image] = None, + ): + # Send progress state of all nodes + if self.registry: + self._send_progress_state(prompt_id, self.registry.nodes) + if image: + # Only send new format if client supports it + if feature_flags.supports_feature( + self.server_instance.sockets_metadata, + self.server_instance.client_id, + "supports_preview_metadata", + ): + metadata = { + "node_id": node_id, + "prompt_id": prompt_id, + "display_node_id": self.registry.dynprompt.get_display_node_id( + node_id + ), + "parent_node_id": self.registry.dynprompt.get_parent_node_id( + node_id + ), + "real_node_id": self.registry.dynprompt.get_real_node_id(node_id), + } + self.server_instance.send_sync( + BinaryEventTypes.PREVIEW_IMAGE_WITH_METADATA, + (image, metadata), + self.server_instance.client_id, + ) + + @override + def finish_handler(self, node_id: str, state: NodeProgressState, prompt_id: str): + # Send progress state of all nodes + if self.registry: + self._send_progress_state(prompt_id, self.registry.nodes) + + +class ProgressRegistry: + """ + Registry that maintains node progress state and notifies registered handlers. 
+ """ + + def __init__(self, prompt_id: str, dynprompt: "DynamicPrompt"): + self.prompt_id = prompt_id + self.dynprompt = dynprompt + self.nodes: Dict[str, NodeProgressState] = {} + self.handlers: Dict[str, ProgressHandler] = {} + + def register_handler(self, handler: ProgressHandler) -> None: + """Register a progress handler""" + self.handlers[handler.name] = handler + + def unregister_handler(self, handler_name: str) -> None: + """Unregister a progress handler""" + if handler_name in self.handlers: + # Allow handler to clean up resources + self.handlers[handler_name].reset() + del self.handlers[handler_name] + + def enable_handler(self, handler_name: str) -> None: + """Enable a progress handler""" + if handler_name in self.handlers: + self.handlers[handler_name].enable() + + def disable_handler(self, handler_name: str) -> None: + """Disable a progress handler""" + if handler_name in self.handlers: + self.handlers[handler_name].disable() + + def ensure_entry(self, node_id: str) -> NodeProgressState: + """Ensure a node entry exists""" + if node_id not in self.nodes: + self.nodes[node_id] = NodeProgressState( + state=NodeState.Pending, value=0, max=1 + ) + return self.nodes[node_id] + + def start_progress(self, node_id: str) -> None: + """Start progress tracking for a node""" + entry = self.ensure_entry(node_id) + entry["state"] = NodeState.Running + entry["value"] = 0.0 + entry["max"] = 1.0 + + # Notify all enabled handlers + for handler in self.handlers.values(): + if handler.enabled: + handler.start_handler(node_id, entry, self.prompt_id) + + def update_progress( + self, node_id: str, value: float, max_value: float, image: Optional[Image.Image] + ) -> None: + """Update progress for a node""" + entry = self.ensure_entry(node_id) + entry["state"] = NodeState.Running + entry["value"] = value + entry["max"] = max_value + + # Notify all enabled handlers + for handler in self.handlers.values(): + if handler.enabled: + handler.update_handler( + node_id, value, max_value, entry, self.prompt_id, image + ) + + def finish_progress(self, node_id: str) -> None: + """Finish progress tracking for a node""" + entry = self.ensure_entry(node_id) + entry["state"] = NodeState.Finished + entry["value"] = entry["max"] + + # Notify all enabled handlers + for handler in self.handlers.values(): + if handler.enabled: + handler.finish_handler(node_id, entry, self.prompt_id) + + def reset_handlers(self) -> None: + """Reset all handlers""" + for handler in self.handlers.values(): + handler.reset() + +# Global registry instance +global_progress_registry: ProgressRegistry | None = None + +def reset_progress_state(prompt_id: str, dynprompt: "DynamicPrompt") -> None: + global global_progress_registry + + # Reset existing handlers if registry exists + if global_progress_registry is not None: + global_progress_registry.reset_handlers() + + # Create new registry + global_progress_registry = ProgressRegistry(prompt_id, dynprompt) + + +def add_progress_handler(handler: ProgressHandler) -> None: + registry = get_progress_state() + handler.set_registry(registry) + registry.register_handler(handler) + + +def get_progress_state() -> ProgressRegistry: + global global_progress_registry + if global_progress_registry is None: + from comfy_execution.graph import DynamicPrompt + + global_progress_registry = ProgressRegistry( + prompt_id="", dynprompt=DynamicPrompt({}) + ) + return global_progress_registry diff --git a/comfy_execution/utils.py b/comfy_execution/utils.py new file mode 100644 index 000000000..62d32f101 --- /dev/null +++ 
b/comfy_execution/utils.py @@ -0,0 +1,46 @@ +import contextvars +from typing import Optional, NamedTuple + +class ExecutionContext(NamedTuple): + """ + Context information about the currently executing node. + + Attributes: + node_id: The ID of the currently executing node + list_index: The index in a list being processed (for operations on batches/lists) + """ + prompt_id: str + node_id: str + list_index: Optional[int] + +current_executing_context: contextvars.ContextVar[Optional[ExecutionContext]] = contextvars.ContextVar("current_executing_context", default=None) + +def get_executing_context() -> Optional[ExecutionContext]: + return current_executing_context.get(None) + +class CurrentNodeContext: + """ + Context manager for setting the current executing node context. + + Sets the current_executing_context on enter and resets it on exit. + + Example: + with CurrentNodeContext(node_id="123", list_index=0): + # Code that should run with the current node context set + process_image() + """ + def __init__(self, prompt_id: str, node_id: str, list_index: Optional[int] = None): + self.context = ExecutionContext( + prompt_id= prompt_id, + node_id= node_id, + list_index= list_index + ) + self.token = None + + def __enter__(self): + self.token = current_executing_context.set(self.context) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.token is not None: + current_executing_context.reset(self.token) diff --git a/execution.py b/execution.py index f6006fa12..90cefc023 100644 --- a/execution.py +++ b/execution.py @@ -8,12 +8,14 @@ import time import traceback from enum import Enum from typing import List, Literal, NamedTuple, Optional +import asyncio import torch import comfy.model_management import nodes from comfy_execution.caching import ( + BasicCache, CacheKeySetID, CacheKeySetInputSignature, DependencyAwareCache, @@ -28,6 +30,8 @@ from comfy_execution.graph import ( ) from comfy_execution.graph_utils import GraphBuilder, is_link from comfy_execution.validation import validate_node_input +from comfy_execution.progress import get_progress_state, reset_progress_state, add_progress_handler, WebUIProgressHandler +from comfy_execution.utils import CurrentNodeContext class ExecutionResult(Enum): @@ -39,12 +43,13 @@ class DuplicateNodeError(Exception): pass class IsChangedCache: - def __init__(self, dynprompt, outputs_cache): + def __init__(self, prompt_id: str, dynprompt: DynamicPrompt, outputs_cache: BasicCache): + self.prompt_id = prompt_id self.dynprompt = dynprompt self.outputs_cache = outputs_cache self.is_changed = {} - def get(self, node_id): + async def get(self, node_id): if node_id in self.is_changed: return self.is_changed[node_id] @@ -62,7 +67,8 @@ class IsChangedCache: # Intentionally do not use cached outputs here. 
We only want constants in IS_CHANGED input_data_all, _ = get_input_data(node["inputs"], class_def, node_id, None) try: - is_changed = _map_node_over_list(class_def, input_data_all, "IS_CHANGED") + is_changed = await _async_map_node_over_list(self.prompt_id, node_id, class_def, input_data_all, "IS_CHANGED") + is_changed = await resolve_map_node_over_list_results(is_changed) node["is_changed"] = [None if isinstance(x, ExecutionBlocker) else x for x in is_changed] except Exception as e: logging.warning("WARNING: {}".format(e)) @@ -164,7 +170,19 @@ def get_input_data(inputs, class_def, unique_id, outputs=None, dynprompt=None, e map_node_over_list = None #Don't hook this please -def _map_node_over_list(obj, input_data_all, func, allow_interrupt=False, execution_block_cb=None, pre_execute_cb=None): +async def resolve_map_node_over_list_results(results): + remaining = [x for x in results if isinstance(x, asyncio.Task) and not x.done()] + if len(remaining) == 0: + return [x.result() if isinstance(x, asyncio.Task) else x for x in results] + else: + done, pending = await asyncio.wait(remaining) + for task in done: + exc = task.exception() + if exc is not None: + raise exc + return [x.result() if isinstance(x, asyncio.Task) else x for x in results] + +async def _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, func, allow_interrupt=False, execution_block_cb=None, pre_execute_cb=None): # check if node wants the lists input_is_list = getattr(obj, "INPUT_IS_LIST", False) @@ -178,7 +196,7 @@ def _map_node_over_list(obj, input_data_all, func, allow_interrupt=False, execut return {k: v[i if len(v) > i else -1] for k, v in d.items()} results = [] - def process_inputs(inputs, index=None, input_is_list=False): + async def process_inputs(inputs, index=None, input_is_list=False): if allow_interrupt: nodes.before_node_execution() execution_block = None @@ -194,20 +212,37 @@ def _map_node_over_list(obj, input_data_all, func, allow_interrupt=False, execut if execution_block is None: if pre_execute_cb is not None and index is not None: pre_execute_cb(index) - results.append(getattr(obj, func)(**inputs)) + f = getattr(obj, func) + if inspect.iscoroutinefunction(f): + async def async_wrapper(f, prompt_id, unique_id, list_index, args): + with CurrentNodeContext(prompt_id, unique_id, list_index): + return await f(**args) + task = asyncio.create_task(async_wrapper(f, prompt_id, unique_id, index, args=inputs)) + # Give the task a chance to execute without yielding + await asyncio.sleep(0) + if task.done(): + result = task.result() + results.append(result) + else: + results.append(task) + else: + with CurrentNodeContext(prompt_id, unique_id, index): + result = f(**inputs) + results.append(result) else: results.append(execution_block) if input_is_list: - process_inputs(input_data_all, 0, input_is_list=input_is_list) + await process_inputs(input_data_all, 0, input_is_list=input_is_list) elif max_len_input == 0: - process_inputs({}) + await process_inputs({}) else: for i in range(max_len_input): input_dict = slice_dict(input_data_all, i) - process_inputs(input_dict, i) + await process_inputs(input_dict, i) return results + def merge_result_data(results, obj): # check which outputs need concatenating output = [] @@ -229,11 +264,18 @@ def merge_result_data(results, obj): output.append([o[i] for o in results]) return output -def get_output_data(obj, input_data_all, execution_block_cb=None, pre_execute_cb=None): +async def get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=None, 
pre_execute_cb=None): + return_values = await _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, obj.FUNCTION, allow_interrupt=True, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb) + has_pending_task = any(isinstance(r, asyncio.Task) and not r.done() for r in return_values) + if has_pending_task: + return return_values, {}, False, has_pending_task + output, ui, has_subgraph = get_output_from_returns(return_values, obj) + return output, ui, has_subgraph, False + +def get_output_from_returns(return_values, obj): results = [] uis = [] subgraph_results = [] - return_values = _map_node_over_list(obj, input_data_all, obj.FUNCTION, allow_interrupt=True, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb) has_subgraph = False for i in range(len(return_values)): r = return_values[i] @@ -267,6 +309,10 @@ def get_output_data(obj, input_data_all, execution_block_cb=None, pre_execute_cb else: output = [] ui = dict() + # TODO: Think there's an existing bug here + # If we're performing a subgraph expansion, we probably shouldn't be returning UI values yet. + # They'll get cached without the completed subgraphs. It's an edge case and I'm not aware of + # any nodes that use both subgraph expansion and custom UI outputs, but might be a problem in the future. if len(uis) > 0: ui = {k: [y for x in uis for y in x[k]] for k in uis[0].keys()} return output, ui, has_subgraph @@ -279,7 +325,7 @@ def format_value(x): else: return str(x) -def execute(server, dynprompt, caches, current_item, extra_data, executed, prompt_id, execution_list, pending_subgraph_results): +async def execute(server, dynprompt, caches, current_item, extra_data, executed, prompt_id, execution_list, pending_subgraph_results, pending_async_nodes): unique_id = current_item real_node_id = dynprompt.get_real_node_id(unique_id) display_node_id = dynprompt.get_display_node_id(unique_id) @@ -291,11 +337,26 @@ def execute(server, dynprompt, caches, current_item, extra_data, executed, promp if server.client_id is not None: cached_output = caches.ui.get(unique_id) or {} server.send_sync("executed", { "node": unique_id, "display_node": display_node_id, "output": cached_output.get("output",None), "prompt_id": prompt_id }, server.client_id) + get_progress_state().finish_progress(unique_id) return (ExecutionResult.SUCCESS, None, None) input_data_all = None try: - if unique_id in pending_subgraph_results: + if unique_id in pending_async_nodes: + results = [] + for r in pending_async_nodes[unique_id]: + if isinstance(r, asyncio.Task): + try: + results.append(r.result()) + except Exception as ex: + # An async task failed - propagate the exception up + del pending_async_nodes[unique_id] + raise ex + else: + results.append(r) + del pending_async_nodes[unique_id] + output_data, output_ui, has_subgraph = get_output_from_returns(results, class_def) + elif unique_id in pending_subgraph_results: cached_results = pending_subgraph_results[unique_id] resolved_outputs = [] for is_subgraph, result in cached_results: @@ -317,6 +378,7 @@ def execute(server, dynprompt, caches, current_item, extra_data, executed, promp output_ui = [] has_subgraph = False else: + get_progress_state().start_progress(unique_id) input_data_all, missing_keys = get_input_data(inputs, class_def, unique_id, caches.outputs, dynprompt, extra_data) if server.client_id is not None: server.last_node_id = display_node_id @@ -328,7 +390,8 @@ def execute(server, dynprompt, caches, current_item, extra_data, executed, promp caches.objects.set(unique_id, 
obj) if hasattr(obj, "check_lazy_status"): - required_inputs = _map_node_over_list(obj, input_data_all, "check_lazy_status", allow_interrupt=True) + required_inputs = await _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, "check_lazy_status", allow_interrupt=True) + required_inputs = await resolve_map_node_over_list_results(required_inputs) required_inputs = set(sum([r for r in required_inputs if isinstance(r,list)], [])) required_inputs = [x for x in required_inputs if isinstance(x,str) and ( x not in input_data_all or x in missing_keys @@ -357,8 +420,18 @@ def execute(server, dynprompt, caches, current_item, extra_data, executed, promp else: return block def pre_execute_cb(call_index): + # TODO - How to handle this with async functions without contextvars (which requires Python 3.12)? GraphBuilder.set_default_prefix(unique_id, call_index, 0) - output_data, output_ui, has_subgraph = get_output_data(obj, input_data_all, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb) + output_data, output_ui, has_subgraph, has_pending_tasks = await get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb) + if has_pending_tasks: + pending_async_nodes[unique_id] = output_data + unblock = execution_list.add_external_block(unique_id) + async def await_completion(): + tasks = [x for x in output_data if isinstance(x, asyncio.Task)] + await asyncio.gather(*tasks, return_exceptions=True) + unblock() + asyncio.create_task(await_completion()) + return (ExecutionResult.PENDING, None, None) if len(output_ui) > 0: caches.ui.set(unique_id, { "meta": { @@ -401,7 +474,8 @@ def execute(server, dynprompt, caches, current_item, extra_data, executed, promp cached_outputs.append((True, node_outputs)) new_node_ids = set(new_node_ids) for cache in caches.all: - cache.ensure_subcache_for(unique_id, new_node_ids).clean_unused() + subcache = await cache.ensure_subcache_for(unique_id, new_node_ids) + subcache.clean_unused() for node_id in new_output_ids: execution_list.add_node(node_id) for link in new_output_links: @@ -446,6 +520,7 @@ def execute(server, dynprompt, caches, current_item, extra_data, executed, promp return (ExecutionResult.FAILURE, error_details, ex) + get_progress_state().finish_progress(unique_id) executed.add(unique_id) return (ExecutionResult.SUCCESS, None, None) @@ -500,6 +575,11 @@ class PromptExecutor: self.add_message("execution_error", mes, broadcast=False) def execute(self, prompt, prompt_id, extra_data={}, execute_outputs=[]): + asyncio_loop = asyncio.new_event_loop() + asyncio.set_event_loop(asyncio_loop) + asyncio.run(self.execute_async(prompt, prompt_id, extra_data, execute_outputs)) + + async def execute_async(self, prompt, prompt_id, extra_data={}, execute_outputs=[]): nodes.interrupt_processing(False) if "client_id" in extra_data: @@ -512,9 +592,11 @@ class PromptExecutor: with torch.inference_mode(): dynamic_prompt = DynamicPrompt(prompt) - is_changed_cache = IsChangedCache(dynamic_prompt, self.caches.outputs) + reset_progress_state(prompt_id, dynamic_prompt) + add_progress_handler(WebUIProgressHandler(self.server)) + is_changed_cache = IsChangedCache(prompt_id, dynamic_prompt, self.caches.outputs) for cache in self.caches.all: - cache.set_prompt(dynamic_prompt, prompt.keys(), is_changed_cache) + await cache.set_prompt(dynamic_prompt, prompt.keys(), is_changed_cache) cache.clean_unused() cached_nodes = [] @@ -527,6 +609,7 @@ class PromptExecutor: { "nodes": cached_nodes, "prompt_id": 
prompt_id}, broadcast=False) pending_subgraph_results = {} + pending_async_nodes = {} # TODO - Unify this with pending_subgraph_results executed = set() execution_list = ExecutionList(dynamic_prompt, self.caches.outputs) current_outputs = self.caches.outputs.all_node_ids() @@ -534,12 +617,13 @@ class PromptExecutor: execution_list.add_node(node_id) while not execution_list.is_empty(): - node_id, error, ex = execution_list.stage_node_execution() + node_id, error, ex = await execution_list.stage_node_execution() if error is not None: self.handle_execution_error(prompt_id, dynamic_prompt.original_prompt, current_outputs, executed, error, ex) break - result, error, ex = execute(self.server, dynamic_prompt, self.caches, node_id, extra_data, executed, prompt_id, execution_list, pending_subgraph_results) + assert node_id is not None, "Node ID should not be None at this point" + result, error, ex = await execute(self.server, dynamic_prompt, self.caches, node_id, extra_data, executed, prompt_id, execution_list, pending_subgraph_results, pending_async_nodes) self.success = result != ExecutionResult.FAILURE if result == ExecutionResult.FAILURE: self.handle_execution_error(prompt_id, dynamic_prompt.original_prompt, current_outputs, executed, error, ex) @@ -569,7 +653,7 @@ class PromptExecutor: comfy.model_management.unload_all_models() -def validate_inputs(prompt, item, validated): +async def validate_inputs(prompt_id, prompt, item, validated): unique_id = item if unique_id in validated: return validated[unique_id] @@ -646,7 +730,7 @@ def validate_inputs(prompt, item, validated): errors.append(error) continue try: - r = validate_inputs(prompt, o_id, validated) + r = await validate_inputs(prompt_id, prompt, o_id, validated) if r[0] is False: # `r` will be set in `validated[o_id]` already valid = False @@ -771,7 +855,8 @@ def validate_inputs(prompt, item, validated): input_filtered['input_types'] = [received_types] #ret = obj_class.VALIDATE_INPUTS(**input_filtered) - ret = _map_node_over_list(obj_class, input_filtered, "VALIDATE_INPUTS") + ret = await _async_map_node_over_list(prompt_id, unique_id, obj_class, input_filtered, "VALIDATE_INPUTS") + ret = await resolve_map_node_over_list_results(ret) for x in input_filtered: for i, r in enumerate(ret): if r is not True and not isinstance(r, ExecutionBlocker): @@ -804,7 +889,7 @@ def full_type_name(klass): return klass.__qualname__ return module + '.' + klass.__qualname__ -def validate_prompt(prompt): +async def validate_prompt(prompt_id, prompt): outputs = set() for x in prompt: if 'class_type' not in prompt[x]: @@ -847,7 +932,7 @@ def validate_prompt(prompt): valid = False reasons = [] try: - m = validate_inputs(prompt, o, validated) + m = await validate_inputs(prompt_id, prompt, o, validated) valid = m[0] reasons = m[1] except Exception as ex: diff --git a/main.py b/main.py index 0b1987ef4..2b4ffafd4 100644 --- a/main.py +++ b/main.py @@ -11,6 +11,9 @@ import itertools import utils.extra_config import logging import sys +from comfy_execution.progress import get_progress_state +from comfy_execution.utils import get_executing_context +from comfy_api import feature_flags if __name__ == "__main__": #NOTE: These do not do anything on core ComfyUI, they are for custom nodes. 
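
# A minimal, runnable sketch (not part of the patch) of the scheduling pattern that
# _async_map_node_over_list and CurrentNodeContext introduce above: wrap the node's
# coroutine so the ExecutionContext contextvar is set, start it as a task, yield once
# with asyncio.sleep(0), and keep the bare Task only when it has not finished yet.
# Everything except the comfy_execution.utils imports is illustrative.
import asyncio
from comfy_execution.utils import CurrentNodeContext, get_executing_context

async def fake_node_func(seconds: float) -> str:
    ctx = get_executing_context()  # set by CurrentNodeContext in the wrapper below
    if seconds:
        await asyncio.sleep(seconds)
    return f"prompt {ctx.prompt_id} node {ctx.node_id} slept {seconds}s"

async def run_like_the_executor() -> list:
    results = []
    for node_id, seconds in [("1", 0.0), ("2", 0.2)]:
        async def wrapper(node_id=node_id, seconds=seconds):
            with CurrentNodeContext("demo-prompt", node_id, None):
                return await fake_node_func(seconds)
        task = asyncio.create_task(wrapper())
        await asyncio.sleep(0)  # one scheduling pass, like the executor gives each task
        results.append(task.result() if task.done() else task)
    # Anything still pending is awaited later, mirroring resolve_map_node_over_list_results.
    return [(await r) if isinstance(r, asyncio.Task) else r for r in results]

if __name__ == "__main__":
    print(asyncio.run(run_like_the_executor()))
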
@@ -134,7 +137,7 @@ import comfy.utils import execution import server -from server import BinaryEventTypes +from protocol import BinaryEventTypes import nodes import comfy.model_management import comfyui_version @@ -230,15 +233,34 @@ async def run(server_instance, address='', port=8188, verbose=True, call_on_star server_instance.start_multi_address(addresses, call_on_start, verbose), server_instance.publish_loop() ) - def hijack_progress(server_instance): - def hook(value, total, preview_image): + def hook(value, total, preview_image, prompt_id=None, node_id=None): + executing_context = get_executing_context() + if prompt_id is None and executing_context is not None: + prompt_id = executing_context.prompt_id + if node_id is None and executing_context is not None: + node_id = executing_context.node_id comfy.model_management.throw_exception_if_processing_interrupted() - progress = {"value": value, "max": total, "prompt_id": server_instance.last_prompt_id, "node": server_instance.last_node_id} + if prompt_id is None: + prompt_id = server_instance.last_prompt_id + if node_id is None: + node_id = server_instance.last_node_id + progress = {"value": value, "max": total, "prompt_id": prompt_id, "node": node_id} + get_progress_state().update_progress(node_id, value, total, preview_image) server_instance.send_sync("progress", progress, server_instance.client_id) if preview_image is not None: - server_instance.send_sync(BinaryEventTypes.UNENCODED_PREVIEW_IMAGE, preview_image, server_instance.client_id) + # Only send old method if client doesn't support preview metadata + if not feature_flags.supports_feature( + server_instance.sockets_metadata, + server_instance.client_id, + "supports_preview_metadata", + ): + server_instance.send_sync( + BinaryEventTypes.UNENCODED_PREVIEW_IMAGE, + preview_image, + server_instance.client_id, + ) comfy.utils.set_progress_bar_global_hook(hook) diff --git a/protocol.py b/protocol.py new file mode 100644 index 000000000..038a0a840 --- /dev/null +++ b/protocol.py @@ -0,0 +1,7 @@ + +class BinaryEventTypes: + PREVIEW_IMAGE = 1 + UNENCODED_PREVIEW_IMAGE = 2 + TEXT = 3 + PREVIEW_IMAGE_WITH_METADATA = 4 + diff --git a/server.py b/server.py index 878b5eeb1..e8bad9f4e 100644 --- a/server.py +++ b/server.py @@ -26,6 +26,7 @@ import mimetypes from comfy.cli_args import args import comfy.utils import comfy.model_management +from comfy_api import feature_flags import node_helpers from comfyui_version import __version__ from app.frontend_management import FrontendManager @@ -35,11 +36,7 @@ from app.model_manager import ModelFileManager from app.custom_node_manager import CustomNodeManager from typing import Optional, Union from api_server.routes.internal.internal_routes import InternalRoutes - -class BinaryEventTypes: - PREVIEW_IMAGE = 1 - UNENCODED_PREVIEW_IMAGE = 2 - TEXT = 3 +from protocol import BinaryEventTypes async def send_socket_catch_exception(function, message): try: @@ -178,6 +175,7 @@ class PromptServer(): max_upload_size = round(args.max_upload_size * 1024 * 1024) self.app = web.Application(client_max_size=max_upload_size, middlewares=middlewares) self.sockets = dict() + self.sockets_metadata = dict() self.web_root = ( FrontendManager.init_frontend(args.front_end_version) if args.front_end_root is None @@ -202,20 +200,53 @@ class PromptServer(): else: sid = uuid.uuid4().hex + # Store WebSocket for backward compatibility self.sockets[sid] = ws + # Store metadata separately + self.sockets_metadata[sid] = {"feature_flags": {}} try: # Send initial state to the new client - 
await self.send("status", { "status": self.get_queue_info(), 'sid': sid }, sid) + await self.send("status", {"status": self.get_queue_info(), "sid": sid}, sid) # On reconnect if we are the currently executing client send the current node if self.client_id == sid and self.last_node_id is not None: await self.send("executing", { "node": self.last_node_id }, sid) + # Flag to track if we've received the first message + first_message = True + async for msg in ws: if msg.type == aiohttp.WSMsgType.ERROR: logging.warning('ws connection closed with exception %s' % ws.exception()) + elif msg.type == aiohttp.WSMsgType.TEXT: + try: + data = json.loads(msg.data) + # Check if first message is feature flags + if first_message and data.get("type") == "feature_flags": + # Store client feature flags + client_flags = data.get("data", {}) + self.sockets_metadata[sid]["feature_flags"] = client_flags + + # Send server feature flags in response + await self.send( + "feature_flags", + feature_flags.get_server_features(), + sid, + ) + + logging.info( + f"Feature flags negotiated for client {sid}: {client_flags}" + ) + first_message = False + except json.JSONDecodeError: + logging.warning( + f"Invalid JSON received from client {sid}: {msg.data}" + ) + except Exception as e: + logging.error(f"Error processing WebSocket message: {e}") finally: self.sockets.pop(sid, None) + self.sockets_metadata.pop(sid, None) return ws @routes.get("/") @@ -548,6 +579,10 @@ class PromptServer(): } return web.json_response(system_stats) + @routes.get("/features") + async def get_features(request): + return web.json_response(feature_flags.get_server_features()) + @routes.get("/prompt") async def get_prompt(request): return web.json_response(self.get_queue_info()) @@ -643,7 +678,8 @@ class PromptServer(): if "prompt" in json_data: prompt = json_data["prompt"] - valid = execution.validate_prompt(prompt) + prompt_id = str(uuid.uuid4()) + valid = await execution.validate_prompt(prompt_id, prompt) extra_data = {} if "extra_data" in json_data: extra_data = json_data["extra_data"] @@ -651,7 +687,6 @@ class PromptServer(): if "client_id" in json_data: extra_data["client_id"] = json_data["client_id"] if valid[0]: - prompt_id = str(uuid.uuid4()) outputs_to_execute = valid[2] self.prompt_queue.put((number, prompt_id, prompt, extra_data, outputs_to_execute)) response = {"prompt_id": prompt_id, "number": number, "node_errors": valid[3]} @@ -766,6 +801,10 @@ class PromptServer(): async def send(self, event, data, sid=None): if event == BinaryEventTypes.UNENCODED_PREVIEW_IMAGE: await self.send_image(data, sid=sid) + elif event == BinaryEventTypes.PREVIEW_IMAGE_WITH_METADATA: + # data is (preview_image, metadata) + preview_image, metadata = data + await self.send_image_with_metadata(preview_image, metadata, sid=sid) elif isinstance(data, (bytes, bytearray)): await self.send_bytes(event, data, sid) else: @@ -804,6 +843,43 @@ class PromptServer(): preview_bytes = bytesIO.getvalue() await self.send_bytes(BinaryEventTypes.PREVIEW_IMAGE, preview_bytes, sid=sid) + async def send_image_with_metadata(self, image_data, metadata=None, sid=None): + image_type = image_data[0] + image = image_data[1] + max_size = image_data[2] + if max_size is not None: + if hasattr(Image, 'Resampling'): + resampling = Image.Resampling.BILINEAR + else: + resampling = Image.Resampling.LANCZOS + + image = ImageOps.contain(image, (max_size, max_size), resampling) + + mimetype = "image/png" if image_type == "PNG" else "image/jpeg" + + # Prepare metadata + if metadata is None: + 
metadata = {} + metadata["image_type"] = mimetype + + # Serialize metadata as JSON + import json + metadata_json = json.dumps(metadata).encode('utf-8') + metadata_length = len(metadata_json) + + # Prepare image data + bytesIO = BytesIO() + image.save(bytesIO, format=image_type, quality=95, compress_level=1) + image_bytes = bytesIO.getvalue() + + # Combine metadata and image + combined_data = bytearray() + combined_data.extend(struct.pack(">I", metadata_length)) + combined_data.extend(metadata_json) + combined_data.extend(image_bytes) + + await self.send_bytes(BinaryEventTypes.PREVIEW_IMAGE_WITH_METADATA, combined_data, sid=sid) + async def send_bytes(self, event, data, sid=None): message = self.encode_bytes(event, data) @@ -845,10 +921,10 @@ class PromptServer(): ssl_ctx = None scheme = "http" if args.tls_keyfile and args.tls_certfile: - ssl_ctx = ssl.SSLContext(protocol=ssl.PROTOCOL_TLS_SERVER, verify_mode=ssl.CERT_NONE) - ssl_ctx.load_cert_chain(certfile=args.tls_certfile, + ssl_ctx = ssl.SSLContext(protocol=ssl.PROTOCOL_TLS_SERVER, verify_mode=ssl.CERT_NONE) + ssl_ctx.load_cert_chain(certfile=args.tls_certfile, keyfile=args.tls_keyfile) - scheme = "https" + scheme = "https" if verbose: logging.info("Starting server\n") diff --git a/tests-unit/feature_flags_test.py b/tests-unit/feature_flags_test.py new file mode 100644 index 000000000..f2702cfc8 --- /dev/null +++ b/tests-unit/feature_flags_test.py @@ -0,0 +1,98 @@ +"""Tests for feature flags functionality.""" + +from comfy_api.feature_flags import ( + get_connection_feature, + supports_feature, + get_server_features, + SERVER_FEATURE_FLAGS, +) + + +class TestFeatureFlags: + """Test suite for feature flags functions.""" + + def test_get_server_features_returns_copy(self): + """Test that get_server_features returns a copy of the server flags.""" + features = get_server_features() + # Verify it's a copy by modifying it + features["test_flag"] = True + # Original should be unchanged + assert "test_flag" not in SERVER_FEATURE_FLAGS + + def test_get_server_features_contains_expected_flags(self): + """Test that server features contain expected flags.""" + features = get_server_features() + assert "supports_preview_metadata" in features + assert features["supports_preview_metadata"] is True + assert "max_upload_size" in features + assert isinstance(features["max_upload_size"], (int, float)) + + def test_get_connection_feature_with_missing_sid(self): + """Test getting feature for non-existent session ID.""" + sockets_metadata = {} + result = get_connection_feature(sockets_metadata, "missing_sid", "some_feature") + assert result is False # Default value + + def test_get_connection_feature_with_custom_default(self): + """Test getting feature with custom default value.""" + sockets_metadata = {} + result = get_connection_feature( + sockets_metadata, "missing_sid", "some_feature", default="custom_default" + ) + assert result == "custom_default" + + def test_get_connection_feature_with_feature_flags(self): + """Test getting feature from connection with feature flags.""" + sockets_metadata = { + "sid1": { + "feature_flags": { + "supports_preview_metadata": True, + "custom_feature": "value", + }, + } + } + result = get_connection_feature(sockets_metadata, "sid1", "supports_preview_metadata") + assert result is True + + result = get_connection_feature(sockets_metadata, "sid1", "custom_feature") + assert result == "value" + + def test_get_connection_feature_missing_feature(self): + """Test getting non-existent feature from connection.""" + 
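
# The comfy_api.feature_flags module itself is added elsewhere in this patch series; the
# sketch below is only a reconstruction consistent with the behaviour these tests pin down
# (copy semantics, a False default, and supports_feature() requiring an explicit True).
# How max_upload_size is populated here is an assumption.
from comfy.cli_args import args

SERVER_FEATURE_FLAGS = {
    "supports_preview_metadata": True,
    "max_upload_size": args.max_upload_size * 1024 * 1024,
}

def get_server_features() -> dict:
    # Return a copy so callers cannot mutate the module-level flags.
    return SERVER_FEATURE_FLAGS.copy()

def get_connection_feature(sockets_metadata: dict, sid: str, feature: str, default=False):
    if sid not in sockets_metadata:
        return default
    return sockets_metadata[sid].get("feature_flags", {}).get(feature, default)

def supports_feature(sockets_metadata: dict, sid: str, feature: str) -> bool:
    # Only an explicit True counts; strings, None and missing flags all read as False.
    return get_connection_feature(sockets_metadata, sid, feature, default=False) is True
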
sockets_metadata = { + "sid1": {"feature_flags": {"existing_feature": True}} + } + result = get_connection_feature(sockets_metadata, "sid1", "missing_feature") + assert result is False + + def test_supports_feature_returns_boolean(self): + """Test that supports_feature always returns boolean.""" + sockets_metadata = { + "sid1": { + "feature_flags": { + "bool_feature": True, + "string_feature": "value", + "none_feature": None, + }, + } + } + + # True boolean feature + assert supports_feature(sockets_metadata, "sid1", "bool_feature") is True + + # Non-boolean values should return False + assert supports_feature(sockets_metadata, "sid1", "string_feature") is False + assert supports_feature(sockets_metadata, "sid1", "none_feature") is False + assert supports_feature(sockets_metadata, "sid1", "missing_feature") is False + + def test_supports_feature_with_missing_connection(self): + """Test supports_feature with missing connection.""" + sockets_metadata = {} + assert supports_feature(sockets_metadata, "missing_sid", "any_feature") is False + + def test_empty_feature_flags_dict(self): + """Test connection with empty feature flags dictionary.""" + sockets_metadata = {"sid1": {"feature_flags": {}}} + result = get_connection_feature(sockets_metadata, "sid1", "any_feature") + assert result is False + assert supports_feature(sockets_metadata, "sid1", "any_feature") is False diff --git a/tests-unit/requirements.txt b/tests-unit/requirements.txt index d70d00f4b..3a6790ee0 100644 --- a/tests-unit/requirements.txt +++ b/tests-unit/requirements.txt @@ -1,3 +1,4 @@ pytest>=7.8.0 pytest-aiohttp pytest-asyncio +websocket-client diff --git a/tests-unit/websocket_feature_flags_test.py b/tests-unit/websocket_feature_flags_test.py new file mode 100644 index 000000000..e93b2e1dd --- /dev/null +++ b/tests-unit/websocket_feature_flags_test.py @@ -0,0 +1,77 @@ +"""Simplified tests for WebSocket feature flags functionality.""" +from comfy_api import feature_flags + + +class TestWebSocketFeatureFlags: + """Test suite for WebSocket feature flags integration.""" + + def test_server_feature_flags_response(self): + """Test server feature flags are properly formatted.""" + features = feature_flags.get_server_features() + + # Check expected server features + assert "supports_preview_metadata" in features + assert features["supports_preview_metadata"] is True + assert "max_upload_size" in features + assert isinstance(features["max_upload_size"], (int, float)) + + def test_progress_py_checks_feature_flags(self): + """Test that progress.py checks feature flags before sending metadata.""" + # This simulates the check in progress.py + client_id = "test_client" + sockets_metadata = {"test_client": {"feature_flags": {}}} + + # The actual check would be in progress.py + supports_metadata = feature_flags.supports_feature( + sockets_metadata, client_id, "supports_preview_metadata" + ) + + assert supports_metadata is False + + def test_multiple_clients_different_features(self): + """Test handling multiple clients with different feature support.""" + sockets_metadata = { + "modern_client": { + "feature_flags": {"supports_preview_metadata": True} + }, + "legacy_client": { + "feature_flags": {} + } + } + + # Check modern client + assert feature_flags.supports_feature( + sockets_metadata, "modern_client", "supports_preview_metadata" + ) is True + + # Check legacy client + assert feature_flags.supports_feature( + sockets_metadata, "legacy_client", "supports_preview_metadata" + ) is False + + def 
test_feature_negotiation_message_format(self): + """Test the format of feature negotiation messages.""" + # Client message format + client_message = { + "type": "feature_flags", + "data": { + "supports_preview_metadata": True, + "api_version": "1.0.0" + } + } + + # Verify structure + assert client_message["type"] == "feature_flags" + assert "supports_preview_metadata" in client_message["data"] + + # Server response format (what would be sent) + server_features = feature_flags.get_server_features() + server_message = { + "type": "feature_flags", + "data": server_features + } + + # Verify structure + assert server_message["type"] == "feature_flags" + assert "supports_preview_metadata" in server_message["data"] + assert server_message["data"]["supports_preview_metadata"] is True diff --git a/tests/inference/extra_model_paths.yaml b/tests/inference/extra_model_paths.yaml index 75b2e1ae4..68e056564 100644 --- a/tests/inference/extra_model_paths.yaml +++ b/tests/inference/extra_model_paths.yaml @@ -1,4 +1,4 @@ # Config for testing nodes testing: - custom_nodes: tests/inference/testing_nodes + custom_nodes: testing_nodes diff --git a/tests/inference/test_async_nodes.py b/tests/inference/test_async_nodes.py new file mode 100644 index 000000000..b243bbca9 --- /dev/null +++ b/tests/inference/test_async_nodes.py @@ -0,0 +1,410 @@ +import pytest +import time +import torch +import urllib.error +import numpy as np +import subprocess + +from pytest import fixture +from comfy_execution.graph_utils import GraphBuilder +from tests.inference.test_execution import ComfyClient + + +@pytest.mark.execution +class TestAsyncNodes: + @fixture(scope="class", autouse=True, params=[ + (False, 0), + (True, 0), + (True, 100), + ]) + def _server(self, args_pytest, request): + pargs = [ + 'python','main.py', + '--output-directory', args_pytest["output_dir"], + '--listen', args_pytest["listen"], + '--port', str(args_pytest["port"]), + '--extra-model-paths-config', 'tests/inference/extra_model_paths.yaml', + ] + use_lru, lru_size = request.param + if use_lru: + pargs += ['--cache-lru', str(lru_size)] + # Running server with args: pargs + p = subprocess.Popen(pargs) + yield + p.kill() + torch.cuda.empty_cache() + + @fixture(scope="class", autouse=True) + def shared_client(self, args_pytest, _server): + client = ComfyClient() + n_tries = 5 + for i in range(n_tries): + time.sleep(4) + try: + client.connect(listen=args_pytest["listen"], port=args_pytest["port"]) + except ConnectionRefusedError: + # Retrying... 
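
# A hedged sketch (not part of the patch) of what a feature-flag-aware client could do,
# using the websocket-client package added to tests-unit/requirements.txt above. The
# ws://.../ws endpoint, the clientId query parameter and the 4-byte big-endian event id
# prefixed to binary frames are assumptions about existing server code not shown in these
# hunks; the handshake message and the PREVIEW_IMAGE_WITH_METADATA payload layout come
# straight from the server changes above.
import json
import struct
import uuid
import websocket  # websocket-client

def connect_and_negotiate(host="127.0.0.1", port=8188):
    client_id = uuid.uuid4().hex
    ws = websocket.create_connection(f"ws://{host}:{port}/ws?clientId={client_id}")
    # The first message from the client must be its feature flags.
    ws.send(json.dumps({"type": "feature_flags",
                        "data": {"supports_preview_metadata": True}}))
    while True:
        msg = ws.recv()
        # The server may send "status" first; wait for the feature_flags reply.
        if isinstance(msg, str):
            parsed = json.loads(msg)
            if parsed.get("type") == "feature_flags":
                return ws, client_id, parsed["data"]

def parse_preview_with_metadata(frame: bytes):
    # frame = 4-byte event id (assumed prefix) + 4-byte metadata length + metadata JSON + image bytes.
    event = struct.unpack(">I", frame[:4])[0]      # expect 4 = PREVIEW_IMAGE_WITH_METADATA
    meta_len = struct.unpack(">I", frame[4:8])[0]
    metadata = json.loads(frame[8:8 + meta_len])
    image_bytes = frame[8 + meta_len:]
    return event, metadata, image_bytes
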
+ pass + else: + break + yield client + del client + torch.cuda.empty_cache() + + @fixture + def client(self, shared_client, request): + shared_client.set_test_name(f"async_nodes[{request.node.name}]") + yield shared_client + + @fixture + def builder(self, request): + yield GraphBuilder(prefix=request.node.name) + + # Happy Path Tests + + def test_basic_async_execution(self, client: ComfyClient, builder: GraphBuilder): + """Test that a basic async node executes correctly.""" + g = builder + image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + sleep_node = g.node("TestSleep", value=image.out(0), seconds=0.1) + output = g.node("SaveImage", images=sleep_node.out(0)) + + result = client.run(g) + + # Verify execution completed + assert result.did_run(sleep_node), "Async sleep node should have executed" + assert result.did_run(output), "Output node should have executed" + + # Verify the image passed through correctly + result_images = result.get_images(output) + assert len(result_images) == 1, "Should have 1 image" + assert np.array(result_images[0]).min() == 0 and np.array(result_images[0]).max() == 0, "Image should be black" + + def test_multiple_async_parallel_execution(self, client: ComfyClient, builder: GraphBuilder): + """Test that multiple async nodes execute in parallel.""" + g = builder + image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + + # Create multiple async sleep nodes with different durations + sleep1 = g.node("TestSleep", value=image.out(0), seconds=0.3) + sleep2 = g.node("TestSleep", value=image.out(0), seconds=0.4) + sleep3 = g.node("TestSleep", value=image.out(0), seconds=0.5) + + # Add outputs for each + _output1 = g.node("PreviewImage", images=sleep1.out(0)) + _output2 = g.node("PreviewImage", images=sleep2.out(0)) + _output3 = g.node("PreviewImage", images=sleep3.out(0)) + + start_time = time.time() + result = client.run(g) + elapsed_time = time.time() - start_time + + # Should take ~0.5s (max duration) not 1.2s (sum of durations) + assert elapsed_time < 0.8, f"Parallel execution took {elapsed_time}s, expected < 0.8s" + + # Verify all nodes executed + assert result.did_run(sleep1) and result.did_run(sleep2) and result.did_run(sleep3) + + def test_async_with_dependencies(self, client: ComfyClient, builder: GraphBuilder): + """Test async nodes with proper dependency handling.""" + g = builder + image1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + image2 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1) + + # Chain of async operations + sleep1 = g.node("TestSleep", value=image1.out(0), seconds=0.2) + sleep2 = g.node("TestSleep", value=image2.out(0), seconds=0.2) + + # Average depends on both async results + average = g.node("TestVariadicAverage", input1=sleep1.out(0), input2=sleep2.out(0)) + output = g.node("SaveImage", images=average.out(0)) + + result = client.run(g) + + # Verify execution order + assert result.did_run(sleep1) and result.did_run(sleep2) + assert result.did_run(average) and result.did_run(output) + + # Verify averaged result + result_images = result.get_images(output) + avg_value = np.array(result_images[0]).mean() + assert abs(avg_value - 127.5) < 1, f"Average value {avg_value} should be ~127.5" + + def test_async_validate_inputs(self, client: ComfyClient, builder: GraphBuilder): + """Test async VALIDATE_INPUTS function.""" + g = builder + # Create a test node with async validation + validation_node = g.node("TestAsyncValidation", 
value=5.0, threshold=10.0) + g.node("SaveImage", images=validation_node.out(0)) + + # Should pass validation + result = client.run(g) + assert result.did_run(validation_node) + + # Test validation failure + validation_node.inputs['threshold'] = 3.0 # Will fail since value > threshold + with pytest.raises(urllib.error.HTTPError): + client.run(g) + + def test_async_lazy_evaluation(self, client: ComfyClient, builder: GraphBuilder): + """Test async nodes with lazy evaluation.""" + g = builder + input1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + input2 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1) + mask = g.node("StubMask", value=0.0, height=512, width=512, batch_size=1) + + # Create async nodes that will be evaluated lazily + sleep1 = g.node("TestSleep", value=input1.out(0), seconds=0.3) + sleep2 = g.node("TestSleep", value=input2.out(0), seconds=0.3) + + # Use lazy mix that only needs sleep1 (mask=0.0) + lazy_mix = g.node("TestLazyMixImages", image1=sleep1.out(0), image2=sleep2.out(0), mask=mask.out(0)) + g.node("SaveImage", images=lazy_mix.out(0)) + + start_time = time.time() + result = client.run(g) + elapsed_time = time.time() - start_time + + # Should only execute sleep1, not sleep2 + assert elapsed_time < 0.5, f"Should skip sleep2, took {elapsed_time}s" + assert result.did_run(sleep1), "Sleep1 should have executed" + assert not result.did_run(sleep2), "Sleep2 should have been skipped" + + def test_async_check_lazy_status(self, client: ComfyClient, builder: GraphBuilder): + """Test async check_lazy_status function.""" + g = builder + # Create a node with async check_lazy_status + lazy_node = g.node("TestAsyncLazyCheck", + input1="value1", + input2="value2", + condition=True) + g.node("SaveImage", images=lazy_node.out(0)) + + result = client.run(g) + assert result.did_run(lazy_node) + + # Error Handling Tests + + def test_async_execution_error(self, client: ComfyClient, builder: GraphBuilder): + """Test that async execution errors are properly handled.""" + g = builder + image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + # Create an async node that will error + error_node = g.node("TestAsyncError", value=image.out(0), error_after=0.1) + g.node("SaveImage", images=error_node.out(0)) + + try: + client.run(g) + assert False, "Should have raised an error" + except Exception as e: + assert 'prompt_id' in e.args[0], f"Did not get proper error message: {e}" + assert e.args[0]['node_id'] == error_node.id, "Error should be from async error node" + + def test_async_validation_error(self, client: ComfyClient, builder: GraphBuilder): + """Test async validation error handling.""" + g = builder + # Node with async validation that will fail + validation_node = g.node("TestAsyncValidationError", value=15.0, max_value=10.0) + g.node("SaveImage", images=validation_node.out(0)) + + with pytest.raises(urllib.error.HTTPError) as exc_info: + client.run(g) + # Verify it's a validation error + assert exc_info.value.code == 400 + + def test_async_timeout_handling(self, client: ComfyClient, builder: GraphBuilder): + """Test handling of async operations that timeout.""" + g = builder + image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + # Very long sleep that would timeout + timeout_node = g.node("TestAsyncTimeout", value=image.out(0), timeout=0.5, operation_time=2.0) + g.node("SaveImage", images=timeout_node.out(0)) + + try: + client.run(g) + assert False, "Should have raised a timeout 
error" + except Exception as e: + assert 'timeout' in str(e).lower(), f"Expected timeout error, got: {e}" + + def test_concurrent_async_error_recovery(self, client: ComfyClient, builder: GraphBuilder): + """Test that workflow can recover after async errors.""" + g = builder + image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + + # First run with error + error_node = g.node("TestAsyncError", value=image.out(0), error_after=0.1) + g.node("SaveImage", images=error_node.out(0)) + + try: + client.run(g) + except Exception: + pass # Expected + + # Second run should succeed + g2 = GraphBuilder(prefix="recovery_test") + image2 = g2.node("StubImage", content="WHITE", height=512, width=512, batch_size=1) + sleep_node = g2.node("TestSleep", value=image2.out(0), seconds=0.1) + g2.node("SaveImage", images=sleep_node.out(0)) + + result = client.run(g2) + assert result.did_run(sleep_node), "Should be able to run after error" + + def test_sync_error_during_async_execution(self, client: ComfyClient, builder: GraphBuilder): + """Test handling when sync node errors while async node is executing.""" + g = builder + image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + + # Async node that takes time + sleep_node = g.node("TestSleep", value=image.out(0), seconds=0.5) + + # Sync node that will error immediately + error_node = g.node("TestSyncError", value=image.out(0)) + + # Both feed into output + g.node("PreviewImage", images=sleep_node.out(0)) + g.node("PreviewImage", images=error_node.out(0)) + + try: + client.run(g) + assert False, "Should have raised an error" + except Exception as e: + # Verify the sync error was caught even though async was running + assert 'prompt_id' in e.args[0] + + # Edge Cases + + def test_async_with_execution_blocker(self, client: ComfyClient, builder: GraphBuilder): + """Test async nodes with execution blockers.""" + g = builder + image1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + image2 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1) + + # Async sleep nodes + sleep1 = g.node("TestSleep", value=image1.out(0), seconds=0.2) + sleep2 = g.node("TestSleep", value=image2.out(0), seconds=0.2) + + # Create list of images + image_list = g.node("TestMakeListNode", value1=sleep1.out(0), value2=sleep2.out(0)) + + # Create list of blocking conditions - [False, True] to block only the second item + int1 = g.node("StubInt", value=1) + int2 = g.node("StubInt", value=2) + block_list = g.node("TestMakeListNode", value1=int1.out(0), value2=int2.out(0)) + + # Compare each value against 2, so first is False (1 != 2) and second is True (2 == 2) + compare = g.node("TestIntConditions", a=block_list.out(0), b=2, operation="==") + + # Block based on the comparison results + blocker = g.node("TestExecutionBlocker", input=image_list.out(0), block=compare.out(0), verbose=False) + + output = g.node("PreviewImage", images=blocker.out(0)) + + result = client.run(g) + images = result.get_images(output) + assert len(images) == 1, "Should have blocked second image" + + def test_async_caching_behavior(self, client: ComfyClient, builder: GraphBuilder): + """Test that async nodes are properly cached.""" + g = builder + image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + sleep_node = g.node("TestSleep", value=image.out(0), seconds=0.2) + g.node("SaveImage", images=sleep_node.out(0)) + + # First run + result1 = client.run(g) + assert result1.did_run(sleep_node), 
"Should run first time" + + # Second run - should be cached + start_time = time.time() + result2 = client.run(g) + elapsed_time = time.time() - start_time + + assert not result2.did_run(sleep_node), "Should be cached" + assert elapsed_time < 0.1, f"Cached run took {elapsed_time}s, should be instant" + + def test_async_with_dynamic_prompts(self, client: ComfyClient, builder: GraphBuilder): + """Test async nodes within dynamically generated prompts.""" + g = builder + image1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + image2 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1) + + # Node that generates async nodes dynamically + dynamic_async = g.node("TestDynamicAsyncGeneration", + image1=image1.out(0), + image2=image2.out(0), + num_async_nodes=3, + sleep_duration=0.2) + g.node("SaveImage", images=dynamic_async.out(0)) + + start_time = time.time() + result = client.run(g) + elapsed_time = time.time() - start_time + + # Should execute async nodes in parallel within dynamic prompt + assert elapsed_time < 0.5, f"Dynamic async execution took {elapsed_time}s" + assert result.did_run(dynamic_async) + + def test_async_resource_cleanup(self, client: ComfyClient, builder: GraphBuilder): + """Test that async resources are properly cleaned up.""" + g = builder + image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + + # Create multiple async nodes that use resources + resource_nodes = [] + for i in range(5): + node = g.node("TestAsyncResourceUser", + value=image.out(0), + resource_id=f"resource_{i}", + duration=0.1) + resource_nodes.append(node) + g.node("PreviewImage", images=node.out(0)) + + result = client.run(g) + + # Verify all nodes executed + for node in resource_nodes: + assert result.did_run(node) + + # Run again to ensure resources were cleaned up + result2 = client.run(g) + # Should be cached but not error due to resource conflicts + for node in resource_nodes: + assert not result2.did_run(node), "Should be cached" + + def test_async_cancellation(self, client: ComfyClient, builder: GraphBuilder): + """Test cancellation of async operations.""" + # This would require implementing cancellation in the client + # For now, we'll test that long-running async operations can be interrupted + pass # TODO: Implement when cancellation API is available + + def test_mixed_sync_async_execution(self, client: ComfyClient, builder: GraphBuilder): + """Test workflows with both sync and async nodes.""" + g = builder + image1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + image2 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1) + mask = g.node("StubMask", value=0.5, height=512, width=512, batch_size=1) + + # Mix of sync and async operations + # Sync: lazy mix images + sync_op1 = g.node("TestLazyMixImages", image1=image1.out(0), image2=image2.out(0), mask=mask.out(0)) + # Async: sleep + async_op1 = g.node("TestSleep", value=sync_op1.out(0), seconds=0.2) + # Sync: custom validation + sync_op2 = g.node("TestCustomValidation1", input1=async_op1.out(0), input2=0.5) + # Async: sleep again + async_op2 = g.node("TestSleep", value=sync_op2.out(0), seconds=0.2) + + output = g.node("SaveImage", images=async_op2.out(0)) + + result = client.run(g) + + # Verify all nodes executed in correct order + assert result.did_run(sync_op1) + assert result.did_run(async_op1) + assert result.did_run(sync_op2) + assert result.did_run(async_op2) + + # Image should be a mix of black and white (gray) 
+ result_images = result.get_images(output) + avg_value = np.array(result_images[0]).mean() + assert abs(avg_value - 63.75) < 5, f"Average value {avg_value} should be ~63.75" diff --git a/tests/inference/test_execution.py b/tests/inference/test_execution.py index 5cda5c1ae..9d3d685cc 100644 --- a/tests/inference/test_execution.py +++ b/tests/inference/test_execution.py @@ -252,7 +252,7 @@ class TestExecution: @pytest.mark.parametrize("test_type, test_value", [ ("StubInt", 5), - ("StubFloat", 5.0) + ("StubMask", 5.0) ]) def test_validation_error_edge1(self, test_type, test_value, client: ComfyClient, builder: GraphBuilder): g = builder @@ -497,6 +497,69 @@ class TestExecution: assert numpy.array(images[0]).min() == 63 and numpy.array(images[0]).max() == 63, "Image should have value 0.25" assert not result.did_run(test_node), "The execution should have been cached" + def test_parallel_sleep_nodes(self, client: ComfyClient, builder: GraphBuilder): + g = builder + image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + + # Create sleep nodes for each duration + sleep_node1 = g.node("TestSleep", value=image.out(0), seconds=2.8) + sleep_node2 = g.node("TestSleep", value=image.out(0), seconds=2.9) + sleep_node3 = g.node("TestSleep", value=image.out(0), seconds=3.0) + + # Add outputs to verify the execution + _output1 = g.node("PreviewImage", images=sleep_node1.out(0)) + _output2 = g.node("PreviewImage", images=sleep_node2.out(0)) + _output3 = g.node("PreviewImage", images=sleep_node3.out(0)) + + start_time = time.time() + result = client.run(g) + elapsed_time = time.time() - start_time + + # The test should take around 0.4 seconds (the longest sleep duration) + # plus some overhead, but definitely less than the sum of all sleeps (0.9s) + # We'll allow for up to 0.8s total to account for overhead + assert elapsed_time < 4.0, f"Parallel execution took {elapsed_time}s, expected less than 0.8s" + + # Verify that all nodes executed + assert result.did_run(sleep_node1), "Sleep node 1 should have run" + assert result.did_run(sleep_node2), "Sleep node 2 should have run" + assert result.did_run(sleep_node3), "Sleep node 3 should have run" + + def test_parallel_sleep_expansion(self, client: ComfyClient, builder: GraphBuilder): + g = builder + # Create input images with different values + image1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + image2 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1) + image3 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1) + + # Create a TestParallelSleep node that expands into multiple TestSleep nodes + parallel_sleep = g.node("TestParallelSleep", + image1=image1.out(0), + image2=image2.out(0), + image3=image3.out(0), + sleep1=0.4, + sleep2=0.5, + sleep3=0.6) + output = g.node("SaveImage", images=parallel_sleep.out(0)) + + start_time = time.time() + result = client.run(g) + elapsed_time = time.time() - start_time + + # Similar to the previous test, expect parallel execution of the sleep nodes + # which should complete in less than the sum of all sleeps + assert elapsed_time < 0.8, f"Expansion execution took {elapsed_time}s, expected less than 0.8s" + + # Verify the parallel sleep node executed + assert result.did_run(parallel_sleep), "ParallelSleep node should have run" + + # Verify we get an image as output (blend of the three input images) + result_images = result.get_images(output) + assert len(result_images) == 1, "Should have 1 image" + # Average pixel value 
should be around 170 (255 * 2 // 3) + avg_value = numpy.array(result_images[0]).mean() + assert avg_value == 170, f"Image average value {avg_value} should be 170" + # This tests that nodes with OUTPUT_IS_LIST function correctly when they receive an ExecutionBlocker # as input. We also test that when that list (containing an ExecutionBlocker) is passed to a node, # only that one entry in the list is blocked. diff --git a/tests/inference/testing_nodes/testing-pack/__init__.py b/tests/inference/testing_nodes/testing-pack/__init__.py index dcc71659a..20f9533c7 100644 --- a/tests/inference/testing_nodes/testing-pack/__init__.py +++ b/tests/inference/testing_nodes/testing-pack/__init__.py @@ -1,23 +1,26 @@ -from .specific_tests import TEST_NODE_CLASS_MAPPINGS, TEST_NODE_DISPLAY_NAME_MAPPINGS -from .flow_control import FLOW_CONTROL_NODE_CLASS_MAPPINGS, FLOW_CONTROL_NODE_DISPLAY_NAME_MAPPINGS -from .util import UTILITY_NODE_CLASS_MAPPINGS, UTILITY_NODE_DISPLAY_NAME_MAPPINGS -from .conditions import CONDITION_NODE_CLASS_MAPPINGS, CONDITION_NODE_DISPLAY_NAME_MAPPINGS -from .stubs import TEST_STUB_NODE_CLASS_MAPPINGS, TEST_STUB_NODE_DISPLAY_NAME_MAPPINGS - -# NODE_CLASS_MAPPINGS = GENERAL_NODE_CLASS_MAPPINGS.update(COMPONENT_NODE_CLASS_MAPPINGS) -# NODE_DISPLAY_NAME_MAPPINGS = GENERAL_NODE_DISPLAY_NAME_MAPPINGS.update(COMPONENT_NODE_DISPLAY_NAME_MAPPINGS) - -NODE_CLASS_MAPPINGS = {} -NODE_CLASS_MAPPINGS.update(TEST_NODE_CLASS_MAPPINGS) -NODE_CLASS_MAPPINGS.update(FLOW_CONTROL_NODE_CLASS_MAPPINGS) -NODE_CLASS_MAPPINGS.update(UTILITY_NODE_CLASS_MAPPINGS) -NODE_CLASS_MAPPINGS.update(CONDITION_NODE_CLASS_MAPPINGS) -NODE_CLASS_MAPPINGS.update(TEST_STUB_NODE_CLASS_MAPPINGS) - -NODE_DISPLAY_NAME_MAPPINGS = {} -NODE_DISPLAY_NAME_MAPPINGS.update(TEST_NODE_DISPLAY_NAME_MAPPINGS) -NODE_DISPLAY_NAME_MAPPINGS.update(FLOW_CONTROL_NODE_DISPLAY_NAME_MAPPINGS) -NODE_DISPLAY_NAME_MAPPINGS.update(UTILITY_NODE_DISPLAY_NAME_MAPPINGS) -NODE_DISPLAY_NAME_MAPPINGS.update(CONDITION_NODE_DISPLAY_NAME_MAPPINGS) -NODE_DISPLAY_NAME_MAPPINGS.update(TEST_STUB_NODE_DISPLAY_NAME_MAPPINGS) - +from .specific_tests import TEST_NODE_CLASS_MAPPINGS, TEST_NODE_DISPLAY_NAME_MAPPINGS +from .flow_control import FLOW_CONTROL_NODE_CLASS_MAPPINGS, FLOW_CONTROL_NODE_DISPLAY_NAME_MAPPINGS +from .util import UTILITY_NODE_CLASS_MAPPINGS, UTILITY_NODE_DISPLAY_NAME_MAPPINGS +from .conditions import CONDITION_NODE_CLASS_MAPPINGS, CONDITION_NODE_DISPLAY_NAME_MAPPINGS +from .stubs import TEST_STUB_NODE_CLASS_MAPPINGS, TEST_STUB_NODE_DISPLAY_NAME_MAPPINGS +from .async_test_nodes import ASYNC_TEST_NODE_CLASS_MAPPINGS, ASYNC_TEST_NODE_DISPLAY_NAME_MAPPINGS + +# NODE_CLASS_MAPPINGS = GENERAL_NODE_CLASS_MAPPINGS.update(COMPONENT_NODE_CLASS_MAPPINGS) +# NODE_DISPLAY_NAME_MAPPINGS = GENERAL_NODE_DISPLAY_NAME_MAPPINGS.update(COMPONENT_NODE_DISPLAY_NAME_MAPPINGS) + +NODE_CLASS_MAPPINGS = {} +NODE_CLASS_MAPPINGS.update(TEST_NODE_CLASS_MAPPINGS) +NODE_CLASS_MAPPINGS.update(FLOW_CONTROL_NODE_CLASS_MAPPINGS) +NODE_CLASS_MAPPINGS.update(UTILITY_NODE_CLASS_MAPPINGS) +NODE_CLASS_MAPPINGS.update(CONDITION_NODE_CLASS_MAPPINGS) +NODE_CLASS_MAPPINGS.update(TEST_STUB_NODE_CLASS_MAPPINGS) +NODE_CLASS_MAPPINGS.update(ASYNC_TEST_NODE_CLASS_MAPPINGS) + +NODE_DISPLAY_NAME_MAPPINGS = {} +NODE_DISPLAY_NAME_MAPPINGS.update(TEST_NODE_DISPLAY_NAME_MAPPINGS) +NODE_DISPLAY_NAME_MAPPINGS.update(FLOW_CONTROL_NODE_DISPLAY_NAME_MAPPINGS) +NODE_DISPLAY_NAME_MAPPINGS.update(UTILITY_NODE_DISPLAY_NAME_MAPPINGS) +NODE_DISPLAY_NAME_MAPPINGS.update(CONDITION_NODE_DISPLAY_NAME_MAPPINGS) 
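
# Condensed, illustrative sketch (not one of the registered test nodes) of the three hooks
# a node can now implement as coroutines, all awaited by the executor through
# _async_map_node_over_list: the FUNCTION itself, VALIDATE_INPUTS and check_lazy_status.
import asyncio
from comfy.comfy_types.node_typing import ComfyNodeABC
from comfy.comfy_types import IO

class ExampleAsyncNode(ComfyNodeABC):
    @classmethod
    def INPUT_TYPES(cls):
        return {"required": {
            "value": (IO.ANY, {"lazy": True}),
            "seconds": ("FLOAT", {"default": 0.1, "min": 0.0, "max": 60.0}),
        }}

    RETURN_TYPES = (IO.ANY,)
    FUNCTION = "run"
    CATEGORY = "_for_testing/async"

    @classmethod
    async def VALIDATE_INPUTS(cls, seconds):
        await asyncio.sleep(0)  # an async lookup could happen here
        return True if seconds >= 0 else "seconds must be non-negative"

    async def check_lazy_status(self, value, seconds):
        await asyncio.sleep(0)
        return ["value"] if value is None else []

    async def run(self, value, seconds):
        await asyncio.sleep(seconds)  # runs concurrently with other async nodes
        return (value,)
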
+NODE_DISPLAY_NAME_MAPPINGS.update(TEST_STUB_NODE_DISPLAY_NAME_MAPPINGS) +NODE_DISPLAY_NAME_MAPPINGS.update(ASYNC_TEST_NODE_DISPLAY_NAME_MAPPINGS) + diff --git a/tests/inference/testing_nodes/testing-pack/async_test_nodes.py b/tests/inference/testing_nodes/testing-pack/async_test_nodes.py new file mode 100644 index 000000000..547eea6f4 --- /dev/null +++ b/tests/inference/testing_nodes/testing-pack/async_test_nodes.py @@ -0,0 +1,343 @@ +import torch +import asyncio +from typing import Dict +from comfy.utils import ProgressBar +from comfy_execution.graph_utils import GraphBuilder +from comfy.comfy_types.node_typing import ComfyNodeABC +from comfy.comfy_types import IO + + +class TestAsyncValidation(ComfyNodeABC): + """Test node with async VALIDATE_INPUTS.""" + + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "value": ("FLOAT", {"default": 5.0}), + "threshold": ("FLOAT", {"default": 10.0}), + }, + } + + RETURN_TYPES = ("IMAGE",) + FUNCTION = "process" + CATEGORY = "_for_testing/async" + + @classmethod + async def VALIDATE_INPUTS(cls, value, threshold): + # Simulate async validation (e.g., checking remote service) + await asyncio.sleep(0.05) + + if value > threshold: + return f"Value {value} exceeds threshold {threshold}" + return True + + def process(self, value, threshold): + # Create image based on value + intensity = value / 10.0 + image = torch.ones([1, 512, 512, 3]) * intensity + return (image,) + + +class TestAsyncError(ComfyNodeABC): + """Test node that errors during async execution.""" + + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "value": (IO.ANY, {}), + "error_after": ("FLOAT", {"default": 0.1, "min": 0.0, "max": 10.0}), + }, + } + + RETURN_TYPES = (IO.ANY,) + FUNCTION = "error_execution" + CATEGORY = "_for_testing/async" + + async def error_execution(self, value, error_after): + await asyncio.sleep(error_after) + raise RuntimeError("Intentional async execution error for testing") + + +class TestAsyncValidationError(ComfyNodeABC): + """Test node with async validation that always fails.""" + + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "value": ("FLOAT", {"default": 5.0}), + "max_value": ("FLOAT", {"default": 10.0}), + }, + } + + RETURN_TYPES = ("IMAGE",) + FUNCTION = "process" + CATEGORY = "_for_testing/async" + + @classmethod + async def VALIDATE_INPUTS(cls, value, max_value): + await asyncio.sleep(0.05) + # Always fail validation for values > max_value + if value > max_value: + return f"Async validation failed: {value} > {max_value}" + return True + + def process(self, value, max_value): + # This won't be reached if validation fails + image = torch.ones([1, 512, 512, 3]) * (value / max_value) + return (image,) + + +class TestAsyncTimeout(ComfyNodeABC): + """Test node that simulates timeout scenarios.""" + + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "value": (IO.ANY, {}), + "timeout": ("FLOAT", {"default": 1.0, "min": 0.1, "max": 10.0}), + "operation_time": ("FLOAT", {"default": 2.0, "min": 0.1, "max": 10.0}), + }, + } + + RETURN_TYPES = (IO.ANY,) + FUNCTION = "timeout_execution" + CATEGORY = "_for_testing/async" + + async def timeout_execution(self, value, timeout, operation_time): + try: + # This will timeout if operation_time > timeout + await asyncio.wait_for(asyncio.sleep(operation_time), timeout=timeout) + return (value,) + except asyncio.TimeoutError: + raise RuntimeError(f"Operation timed out after {timeout} seconds") + + +class TestSyncError(ComfyNodeABC): + """Test node that errors 
synchronously (for mixed sync/async testing).""" + + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "value": (IO.ANY, {}), + }, + } + + RETURN_TYPES = (IO.ANY,) + FUNCTION = "sync_error" + CATEGORY = "_for_testing/async" + + def sync_error(self, value): + raise RuntimeError("Intentional sync execution error for testing") + + +class TestAsyncLazyCheck(ComfyNodeABC): + """Test node with async check_lazy_status.""" + + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "input1": (IO.ANY, {"lazy": True}), + "input2": (IO.ANY, {"lazy": True}), + "condition": ("BOOLEAN", {"default": True}), + }, + } + + RETURN_TYPES = ("IMAGE",) + FUNCTION = "process" + CATEGORY = "_for_testing/async" + + async def check_lazy_status(self, condition, input1, input2): + # Simulate async checking (e.g., querying remote service) + await asyncio.sleep(0.05) + + needed = [] + if condition and input1 is None: + needed.append("input1") + if not condition and input2 is None: + needed.append("input2") + return needed + + def process(self, input1, input2, condition): + # Return a simple image + return (torch.ones([1, 512, 512, 3]),) + + +class TestDynamicAsyncGeneration(ComfyNodeABC): + """Test node that dynamically generates async nodes.""" + + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "image1": ("IMAGE",), + "image2": ("IMAGE",), + "num_async_nodes": ("INT", {"default": 3, "min": 1, "max": 10}), + "sleep_duration": ("FLOAT", {"default": 0.2, "min": 0.1, "max": 1.0}), + }, + } + + RETURN_TYPES = ("IMAGE",) + FUNCTION = "generate_async_workflow" + CATEGORY = "_for_testing/async" + + def generate_async_workflow(self, image1, image2, num_async_nodes, sleep_duration): + g = GraphBuilder() + + # Create multiple async sleep nodes + sleep_nodes = [] + for i in range(num_async_nodes): + image = image1 if i % 2 == 0 else image2 + sleep_node = g.node("TestSleep", value=image, seconds=sleep_duration) + sleep_nodes.append(sleep_node) + + # Average all results + if len(sleep_nodes) == 1: + final_node = sleep_nodes[0] + else: + avg_inputs = {"input1": sleep_nodes[0].out(0)} + for i, node in enumerate(sleep_nodes[1:], 2): + avg_inputs[f"input{i}"] = node.out(0) + final_node = g.node("TestVariadicAverage", **avg_inputs) + + return { + "result": (final_node.out(0),), + "expand": g.finalize(), + } + + +class TestAsyncResourceUser(ComfyNodeABC): + """Test node that uses resources during async execution.""" + + # Class-level resource tracking for testing + _active_resources: Dict[str, bool] = {} + + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "value": (IO.ANY, {}), + "resource_id": ("STRING", {"default": "resource_0"}), + "duration": ("FLOAT", {"default": 0.1, "min": 0.0, "max": 1.0}), + }, + } + + RETURN_TYPES = (IO.ANY,) + FUNCTION = "use_resource" + CATEGORY = "_for_testing/async" + + async def use_resource(self, value, resource_id, duration): + # Check if resource is already in use + if self._active_resources.get(resource_id, False): + raise RuntimeError(f"Resource {resource_id} is already in use!") + + # Mark resource as in use + self._active_resources[resource_id] = True + + try: + # Simulate resource usage + await asyncio.sleep(duration) + return (value,) + finally: + # Always clean up resource + self._active_resources[resource_id] = False + + +class TestAsyncBatchProcessing(ComfyNodeABC): + """Test async processing of batched inputs.""" + + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "images": ("IMAGE",), + "process_time_per_item": 
("FLOAT", {"default": 0.1, "min": 0.01, "max": 1.0}), + }, + "hidden": { + "unique_id": "UNIQUE_ID", + }, + } + + RETURN_TYPES = ("IMAGE",) + FUNCTION = "process_batch" + CATEGORY = "_for_testing/async" + + async def process_batch(self, images, process_time_per_item, unique_id): + batch_size = images.shape[0] + pbar = ProgressBar(batch_size, node_id=unique_id) + + # Process each image in the batch + processed = [] + for i in range(batch_size): + # Simulate async processing + await asyncio.sleep(process_time_per_item) + + # Simple processing: invert the image + processed_image = 1.0 - images[i:i+1] + processed.append(processed_image) + + pbar.update(1) + + # Stack processed images + result = torch.cat(processed, dim=0) + return (result,) + + +class TestAsyncConcurrentLimit(ComfyNodeABC): + """Test concurrent execution limits for async nodes.""" + + _semaphore = asyncio.Semaphore(2) # Only allow 2 concurrent executions + + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "value": (IO.ANY, {}), + "duration": ("FLOAT", {"default": 0.5, "min": 0.1, "max": 2.0}), + "node_id": ("INT", {"default": 0}), + }, + } + + RETURN_TYPES = (IO.ANY,) + FUNCTION = "limited_execution" + CATEGORY = "_for_testing/async" + + async def limited_execution(self, value, duration, node_id): + async with self._semaphore: + # Node {node_id} acquired semaphore + await asyncio.sleep(duration) + # Node {node_id} releasing semaphore + return (value,) + + +# Add node mappings +ASYNC_TEST_NODE_CLASS_MAPPINGS = { + "TestAsyncValidation": TestAsyncValidation, + "TestAsyncError": TestAsyncError, + "TestAsyncValidationError": TestAsyncValidationError, + "TestAsyncTimeout": TestAsyncTimeout, + "TestSyncError": TestSyncError, + "TestAsyncLazyCheck": TestAsyncLazyCheck, + "TestDynamicAsyncGeneration": TestDynamicAsyncGeneration, + "TestAsyncResourceUser": TestAsyncResourceUser, + "TestAsyncBatchProcessing": TestAsyncBatchProcessing, + "TestAsyncConcurrentLimit": TestAsyncConcurrentLimit, +} + +ASYNC_TEST_NODE_DISPLAY_NAME_MAPPINGS = { + "TestAsyncValidation": "Test Async Validation", + "TestAsyncError": "Test Async Error", + "TestAsyncValidationError": "Test Async Validation Error", + "TestAsyncTimeout": "Test Async Timeout", + "TestSyncError": "Test Sync Error", + "TestAsyncLazyCheck": "Test Async Lazy Check", + "TestDynamicAsyncGeneration": "Test Dynamic Async Generation", + "TestAsyncResourceUser": "Test Async Resource User", + "TestAsyncBatchProcessing": "Test Async Batch Processing", + "TestAsyncConcurrentLimit": "Test Async Concurrent Limit", +} diff --git a/tests/inference/testing_nodes/testing-pack/specific_tests.py b/tests/inference/testing_nodes/testing-pack/specific_tests.py index 9d05ab14f..657d49f2f 100644 --- a/tests/inference/testing_nodes/testing-pack/specific_tests.py +++ b/tests/inference/testing_nodes/testing-pack/specific_tests.py @@ -1,6 +1,11 @@ import torch +import time +import asyncio +from comfy.utils import ProgressBar from .tools import VariantSupport from comfy_execution.graph_utils import GraphBuilder +from comfy.comfy_types.node_typing import ComfyNodeABC +from comfy.comfy_types import IO class TestLazyMixImages: @classmethod @@ -333,6 +338,131 @@ class TestMixedExpansionReturns: "expand": g.finalize(), } +class TestSamplingInExpansion: + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "model": ("MODEL",), + "clip": ("CLIP",), + "vae": ("VAE",), + "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}), + "steps": ("INT", {"default": 20, "min": 1, "max": 
100}), + "cfg": ("FLOAT", {"default": 7.0, "min": 0.0, "max": 30.0}), + "prompt": ("STRING", {"multiline": True, "default": "a beautiful landscape with mountains and trees"}), + "negative_prompt": ("STRING", {"multiline": True, "default": "blurry, bad quality, worst quality"}), + }, + } + + RETURN_TYPES = ("IMAGE",) + FUNCTION = "sampling_in_expansion" + + CATEGORY = "Testing/Nodes" + + def sampling_in_expansion(self, model, clip, vae, seed, steps, cfg, prompt, negative_prompt): + g = GraphBuilder() + + # Create a basic image generation workflow using the input model, clip and vae + # 1. Setup text prompts using the provided CLIP model + positive_prompt = g.node("CLIPTextEncode", + text=prompt, + clip=clip) + negative_prompt = g.node("CLIPTextEncode", + text=negative_prompt, + clip=clip) + + # 2. Create empty latent with specified size + empty_latent = g.node("EmptyLatentImage", width=512, height=512, batch_size=1) + + # 3. Setup sampler and generate image latent + sampler = g.node("KSampler", + model=model, + positive=positive_prompt.out(0), + negative=negative_prompt.out(0), + latent_image=empty_latent.out(0), + seed=seed, + steps=steps, + cfg=cfg, + sampler_name="euler_ancestral", + scheduler="normal") + + # 4. Decode latent to image using VAE + output = g.node("VAEDecode", samples=sampler.out(0), vae=vae) + + return { + "result": (output.out(0),), + "expand": g.finalize(), + } + +class TestSleep(ComfyNodeABC): + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "value": (IO.ANY, {}), + "seconds": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 9999.0, "step": 0.01, "tooltip": "The amount of seconds to sleep."}), + }, + "hidden": { + "unique_id": "UNIQUE_ID", + }, + } + RETURN_TYPES = (IO.ANY,) + FUNCTION = "sleep" + + CATEGORY = "_for_testing" + + async def sleep(self, value, seconds, unique_id): + pbar = ProgressBar(seconds, node_id=unique_id) + start = time.time() + expiration = start + seconds + now = start + while now < expiration: + now = time.time() + pbar.update_absolute(now - start) + await asyncio.sleep(0.01) + return (value,) + +class TestParallelSleep(ComfyNodeABC): + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "image1": ("IMAGE", ), + "image2": ("IMAGE", ), + "image3": ("IMAGE", ), + "sleep1": ("FLOAT", {"default": 0.5, "min": 0.0, "max": 10.0, "step": 0.01}), + "sleep2": ("FLOAT", {"default": 0.5, "min": 0.0, "max": 10.0, "step": 0.01}), + "sleep3": ("FLOAT", {"default": 0.5, "min": 0.0, "max": 10.0, "step": 0.01}), + }, + "hidden": { + "unique_id": "UNIQUE_ID", + }, + } + RETURN_TYPES = ("IMAGE",) + FUNCTION = "parallel_sleep" + CATEGORY = "_for_testing" + OUTPUT_NODE = True + + def parallel_sleep(self, image1, image2, image3, sleep1, sleep2, sleep3, unique_id): + # Create a graph dynamically with three TestSleep nodes + g = GraphBuilder() + + # Create sleep nodes for each duration and image + sleep_node1 = g.node("TestSleep", value=image1, seconds=sleep1) + sleep_node2 = g.node("TestSleep", value=image2, seconds=sleep2) + sleep_node3 = g.node("TestSleep", value=image3, seconds=sleep3) + + # Blend the results using TestVariadicAverage + blend = g.node("TestVariadicAverage", + input1=sleep_node1.out(0), + input2=sleep_node2.out(0), + input3=sleep_node3.out(0)) + + return { + "result": (blend.out(0),), + "expand": g.finalize(), + } + TEST_NODE_CLASS_MAPPINGS = { "TestLazyMixImages": TestLazyMixImages, "TestVariadicAverage": TestVariadicAverage, @@ -345,6 +475,9 @@ TEST_NODE_CLASS_MAPPINGS = { "TestCustomValidation5": 
TestCustomValidation5, "TestDynamicDependencyCycle": TestDynamicDependencyCycle, "TestMixedExpansionReturns": TestMixedExpansionReturns, + "TestSamplingInExpansion": TestSamplingInExpansion, + "TestSleep": TestSleep, + "TestParallelSleep": TestParallelSleep, } TEST_NODE_DISPLAY_NAME_MAPPINGS = { @@ -359,4 +492,7 @@ TEST_NODE_DISPLAY_NAME_MAPPINGS = { "TestCustomValidation5": "Custom Validation 5", "TestDynamicDependencyCycle": "Dynamic Dependency Cycle", "TestMixedExpansionReturns": "Mixed Expansion Returns", + "TestSamplingInExpansion": "Sampling In Expansion", + "TestSleep": "Test Sleep", + "TestParallelSleep": "Test Parallel Sleep", } From b7ff5bd14def548519cc09770b794ad176868761 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 10 Jul 2025 12:21:18 -0700 Subject: [PATCH 017/243] Fix python3.9 (#8858) --- comfy_execution/progress.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_execution/progress.py b/comfy_execution/progress.py index 5645b3e3c..73dba3f75 100644 --- a/comfy_execution/progress.py +++ b/comfy_execution/progress.py @@ -317,7 +317,7 @@ class ProgressRegistry: handler.reset() # Global registry instance -global_progress_registry: ProgressRegistry | None = None +global_progress_registry: ProgressRegistry = None def reset_progress_state(prompt_id: str, dynprompt: "DynamicPrompt") -> None: global global_progress_registry From 8f05fb48eaab6b5d6fd0747c5be358a60d93e186 Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Thu, 10 Jul 2025 15:00:29 -0700 Subject: [PATCH 018/243] [fix] increase Kling API polling timeout to prevent user timeouts (#8860) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends polling duration from 10 minutes to ~68 minutes (256 attempts × 16 seconds) to accommodate longer Kling API operations that were frequently timing out for users. --- comfy_api_nodes/nodes_kling.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index 641cd6353..69e9e5cf0 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -132,6 +132,8 @@ def poll_until_finished( result_url_extractor=result_url_extractor, estimated_duration=estimated_duration, node_id=node_id, + poll_interval=16.0, + max_poll_attempts=256, ).execute() From 938d3e8216b9936913f105593b6127428feef473 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 10 Jul 2025 23:37:51 -0700 Subject: [PATCH 019/243] Remove windows line endings. 
(#8866) --- comfy/ldm/pixart/pixartms.py | 512 +++++++++++++++---------------- comfy/text_encoders/pixart_t5.py | 84 ++--- 2 files changed, 298 insertions(+), 298 deletions(-) diff --git a/comfy/ldm/pixart/pixartms.py b/comfy/ldm/pixart/pixartms.py index 7d4eebdce..d1ac49d84 100644 --- a/comfy/ldm/pixart/pixartms.py +++ b/comfy/ldm/pixart/pixartms.py @@ -1,256 +1,256 @@ -# Based on: -# https://github.com/PixArt-alpha/PixArt-alpha [Apache 2.0 license] -# https://github.com/PixArt-alpha/PixArt-sigma [Apache 2.0 license] -import torch -import torch.nn as nn - -from .blocks import ( - t2i_modulate, - CaptionEmbedder, - AttentionKVCompress, - MultiHeadCrossAttention, - T2IFinalLayer, - SizeEmbedder, -) -from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder, PatchEmbed, Mlp, get_1d_sincos_pos_embed_from_grid_torch - - -def get_2d_sincos_pos_embed_torch(embed_dim, w, h, pe_interpolation=1.0, base_size=16, device=None, dtype=torch.float32): - grid_h, grid_w = torch.meshgrid( - torch.arange(h, device=device, dtype=dtype) / (h/base_size) / pe_interpolation, - torch.arange(w, device=device, dtype=dtype) / (w/base_size) / pe_interpolation, - indexing='ij' - ) - emb_h = get_1d_sincos_pos_embed_from_grid_torch(embed_dim // 2, grid_h, device=device, dtype=dtype) - emb_w = get_1d_sincos_pos_embed_from_grid_torch(embed_dim // 2, grid_w, device=device, dtype=dtype) - emb = torch.cat([emb_w, emb_h], dim=1) # (H*W, D) - return emb - -class PixArtMSBlock(nn.Module): - """ - A PixArt block with adaptive layer norm zero (adaLN-Zero) conditioning. - """ - def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, drop_path=0., input_size=None, - sampling=None, sr_ratio=1, qk_norm=False, dtype=None, device=None, operations=None, **block_kwargs): - super().__init__() - self.hidden_size = hidden_size - self.norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) - self.attn = AttentionKVCompress( - hidden_size, num_heads=num_heads, qkv_bias=True, sampling=sampling, sr_ratio=sr_ratio, - qk_norm=qk_norm, dtype=dtype, device=device, operations=operations, **block_kwargs - ) - self.cross_attn = MultiHeadCrossAttention( - hidden_size, num_heads, dtype=dtype, device=device, operations=operations, **block_kwargs - ) - self.norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) - # to be compatible with lower version pytorch - approx_gelu = lambda: nn.GELU(approximate="tanh") - self.mlp = Mlp( - in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu, - dtype=dtype, device=device, operations=operations - ) - self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size ** 0.5) - - def forward(self, x, y, t, mask=None, HW=None, **kwargs): - B, N, C = x.shape - - shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.scale_shift_table[None].to(dtype=x.dtype, device=x.device) + t.reshape(B, 6, -1)).chunk(6, dim=1) - x = x + (gate_msa * self.attn(t2i_modulate(self.norm1(x), shift_msa, scale_msa), HW=HW)) - x = x + self.cross_attn(x, y, mask) - x = x + (gate_mlp * self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp))) - - return x - - -### Core PixArt Model ### -class PixArtMS(nn.Module): - """ - Diffusion model with a Transformer backbone. 
- """ - def __init__( - self, - input_size=32, - patch_size=2, - in_channels=4, - hidden_size=1152, - depth=28, - num_heads=16, - mlp_ratio=4.0, - class_dropout_prob=0.1, - learn_sigma=True, - pred_sigma=True, - drop_path: float = 0., - caption_channels=4096, - pe_interpolation=None, - pe_precision=None, - config=None, - model_max_length=120, - micro_condition=True, - qk_norm=False, - kv_compress_config=None, - dtype=None, - device=None, - operations=None, - **kwargs, - ): - nn.Module.__init__(self) - self.dtype = dtype - self.pred_sigma = pred_sigma - self.in_channels = in_channels - self.out_channels = in_channels * 2 if pred_sigma else in_channels - self.patch_size = patch_size - self.num_heads = num_heads - self.pe_interpolation = pe_interpolation - self.pe_precision = pe_precision - self.hidden_size = hidden_size - self.depth = depth - - approx_gelu = lambda: nn.GELU(approximate="tanh") - self.t_block = nn.Sequential( - nn.SiLU(), - operations.Linear(hidden_size, 6 * hidden_size, bias=True, dtype=dtype, device=device) - ) - self.x_embedder = PatchEmbed( - patch_size=patch_size, - in_chans=in_channels, - embed_dim=hidden_size, - bias=True, - dtype=dtype, - device=device, - operations=operations - ) - self.t_embedder = TimestepEmbedder( - hidden_size, dtype=dtype, device=device, operations=operations, - ) - self.y_embedder = CaptionEmbedder( - in_channels=caption_channels, hidden_size=hidden_size, uncond_prob=class_dropout_prob, - act_layer=approx_gelu, token_num=model_max_length, - dtype=dtype, device=device, operations=operations, - ) - - self.micro_conditioning = micro_condition - if self.micro_conditioning: - self.csize_embedder = SizeEmbedder(hidden_size//3, dtype=dtype, device=device, operations=operations) - self.ar_embedder = SizeEmbedder(hidden_size//3, dtype=dtype, device=device, operations=operations) - - # For fixed sin-cos embedding: - # num_patches = (input_size // patch_size) * (input_size // patch_size) - # self.base_size = input_size // self.patch_size - # self.register_buffer("pos_embed", torch.zeros(1, num_patches, hidden_size)) - - drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)] # stochastic depth decay rule - if kv_compress_config is None: - kv_compress_config = { - 'sampling': None, - 'scale_factor': 1, - 'kv_compress_layer': [], - } - self.blocks = nn.ModuleList([ - PixArtMSBlock( - hidden_size, num_heads, mlp_ratio=mlp_ratio, drop_path=drop_path[i], - sampling=kv_compress_config['sampling'], - sr_ratio=int(kv_compress_config['scale_factor']) if i in kv_compress_config['kv_compress_layer'] else 1, - qk_norm=qk_norm, - dtype=dtype, - device=device, - operations=operations, - ) - for i in range(depth) - ]) - self.final_layer = T2IFinalLayer( - hidden_size, patch_size, self.out_channels, dtype=dtype, device=device, operations=operations - ) - - def forward_orig(self, x, timestep, y, mask=None, c_size=None, c_ar=None, **kwargs): - """ - Original forward pass of PixArt. 
- x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images) - t: (N,) tensor of diffusion timesteps - y: (N, 1, 120, C) conditioning - ar: (N, 1): aspect ratio - cs: (N ,2) size conditioning for height/width - """ - B, C, H, W = x.shape - c_res = (H + W) // 2 - pe_interpolation = self.pe_interpolation - if pe_interpolation is None or self.pe_precision is not None: - # calculate pe_interpolation on-the-fly - pe_interpolation = round(c_res / (512/8.0), self.pe_precision or 0) - - pos_embed = get_2d_sincos_pos_embed_torch( - self.hidden_size, - h=(H // self.patch_size), - w=(W // self.patch_size), - pe_interpolation=pe_interpolation, - base_size=((round(c_res / 64) * 64) // self.patch_size), - device=x.device, - dtype=x.dtype, - ).unsqueeze(0) - - x = self.x_embedder(x) + pos_embed # (N, T, D), where T = H * W / patch_size ** 2 - t = self.t_embedder(timestep, x.dtype) # (N, D) - - if self.micro_conditioning and (c_size is not None and c_ar is not None): - bs = x.shape[0] - c_size = self.csize_embedder(c_size, bs) # (N, D) - c_ar = self.ar_embedder(c_ar, bs) # (N, D) - t = t + torch.cat([c_size, c_ar], dim=1) - - t0 = self.t_block(t) - y = self.y_embedder(y, self.training) # (N, D) - - if mask is not None: - if mask.shape[0] != y.shape[0]: - mask = mask.repeat(y.shape[0] // mask.shape[0], 1) - mask = mask.squeeze(1).squeeze(1) - y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1]) - y_lens = mask.sum(dim=1).tolist() - else: - y_lens = None - y = y.squeeze(1).view(1, -1, x.shape[-1]) - for block in self.blocks: - x = block(x, y, t0, y_lens, (H, W), **kwargs) # (N, T, D) - - x = self.final_layer(x, t) # (N, T, patch_size ** 2 * out_channels) - x = self.unpatchify(x, H, W) # (N, out_channels, H, W) - - return x - - def forward(self, x, timesteps, context, c_size=None, c_ar=None, **kwargs): - B, C, H, W = x.shape - - # Fallback for missing microconds - if self.micro_conditioning: - if c_size is None: - c_size = torch.tensor([H*8, W*8], dtype=x.dtype, device=x.device).repeat(B, 1) - - if c_ar is None: - c_ar = torch.tensor([H/W], dtype=x.dtype, device=x.device).repeat(B, 1) - - ## Still accepts the input w/o that dim but returns garbage - if len(context.shape) == 3: - context = context.unsqueeze(1) - - ## run original forward pass - out = self.forward_orig(x, timesteps, context, c_size=c_size, c_ar=c_ar) - - ## only return EPS - if self.pred_sigma: - return out[:, :self.in_channels] - return out - - def unpatchify(self, x, h, w): - """ - x: (N, T, patch_size**2 * C) - imgs: (N, H, W, C) - """ - c = self.out_channels - p = self.x_embedder.patch_size[0] - h = h // self.patch_size - w = w // self.patch_size - assert h * w == x.shape[1] - - x = x.reshape(shape=(x.shape[0], h, w, p, p, c)) - x = torch.einsum('nhwpqc->nchpwq', x) - imgs = x.reshape(shape=(x.shape[0], c, h * p, w * p)) - return imgs +# Based on: +# https://github.com/PixArt-alpha/PixArt-alpha [Apache 2.0 license] +# https://github.com/PixArt-alpha/PixArt-sigma [Apache 2.0 license] +import torch +import torch.nn as nn + +from .blocks import ( + t2i_modulate, + CaptionEmbedder, + AttentionKVCompress, + MultiHeadCrossAttention, + T2IFinalLayer, + SizeEmbedder, +) +from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder, PatchEmbed, Mlp, get_1d_sincos_pos_embed_from_grid_torch + + +def get_2d_sincos_pos_embed_torch(embed_dim, w, h, pe_interpolation=1.0, base_size=16, device=None, dtype=torch.float32): + grid_h, grid_w = torch.meshgrid( + torch.arange(h, device=device, 
dtype=dtype) / (h/base_size) / pe_interpolation, + torch.arange(w, device=device, dtype=dtype) / (w/base_size) / pe_interpolation, + indexing='ij' + ) + emb_h = get_1d_sincos_pos_embed_from_grid_torch(embed_dim // 2, grid_h, device=device, dtype=dtype) + emb_w = get_1d_sincos_pos_embed_from_grid_torch(embed_dim // 2, grid_w, device=device, dtype=dtype) + emb = torch.cat([emb_w, emb_h], dim=1) # (H*W, D) + return emb + +class PixArtMSBlock(nn.Module): + """ + A PixArt block with adaptive layer norm zero (adaLN-Zero) conditioning. + """ + def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, drop_path=0., input_size=None, + sampling=None, sr_ratio=1, qk_norm=False, dtype=None, device=None, operations=None, **block_kwargs): + super().__init__() + self.hidden_size = hidden_size + self.norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) + self.attn = AttentionKVCompress( + hidden_size, num_heads=num_heads, qkv_bias=True, sampling=sampling, sr_ratio=sr_ratio, + qk_norm=qk_norm, dtype=dtype, device=device, operations=operations, **block_kwargs + ) + self.cross_attn = MultiHeadCrossAttention( + hidden_size, num_heads, dtype=dtype, device=device, operations=operations, **block_kwargs + ) + self.norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) + # to be compatible with lower version pytorch + approx_gelu = lambda: nn.GELU(approximate="tanh") + self.mlp = Mlp( + in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu, + dtype=dtype, device=device, operations=operations + ) + self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size ** 0.5) + + def forward(self, x, y, t, mask=None, HW=None, **kwargs): + B, N, C = x.shape + + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.scale_shift_table[None].to(dtype=x.dtype, device=x.device) + t.reshape(B, 6, -1)).chunk(6, dim=1) + x = x + (gate_msa * self.attn(t2i_modulate(self.norm1(x), shift_msa, scale_msa), HW=HW)) + x = x + self.cross_attn(x, y, mask) + x = x + (gate_mlp * self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp))) + + return x + + +### Core PixArt Model ### +class PixArtMS(nn.Module): + """ + Diffusion model with a Transformer backbone. 
+ """ + def __init__( + self, + input_size=32, + patch_size=2, + in_channels=4, + hidden_size=1152, + depth=28, + num_heads=16, + mlp_ratio=4.0, + class_dropout_prob=0.1, + learn_sigma=True, + pred_sigma=True, + drop_path: float = 0., + caption_channels=4096, + pe_interpolation=None, + pe_precision=None, + config=None, + model_max_length=120, + micro_condition=True, + qk_norm=False, + kv_compress_config=None, + dtype=None, + device=None, + operations=None, + **kwargs, + ): + nn.Module.__init__(self) + self.dtype = dtype + self.pred_sigma = pred_sigma + self.in_channels = in_channels + self.out_channels = in_channels * 2 if pred_sigma else in_channels + self.patch_size = patch_size + self.num_heads = num_heads + self.pe_interpolation = pe_interpolation + self.pe_precision = pe_precision + self.hidden_size = hidden_size + self.depth = depth + + approx_gelu = lambda: nn.GELU(approximate="tanh") + self.t_block = nn.Sequential( + nn.SiLU(), + operations.Linear(hidden_size, 6 * hidden_size, bias=True, dtype=dtype, device=device) + ) + self.x_embedder = PatchEmbed( + patch_size=patch_size, + in_chans=in_channels, + embed_dim=hidden_size, + bias=True, + dtype=dtype, + device=device, + operations=operations + ) + self.t_embedder = TimestepEmbedder( + hidden_size, dtype=dtype, device=device, operations=operations, + ) + self.y_embedder = CaptionEmbedder( + in_channels=caption_channels, hidden_size=hidden_size, uncond_prob=class_dropout_prob, + act_layer=approx_gelu, token_num=model_max_length, + dtype=dtype, device=device, operations=operations, + ) + + self.micro_conditioning = micro_condition + if self.micro_conditioning: + self.csize_embedder = SizeEmbedder(hidden_size//3, dtype=dtype, device=device, operations=operations) + self.ar_embedder = SizeEmbedder(hidden_size//3, dtype=dtype, device=device, operations=operations) + + # For fixed sin-cos embedding: + # num_patches = (input_size // patch_size) * (input_size // patch_size) + # self.base_size = input_size // self.patch_size + # self.register_buffer("pos_embed", torch.zeros(1, num_patches, hidden_size)) + + drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)] # stochastic depth decay rule + if kv_compress_config is None: + kv_compress_config = { + 'sampling': None, + 'scale_factor': 1, + 'kv_compress_layer': [], + } + self.blocks = nn.ModuleList([ + PixArtMSBlock( + hidden_size, num_heads, mlp_ratio=mlp_ratio, drop_path=drop_path[i], + sampling=kv_compress_config['sampling'], + sr_ratio=int(kv_compress_config['scale_factor']) if i in kv_compress_config['kv_compress_layer'] else 1, + qk_norm=qk_norm, + dtype=dtype, + device=device, + operations=operations, + ) + for i in range(depth) + ]) + self.final_layer = T2IFinalLayer( + hidden_size, patch_size, self.out_channels, dtype=dtype, device=device, operations=operations + ) + + def forward_orig(self, x, timestep, y, mask=None, c_size=None, c_ar=None, **kwargs): + """ + Original forward pass of PixArt. 
+ x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images) + t: (N,) tensor of diffusion timesteps + y: (N, 1, 120, C) conditioning + ar: (N, 1): aspect ratio + cs: (N ,2) size conditioning for height/width + """ + B, C, H, W = x.shape + c_res = (H + W) // 2 + pe_interpolation = self.pe_interpolation + if pe_interpolation is None or self.pe_precision is not None: + # calculate pe_interpolation on-the-fly + pe_interpolation = round(c_res / (512/8.0), self.pe_precision or 0) + + pos_embed = get_2d_sincos_pos_embed_torch( + self.hidden_size, + h=(H // self.patch_size), + w=(W // self.patch_size), + pe_interpolation=pe_interpolation, + base_size=((round(c_res / 64) * 64) // self.patch_size), + device=x.device, + dtype=x.dtype, + ).unsqueeze(0) + + x = self.x_embedder(x) + pos_embed # (N, T, D), where T = H * W / patch_size ** 2 + t = self.t_embedder(timestep, x.dtype) # (N, D) + + if self.micro_conditioning and (c_size is not None and c_ar is not None): + bs = x.shape[0] + c_size = self.csize_embedder(c_size, bs) # (N, D) + c_ar = self.ar_embedder(c_ar, bs) # (N, D) + t = t + torch.cat([c_size, c_ar], dim=1) + + t0 = self.t_block(t) + y = self.y_embedder(y, self.training) # (N, D) + + if mask is not None: + if mask.shape[0] != y.shape[0]: + mask = mask.repeat(y.shape[0] // mask.shape[0], 1) + mask = mask.squeeze(1).squeeze(1) + y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1]) + y_lens = mask.sum(dim=1).tolist() + else: + y_lens = None + y = y.squeeze(1).view(1, -1, x.shape[-1]) + for block in self.blocks: + x = block(x, y, t0, y_lens, (H, W), **kwargs) # (N, T, D) + + x = self.final_layer(x, t) # (N, T, patch_size ** 2 * out_channels) + x = self.unpatchify(x, H, W) # (N, out_channels, H, W) + + return x + + def forward(self, x, timesteps, context, c_size=None, c_ar=None, **kwargs): + B, C, H, W = x.shape + + # Fallback for missing microconds + if self.micro_conditioning: + if c_size is None: + c_size = torch.tensor([H*8, W*8], dtype=x.dtype, device=x.device).repeat(B, 1) + + if c_ar is None: + c_ar = torch.tensor([H/W], dtype=x.dtype, device=x.device).repeat(B, 1) + + ## Still accepts the input w/o that dim but returns garbage + if len(context.shape) == 3: + context = context.unsqueeze(1) + + ## run original forward pass + out = self.forward_orig(x, timesteps, context, c_size=c_size, c_ar=c_ar) + + ## only return EPS + if self.pred_sigma: + return out[:, :self.in_channels] + return out + + def unpatchify(self, x, h, w): + """ + x: (N, T, patch_size**2 * C) + imgs: (N, H, W, C) + """ + c = self.out_channels + p = self.x_embedder.patch_size[0] + h = h // self.patch_size + w = w // self.patch_size + assert h * w == x.shape[1] + + x = x.reshape(shape=(x.shape[0], h, w, p, p, c)) + x = torch.einsum('nhwpqc->nchpwq', x) + imgs = x.reshape(shape=(x.shape[0], c, h * p, w * p)) + return imgs diff --git a/comfy/text_encoders/pixart_t5.py b/comfy/text_encoders/pixart_t5.py index b8de6bc4e..5f383de07 100644 --- a/comfy/text_encoders/pixart_t5.py +++ b/comfy/text_encoders/pixart_t5.py @@ -1,42 +1,42 @@ -import os - -from comfy import sd1_clip -import comfy.text_encoders.t5 -import comfy.text_encoders.sd3_clip -from comfy.sd1_clip import gen_empty_tokens - -from transformers import T5TokenizerFast - -class T5XXLModel(comfy.text_encoders.sd3_clip.T5XXLModel): - def __init__(self, **kwargs): - super().__init__(**kwargs) - - def gen_empty_tokens(self, special_tokens, *args, **kwargs): - # PixArt expects the negative to be all pad tokens - 
special_tokens = special_tokens.copy() - special_tokens.pop("end") - return gen_empty_tokens(special_tokens, *args, **kwargs) - -class PixArtT5XXL(sd1_clip.SD1ClipModel): - def __init__(self, device="cpu", dtype=None, model_options={}): - super().__init__(device=device, dtype=dtype, name="t5xxl", clip_model=T5XXLModel, model_options=model_options) - -class T5XXLTokenizer(sd1_clip.SDTokenizer): - def __init__(self, embedding_directory=None, tokenizer_data={}): - tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_tokenizer") - super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_data=tokenizer_data) # no padding - -class PixArtTokenizer(sd1_clip.SD1Tokenizer): - def __init__(self, embedding_directory=None, tokenizer_data={}): - super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="t5xxl", tokenizer=T5XXLTokenizer) - -def pixart_te(dtype_t5=None, t5xxl_scaled_fp8=None): - class PixArtTEModel_(PixArtT5XXL): - def __init__(self, device="cpu", dtype=None, model_options={}): - if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options: - model_options = model_options.copy() - model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8 - if dtype is None: - dtype = dtype_t5 - super().__init__(device=device, dtype=dtype, model_options=model_options) - return PixArtTEModel_ +import os + +from comfy import sd1_clip +import comfy.text_encoders.t5 +import comfy.text_encoders.sd3_clip +from comfy.sd1_clip import gen_empty_tokens + +from transformers import T5TokenizerFast + +class T5XXLModel(comfy.text_encoders.sd3_clip.T5XXLModel): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def gen_empty_tokens(self, special_tokens, *args, **kwargs): + # PixArt expects the negative to be all pad tokens + special_tokens = special_tokens.copy() + special_tokens.pop("end") + return gen_empty_tokens(special_tokens, *args, **kwargs) + +class PixArtT5XXL(sd1_clip.SD1ClipModel): + def __init__(self, device="cpu", dtype=None, model_options={}): + super().__init__(device=device, dtype=dtype, name="t5xxl", clip_model=T5XXLModel, model_options=model_options) + +class T5XXLTokenizer(sd1_clip.SDTokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_tokenizer") + super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_data=tokenizer_data) # no padding + +class PixArtTokenizer(sd1_clip.SD1Tokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="t5xxl", tokenizer=T5XXLTokenizer) + +def pixart_te(dtype_t5=None, t5xxl_scaled_fp8=None): + class PixArtTEModel_(PixArtT5XXL): + def __init__(self, device="cpu", dtype=None, model_options={}): + if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options: + model_options = model_options.copy() + model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8 + if dtype is None: + dtype = dtype_t5 + super().__init__(device=device, dtype=dtype, 
model_options=model_options) + return PixArtTEModel_ From 7bc7dd2aa2030a87135a3c8d20f636d52627ca9d Mon Sep 17 00:00:00 2001 From: JettHu <35261585+JettHu@users.noreply.github.com> Date: Sat, 12 Jul 2025 00:51:06 +0800 Subject: [PATCH 020/243] Execute async node earlier (#8865) --- comfy_execution/graph.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/comfy_execution/graph.py b/comfy_execution/graph.py index c79243e1e..60e2ab91e 100644 --- a/comfy_execution/graph.py +++ b/comfy_execution/graph.py @@ -3,6 +3,7 @@ from typing import Type, Literal import nodes import asyncio +import inspect from comfy_execution.graph_utils import is_link from comfy.comfy_types.node_typing import ComfyNodeABC, InputTypeDict, InputTypeOptions @@ -239,8 +240,15 @@ class ExecutionList(TopologicalSort): return True return False + # If an available node is async, do that first. + # This will execute the asynchronous function earlier, reducing the overall time. + def is_async(node_id): + class_type = self.dynprompt.get_node(node_id)["class_type"] + class_def = nodes.NODE_CLASS_MAPPINGS[class_type] + return inspect.iscoroutinefunction(getattr(class_def, class_def.FUNCTION)) + for node_id in node_list: - if is_output(node_id): + if is_output(node_id) or is_async(node_id): return node_id #This should handle the VAEDecode -> preview case From b43916a134d503bd5cab15dfa281f429527e448b Mon Sep 17 00:00:00 2001 From: chaObserv <154517000+chaObserv@users.noreply.github.com> Date: Sat, 12 Jul 2025 00:52:58 +0800 Subject: [PATCH 021/243] Fix fresca's input and output (#8871) --- comfy_extras/nodes_fresca.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/comfy_extras/nodes_fresca.py b/comfy_extras/nodes_fresca.py index ee310c874..65c2d0d0e 100644 --- a/comfy_extras/nodes_fresca.py +++ b/comfy_extras/nodes_fresca.py @@ -71,8 +71,11 @@ class FreSca: DESCRIPTION = "Applies frequency-dependent scaling to the guidance" def patch(self, model, scale_low, scale_high, freq_cutoff): def custom_cfg_function(args): - cond = args["conds_out"][0] - uncond = args["conds_out"][1] + conds_out = args["conds_out"] + if len(conds_out) <= 1 or None in args["conds"][:2]: + return conds_out + cond = conds_out[0] + uncond = conds_out[1] guidance = cond - uncond filtered_guidance = Fourier_filter( @@ -83,7 +86,7 @@ class FreSca: ) filtered_cond = filtered_guidance + uncond - return [filtered_cond, uncond] + return [filtered_cond, uncond] + conds_out[2:] m = model.clone() m.set_model_sampler_pre_cfg_function(custom_cfg_function) From b40143984c1bed9cd1bc73a373a80a90c625eb0f Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 12 Jul 2025 00:49:26 -0700 Subject: [PATCH 022/243] Add model detection error hint for lora. 
(#8880) --- comfy/sd.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/comfy/sd.py b/comfy/sd.py index 5b95cf75a..8081b167c 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -18,6 +18,7 @@ import comfy.ldm.hunyuan3d.vae import comfy.ldm.ace.vae.music_dcae_pipeline import yaml import math +import os import comfy.utils @@ -977,6 +978,12 @@ def load_gligen(ckpt_path): model = model.half() return comfy.model_patcher.ModelPatcher(model, load_device=model_management.get_torch_device(), offload_device=model_management.unet_offload_device()) +def model_detection_error_hint(path, state_dict): + filename = os.path.basename(path) + if 'lora' in filename.lower(): + return "\nHINT: This seems to be a Lora file and Lora files should be put in the lora folder and loaded with a lora loader node.." + return "" + def load_checkpoint(config_path=None, ckpt_path=None, output_vae=True, output_clip=True, embedding_directory=None, state_dict=None, config=None): logging.warning("Warning: The load checkpoint with config function is deprecated and will eventually be removed, please use the other one.") model, clip, vae, _ = load_checkpoint_guess_config(ckpt_path, output_vae=output_vae, output_clip=output_clip, output_clipvision=False, embedding_directory=embedding_directory, output_model=True) @@ -1005,7 +1012,7 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, o sd, metadata = comfy.utils.load_torch_file(ckpt_path, return_metadata=True) out = load_state_dict_guess_config(sd, output_vae, output_clip, output_clipvision, embedding_directory, output_model, model_options, te_model_options=te_model_options, metadata=metadata) if out is None: - raise RuntimeError("ERROR: Could not detect model type of: {}".format(ckpt_path)) + raise RuntimeError("ERROR: Could not detect model type of: {}\n{}".format(ckpt_path, model_detection_error_hint(ckpt_path, sd))) return out def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True, model_options={}, te_model_options={}, metadata=None): @@ -1177,7 +1184,7 @@ def load_diffusion_model(unet_path, model_options={}): model = load_diffusion_model_state_dict(sd, model_options=model_options) if model is None: logging.error("ERROR UNSUPPORTED DIFFUSION MODEL {}".format(unet_path)) - raise RuntimeError("ERROR: Could not detect model type of: {}".format(unet_path)) + raise RuntimeError("ERROR: Could not detect model type of: {}\n{}".format(unet_path, model_detection_error_hint(unet_path, sd))) return model def load_unet(unet_path, dtype=None): From 480375f3495e9e1437faf47eb2a11222c9acf3f0 Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Sun, 13 Jul 2025 01:46:27 -0700 Subject: [PATCH 023/243] Remove auth tokens from history storage (#8889) Remove auth_token_comfy_org and api_key_comfy_org from extra_data before storing prompt history to prevent sensitive authentication tokens from being persisted in the history endpoint response. 
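
For reference, a rough sketch of the sanitization step this change performs (the key names are taken from the diff below; the standalone helper name is illustrative and not the exact code in execution.py):

SENSITIVE_EXTRA_DATA_KEYS = ("auth_token_comfy_org", "api_key_comfy_org")

def strip_sensitive_extra_data(extra_data: dict) -> dict:
    # Remove auth tokens in place so they are never written to the stored history entry.
    for key in SENSITIVE_EXTRA_DATA_KEYS:
        extra_data.pop(key, None)
    return extra_data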
--- execution.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/execution.py b/execution.py index 90cefc023..bd638afba 100644 --- a/execution.py +++ b/execution.py @@ -1045,6 +1045,12 @@ class PromptQueue: if status is not None: status_dict = copy.deepcopy(status._asdict()) + # Remove auth tokens from extra_data before storing in history + if "auth_token_comfy_org" in prompt[3]: + del prompt[3]["auth_token_comfy_org"] + if "api_key_comfy_org" in prompt[3]: + del prompt[3]["api_key_comfy_org"] + self.history[prompt[1]] = { "prompt": prompt, "outputs": {}, From 4831e9c2c47b97f85fd771521f247a017d1f43e1 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 13 Jul 2025 01:59:17 -0700 Subject: [PATCH 024/243] Refactor previous pr. (#8893) --- execution.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/execution.py b/execution.py index bd638afba..c3a62f1cb 100644 --- a/execution.py +++ b/execution.py @@ -123,6 +123,8 @@ class CacheSet: } return result +SENSITIVE_EXTRA_DATA_KEYS = ("auth_token_comfy_org", "api_key_comfy_org") + def get_input_data(inputs, class_def, unique_id, outputs=None, dynprompt=None, extra_data={}): valid_inputs = class_def.INPUT_TYPES() input_data_all = {} @@ -1045,11 +1047,10 @@ class PromptQueue: if status is not None: status_dict = copy.deepcopy(status._asdict()) - # Remove auth tokens from extra_data before storing in history - if "auth_token_comfy_org" in prompt[3]: - del prompt[3]["auth_token_comfy_org"] - if "api_key_comfy_org" in prompt[3]: - del prompt[3]["api_key_comfy_org"] + # Remove sensitive data from extra_data before storing in history + for sensitive_val in SENSITIVE_EXTRA_DATA_KEYS: + if sensitive_val in prompt[3]: + prompt[3].pop(sensitive_val) self.history[prompt[1]] = { "prompt": prompt, From 9ca581c9416d799db0d7c55ac957a5fa486798c8 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 14 Jul 2025 10:10:20 -0700 Subject: [PATCH 025/243] Remove windows line endings. (#8902) --- comfy_extras/nodes_pixart.py | 48 ++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/comfy_extras/nodes_pixart.py b/comfy_extras/nodes_pixart.py index c7209c468..8d9276afe 100644 --- a/comfy_extras/nodes_pixart.py +++ b/comfy_extras/nodes_pixart.py @@ -1,24 +1,24 @@ -from nodes import MAX_RESOLUTION - -class CLIPTextEncodePixArtAlpha: - @classmethod - def INPUT_TYPES(s): - return {"required": { - "width": ("INT", {"default": 1024.0, "min": 0, "max": MAX_RESOLUTION}), - "height": ("INT", {"default": 1024.0, "min": 0, "max": MAX_RESOLUTION}), - # "aspect_ratio": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "text": ("STRING", {"multiline": True, "dynamicPrompts": True}), "clip": ("CLIP", ), - }} - - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "encode" - CATEGORY = "advanced/conditioning" - DESCRIPTION = "Encodes text and sets the resolution conditioning for PixArt Alpha. Does not apply to PixArt Sigma." 
- - def encode(self, clip, width, height, text): - tokens = clip.tokenize(text) - return (clip.encode_from_tokens_scheduled(tokens, add_dict={"width": width, "height": height}),) - -NODE_CLASS_MAPPINGS = { - "CLIPTextEncodePixArtAlpha": CLIPTextEncodePixArtAlpha, -} +from nodes import MAX_RESOLUTION + +class CLIPTextEncodePixArtAlpha: + @classmethod + def INPUT_TYPES(s): + return {"required": { + "width": ("INT", {"default": 1024.0, "min": 0, "max": MAX_RESOLUTION}), + "height": ("INT", {"default": 1024.0, "min": 0, "max": MAX_RESOLUTION}), + # "aspect_ratio": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), + "text": ("STRING", {"multiline": True, "dynamicPrompts": True}), "clip": ("CLIP", ), + }} + + RETURN_TYPES = ("CONDITIONING",) + FUNCTION = "encode" + CATEGORY = "advanced/conditioning" + DESCRIPTION = "Encodes text and sets the resolution conditioning for PixArt Alpha. Does not apply to PixArt Sigma." + + def encode(self, clip, width, height, text): + tokens = clip.tokenize(text) + return (clip.encode_from_tokens_scheduled(tokens, add_dict={"width": width, "height": height}),) + +NODE_CLASS_MAPPINGS = { + "CLIPTextEncodePixArtAlpha": CLIPTextEncodePixArtAlpha, +} From 861c3bbb3d2330dc7dff7567ffcf07946ace23b8 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Tue, 15 Jul 2025 01:27:57 +0800 Subject: [PATCH 026/243] Upate template to 0.1.36 (#8904) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 19a40ca0e..7705918a8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.35 +comfyui-workflow-templates==0.1.36 comfyui-embedded-docs==0.2.4 torch torchsde From 260a5ca5d9997bfa1cec5a4922cb066187e6daf0 Mon Sep 17 00:00:00 2001 From: FeepingCreature <540727+FeepingCreature@users.noreply.github.com> Date: Mon, 14 Jul 2025 20:48:31 +0200 Subject: [PATCH 027/243] Allow the prompt request to specify the prompt ID. (#8189) This makes it easier to write asynchronous clients that submit requests, because they can store the task immediately. Duplicate prompt IDs are rejected by the job queue. 
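
For reference, a minimal client sketch using the new field (it mirrors the example script updated below; the server address is a placeholder and a running ComfyUI instance plus a valid workflow dict are assumed):

import json
import urllib.request
import uuid

server_address = "127.0.0.1:8188"  # placeholder
client_id = str(uuid.uuid4())

def queue_prompt(prompt, prompt_id):
    # prompt_id is generated client-side, so the task can be tracked before the POST returns
    p = {"prompt": prompt, "client_id": client_id, "prompt_id": prompt_id}
    data = json.dumps(p).encode('utf-8')
    req = urllib.request.Request("http://{}/prompt".format(server_address), data=data)
    urllib.request.urlopen(req).read()

# Usage: queue_prompt(workflow, str(uuid.uuid4())); a duplicate prompt_id is rejected by the job queue.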
--- script_examples/websockets_api_example.py | 11 ++++++----- server.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/script_examples/websockets_api_example.py b/script_examples/websockets_api_example.py index d696d2bba..58f26cfb6 100644 --- a/script_examples/websockets_api_example.py +++ b/script_examples/websockets_api_example.py @@ -10,11 +10,11 @@ import urllib.parse server_address = "127.0.0.1:8188" client_id = str(uuid.uuid4()) -def queue_prompt(prompt): - p = {"prompt": prompt, "client_id": client_id} +def queue_prompt(prompt, prompt_id): + p = {"prompt": prompt, "client_id": client_id, "prompt_id": prompt_id} data = json.dumps(p).encode('utf-8') - req = urllib.request.Request("http://{}/prompt".format(server_address), data=data) - return json.loads(urllib.request.urlopen(req).read()) + req = urllib.request.Request("http://{}/prompt".format(server_address), data=data) + urllib.request.urlopen(req).read() def get_image(filename, subfolder, folder_type): data = {"filename": filename, "subfolder": subfolder, "type": folder_type} @@ -27,7 +27,8 @@ def get_history(prompt_id): return json.loads(response.read()) def get_images(ws, prompt): - prompt_id = queue_prompt(prompt)['prompt_id'] + prompt_id = str(uuid.uuid4()) + queue_prompt(prompt, prompt_id) output_images = {} while True: out = ws.recv() diff --git a/server.py b/server.py index e8bad9f4e..71a58f0fa 100644 --- a/server.py +++ b/server.py @@ -678,7 +678,7 @@ class PromptServer(): if "prompt" in json_data: prompt = json_data["prompt"] - prompt_id = str(uuid.uuid4()) + prompt_id = str(json_data.get("prompt_id", uuid.uuid4())) valid = await execution.validate_prompt(prompt_id, prompt) extra_data = {} if "extra_data" in json_data: From 543c24108c565a7dc21920d2c509b7596020eac1 Mon Sep 17 00:00:00 2001 From: Yoland Yan <4950057+yoland68@users.noreply.github.com> Date: Mon, 14 Jul 2025 17:45:55 -0700 Subject: [PATCH 028/243] Fix wrong reference bug (#8910) --- comfy/supported_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 2669ca01e..2ca3857f7 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1214,7 +1214,7 @@ class Omnigen2(supported_models_base.BASE): def clip_target(self, state_dict={}): pref = self.text_encoder_key_prefix[0] hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_3b.transformer.".format(pref)) - return supported_models_base.ClipTarget(comfy.text_encoders.omnigen2.LuminaTokenizer, comfy.text_encoders.omnigen2.te(**hunyuan_detect)) + return supported_models_base.ClipTarget(comfy.text_encoders.omnigen2.Omnigen2Tokenizer, comfy.text_encoders.omnigen2.te(**hunyuan_detect)) models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma, ACEStep, Omnigen2] From 9dabda19f046796f1169b593188b9b1b9263705e Mon Sep 17 00:00:00 2001 From: Yoland Yan <4950057+yoland68@users.noreply.github.com> Date: Mon, 14 Jul 2025 17:59:35 
-0700 Subject: [PATCH 029/243] Update nodes_gemini.py (#8912) --- comfy_api_nodes/nodes_gemini.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index ae7b04846..5935ab2bb 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -406,7 +406,7 @@ class GeminiInputFiles(ComfyNodeABC): def create_file_part(self, file_path: str) -> GeminiPart: mime_type = ( - GeminiMimeType.pdf + GeminiMimeType.application_pdf if file_path.endswith(".pdf") else GeminiMimeType.text_plain ) From b1ae4126c39dd2a2831d4b88f91c7d18bc307ef7 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 14 Jul 2025 23:27:18 -0700 Subject: [PATCH 030/243] Add action to detect windows line endings. (#8917) --- .github/workflows/check-line-endings.yml | 40 ++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 .github/workflows/check-line-endings.yml diff --git a/.github/workflows/check-line-endings.yml b/.github/workflows/check-line-endings.yml new file mode 100644 index 000000000..f20dca565 --- /dev/null +++ b/.github/workflows/check-line-endings.yml @@ -0,0 +1,40 @@ +name: Check for Windows Line Endings + +on: + pull_request: + branches: ['*'] # Trigger on all pull requests to any branch + +jobs: + check-line-endings: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Fetch all history to compare changes + + - name: Check for Windows line endings (CRLF) + run: | + # Get the list of changed files in the PR + CHANGED_FILES=$(git diff --name-only origin/${{ github.base_ref }}..HEAD) + + # Flag to track if CRLF is found + CRLF_FOUND=false + + # Loop through each changed file + for FILE in $CHANGED_FILES; do + # Check if the file exists and is a text file + if [ -f "$FILE" ] && file "$FILE" | grep -q "text"; then + # Check for CRLF line endings + if grep -UP '\r$' "$FILE"; then + echo "Error: Windows line endings (CRLF) detected in $FILE" + CRLF_FOUND=true + fi + fi + done + + # Exit with error if CRLF was found + if [ "$CRLF_FOUND" = true ]; then + exit 1 + fi From 6b8062f4141f6655c4cea5422c29aa0a8b532cdf Mon Sep 17 00:00:00 2001 From: Brandon Wallace Date: Tue, 15 Jul 2025 20:08:27 -0500 Subject: [PATCH 031/243] Fix MaskComposite error when destination has 2 dimensions (#8915) Fix code that is using the original `destination` input instead of the reshaped value. --- comfy_extras/nodes_mask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_extras/nodes_mask.py b/comfy_extras/nodes_mask.py index ab387a2fc..2b0f8dd5d 100644 --- a/comfy_extras/nodes_mask.py +++ b/comfy_extras/nodes_mask.py @@ -247,7 +247,7 @@ class MaskComposite: visible_width, visible_height = (right - left, bottom - top,) source_portion = source[:, :visible_height, :visible_width] - destination_portion = destination[:, top:bottom, left:right] + destination_portion = output[:, top:bottom, left:right] if operation == "multiply": output[:, top:bottom, left:right] = destination_portion * source_portion From 50afba747cd3413a6f6eb6703c627a24d2e6f165 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 16 Jul 2025 00:42:17 -0700 Subject: [PATCH 032/243] Add attempt to work around the safetensors mmap issue. 
(#8928) --- comfy/cli_args.py | 1 + comfy/utils.py | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/comfy/cli_args.py b/comfy/cli_args.py index 7234a7ba0..ef0d4337e 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -144,6 +144,7 @@ class PerformanceFeature(enum.Enum): parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: fp16_accumulation fp8_matrix_mult cublas_ops") parser.add_argument("--mmap-torch-files", action="store_true", help="Use mmap when loading ckpt/pt files.") +parser.add_argument("--disable-mmap", action="store_true", help="Don't use mmap when loading safetensors.") parser.add_argument("--dont-print-server", action="store_true", help="Don't print server output.") parser.add_argument("--quick-test-for-ci", action="store_true", help="Quick test for CI.") diff --git a/comfy/utils.py b/comfy/utils.py index f8e01f713..9c076a0e0 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -31,6 +31,7 @@ from einops import rearrange from comfy.cli_args import args MMAP_TORCH_FILES = args.mmap_torch_files +DISABLE_MMAP = args.disable_mmap ALWAYS_SAFE_LOAD = False if hasattr(torch.serialization, "add_safe_globals"): # TODO: this was added in pytorch 2.4, the unsafe path should be removed once earlier versions are deprecated @@ -58,7 +59,10 @@ def load_torch_file(ckpt, safe_load=False, device=None, return_metadata=False): with safetensors.safe_open(ckpt, framework="pt", device=device.type) as f: sd = {} for k in f.keys(): - sd[k] = f.get_tensor(k) + tensor = f.get_tensor(k) + if DISABLE_MMAP: # TODO: Not sure if this is the best way to bypass the mmap issues + tensor = tensor.to(device=device, copy=True) + sd[k] = tensor if return_metadata: metadata = f.metadata() except Exception as e: From 9bc2798f72947d6b76b3650c88a7b2f0afc1a8d0 Mon Sep 17 00:00:00 2001 From: Harel Cain Date: Wed, 16 Jul 2025 19:54:38 +0200 Subject: [PATCH 033/243] LTXV VAE decoder: switch default padding mode (#8930) --- comfy/ldm/lightricks/vae/causal_video_autoencoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/ldm/lightricks/vae/causal_video_autoencoder.py b/comfy/ldm/lightricks/vae/causal_video_autoencoder.py index f91870d71..75ed069ad 100644 --- a/comfy/ldm/lightricks/vae/causal_video_autoencoder.py +++ b/comfy/ldm/lightricks/vae/causal_video_autoencoder.py @@ -973,7 +973,7 @@ class VideoVAE(nn.Module): norm_layer=config.get("norm_layer", "group_norm"), causal=config.get("causal_decoder", False), timestep_conditioning=self.timestep_conditioning, - spatial_padding_mode=config.get("spatial_padding_mode", "zeros"), + spatial_padding_mode=config.get("spatial_padding_mode", "reflect"), ) self.per_channel_statistics = processor() From 491fafbd6428e59c52093d76fde0bc10dfa723bb Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 16 Jul 2025 11:42:07 -0700 Subject: [PATCH 034/243] Silence clip tokenizer warning. 
(#8934) --- comfy/sd1_tokenizer/tokenizer_config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/sd1_tokenizer/tokenizer_config.json b/comfy/sd1_tokenizer/tokenizer_config.json index 5ba7bf706..8f7b3151d 100644 --- a/comfy/sd1_tokenizer/tokenizer_config.json +++ b/comfy/sd1_tokenizer/tokenizer_config.json @@ -18,7 +18,7 @@ "single_word": false }, "errors": "replace", - "model_max_length": 77, + "model_max_length": 8192, "name_or_path": "openai/clip-vit-large-patch14", "pad_token": "<|endoftext|>", "special_tokens_map_file": "./special_tokens_map.json", From 650838fd6fcb24e66bf82b3c75383f35433d7b8a Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 17 Jul 2025 01:11:07 -0700 Subject: [PATCH 035/243] Experimental CFGNorm node. (#8942) This is from the new hidream e1 1 model code. Figured it might be useful as a generic cfg trick. --- comfy_extras/nodes_cfg.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/comfy_extras/nodes_cfg.py b/comfy_extras/nodes_cfg.py index 1fb686644..1acaf15bc 100644 --- a/comfy_extras/nodes_cfg.py +++ b/comfy_extras/nodes_cfg.py @@ -40,6 +40,32 @@ class CFGZeroStar: m.set_model_sampler_post_cfg_function(cfg_zero_star) return (m, ) +class CFGNorm: + @classmethod + def INPUT_TYPES(s): + return {"required": {"model": ("MODEL",), + "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step": 0.01}), + }} + RETURN_TYPES = ("MODEL",) + RETURN_NAMES = ("patched_model",) + FUNCTION = "patch" + CATEGORY = "advanced/guidance" + + def patch(self, model, strength): + m = model.clone() + def cfg_norm(args): + cond_p = args['cond_denoised'] + pred_text_ = args["denoised"] + + norm_full_cond = torch.norm(cond_p, dim=1, keepdim=True) + norm_pred_text = torch.norm(pred_text_, dim=1, keepdim=True) + scale = (norm_full_cond / (norm_pred_text + 1e-8)).clamp(min=0.0, max=1.0) + return pred_text_ * scale * strength + + m.set_model_sampler_post_cfg_function(cfg_norm) + return (m, ) + NODE_CLASS_MAPPINGS = { - "CFGZeroStar": CFGZeroStar + "CFGZeroStar": CFGZeroStar, + "CFGNorm": CFGNorm, } From 7f492522b6dcb142ff2c4d3438310773d9a80551 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 18 Jul 2025 02:43:02 -0700 Subject: [PATCH 036/243] Forgot this (#8957) --- comfy_extras/nodes_cfg.py | 1 + 1 file changed, 1 insertion(+) diff --git a/comfy_extras/nodes_cfg.py b/comfy_extras/nodes_cfg.py index 1acaf15bc..5abdc115a 100644 --- a/comfy_extras/nodes_cfg.py +++ b/comfy_extras/nodes_cfg.py @@ -50,6 +50,7 @@ class CFGNorm: RETURN_NAMES = ("patched_model",) FUNCTION = "patch" CATEGORY = "advanced/guidance" + EXPERIMENTAL = True def patch(self, model, strength): m = model.clone() From 1b96fae1d4a8425c44d4d3bd60acd818d05bf4f6 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 19 Jul 2025 01:55:23 -0700 Subject: [PATCH 037/243] Add nested style of dual cfg to DualCFGGuider node. 
(#8965) --- comfy_extras/nodes_custom_sampler.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/comfy_extras/nodes_custom_sampler.py b/comfy_extras/nodes_custom_sampler.py index 33bc41842..d17737e1a 100644 --- a/comfy_extras/nodes_custom_sampler.py +++ b/comfy_extras/nodes_custom_sampler.py @@ -683,9 +683,10 @@ class CFGGuider: return (guider,) class Guider_DualCFG(comfy.samplers.CFGGuider): - def set_cfg(self, cfg1, cfg2): + def set_cfg(self, cfg1, cfg2, nested=False): self.cfg1 = cfg1 self.cfg2 = cfg2 + self.nested = nested def set_conds(self, positive, middle, negative): middle = node_helpers.conditioning_set_values(middle, {"prompt_type": "negative"}) @@ -695,14 +696,20 @@ class Guider_DualCFG(comfy.samplers.CFGGuider): negative_cond = self.conds.get("negative", None) middle_cond = self.conds.get("middle", None) positive_cond = self.conds.get("positive", None) - if model_options.get("disable_cfg1_optimization", False) == False: - if math.isclose(self.cfg2, 1.0): - negative_cond = None - if math.isclose(self.cfg1, 1.0): - middle_cond = None - out = comfy.samplers.calc_cond_batch(self.inner_model, [negative_cond, middle_cond, positive_cond], x, timestep, model_options) - return comfy.samplers.cfg_function(self.inner_model, out[1], out[0], self.cfg2, x, timestep, model_options=model_options, cond=middle_cond, uncond=negative_cond) + (out[2] - out[1]) * self.cfg1 + if self.nested: + out = comfy.samplers.calc_cond_batch(self.inner_model, [negative_cond, middle_cond, positive_cond], x, timestep, model_options) + pred_text = comfy.samplers.cfg_function(self.inner_model, out[2], out[1], self.cfg1, x, timestep, model_options=model_options, cond=positive_cond, uncond=middle_cond) + return out[0] + self.cfg2 * (pred_text - out[0]) + else: + if model_options.get("disable_cfg1_optimization", False) == False: + if math.isclose(self.cfg2, 1.0): + negative_cond = None + if math.isclose(self.cfg1, 1.0): + middle_cond = None + + out = comfy.samplers.calc_cond_batch(self.inner_model, [negative_cond, middle_cond, positive_cond], x, timestep, model_options) + return comfy.samplers.cfg_function(self.inner_model, out[1], out[0], self.cfg2, x, timestep, model_options=model_options, cond=middle_cond, uncond=negative_cond) + (out[2] - out[1]) * self.cfg1 class DualCFGGuider: @classmethod @@ -714,6 +721,7 @@ class DualCFGGuider: "negative": ("CONDITIONING", ), "cfg_conds": ("FLOAT", {"default": 8.0, "min": 0.0, "max": 100.0, "step":0.1, "round": 0.01}), "cfg_cond2_negative": ("FLOAT", {"default": 8.0, "min": 0.0, "max": 100.0, "step":0.1, "round": 0.01}), + "style": (["regular", "nested"],), } } @@ -722,10 +730,10 @@ class DualCFGGuider: FUNCTION = "get_guider" CATEGORY = "sampling/custom_sampling/guiders" - def get_guider(self, model, cond1, cond2, negative, cfg_conds, cfg_cond2_negative): + def get_guider(self, model, cond1, cond2, negative, cfg_conds, cfg_cond2_negative, style): guider = Guider_DualCFG(model) guider.set_conds(cond1, cond2, negative) - guider.set_cfg(cfg_conds, cfg_cond2_negative) + guider.set_cfg(cfg_conds, cfg_cond2_negative, nested=(style == "nested")) return (guider,) class DisableNoise: From 1da5639e865a50f921d870a92c7c87110ce20c48 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Sat, 19 Jul 2025 18:08:00 +0800 Subject: [PATCH 038/243] Update template to 0.1.37 (#8967) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 7705918a8..a7e44095f 100644 --- 
a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.36 +comfyui-workflow-templates==0.1.37 comfyui-embedded-docs==0.2.4 torch torchsde From 100c2478eaba71ab735539fdc00c9d0de49bc224 Mon Sep 17 00:00:00 2001 From: chaObserv <154517000+chaObserv@users.noreply.github.com> Date: Sun, 20 Jul 2025 11:09:11 +0800 Subject: [PATCH 039/243] Add SamplingPercentToSigma node (#8963) It's helpful to adjust start_percent or end_percent based on the corresponding sigma. --- comfy_extras/nodes_custom_sampler.py | 30 ++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/comfy_extras/nodes_custom_sampler.py b/comfy_extras/nodes_custom_sampler.py index d17737e1a..d011f433b 100644 --- a/comfy_extras/nodes_custom_sampler.py +++ b/comfy_extras/nodes_custom_sampler.py @@ -301,6 +301,35 @@ class ExtendIntermediateSigmas: return (extended_sigmas,) + +class SamplingPercentToSigma: + @classmethod + def INPUT_TYPES(cls) -> InputTypeDict: + return { + "required": { + "model": (IO.MODEL, {}), + "sampling_percent": (IO.FLOAT, {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.0001}), + "return_actual_sigma": (IO.BOOLEAN, {"default": False, "tooltip": "Return the actual sigma value instead of the value used for interval checks.\nThis only affects results at 0.0 and 1.0."}), + } + } + + RETURN_TYPES = (IO.FLOAT,) + RETURN_NAMES = ("sigma_value",) + CATEGORY = "sampling/custom_sampling/sigmas" + + FUNCTION = "get_sigma" + + def get_sigma(self, model, sampling_percent, return_actual_sigma): + model_sampling = model.get_model_object("model_sampling") + sigma_val = model_sampling.percent_to_sigma(sampling_percent) + if return_actual_sigma: + if sampling_percent == 0.0: + sigma_val = model_sampling.sigma_max.item() + elif sampling_percent == 1.0: + sigma_val = model_sampling.sigma_min.item() + return (sigma_val,) + + class KSamplerSelect: @classmethod def INPUT_TYPES(s): @@ -887,6 +916,7 @@ NODE_CLASS_MAPPINGS = { "FlipSigmas": FlipSigmas, "SetFirstSigma": SetFirstSigma, "ExtendIntermediateSigmas": ExtendIntermediateSigmas, + "SamplingPercentToSigma": SamplingPercentToSigma, "CFGGuider": CFGGuider, "DualCFGGuider": DualCFGGuider, From a0c0785635a9f4d2da64b58fef063825f386d8da Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 19 Jul 2025 22:24:09 -0700 Subject: [PATCH 040/243] Document what the fast_fp16_accumulation is in the portable. 
(#8973) --- .ci/windows_base_files/README_VERY_IMPORTANT.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.ci/windows_base_files/README_VERY_IMPORTANT.txt b/.ci/windows_base_files/README_VERY_IMPORTANT.txt index d46acbcbf..8ab70c890 100755 --- a/.ci/windows_base_files/README_VERY_IMPORTANT.txt +++ b/.ci/windows_base_files/README_VERY_IMPORTANT.txt @@ -4,6 +4,9 @@ if you have a NVIDIA gpu: run_nvidia_gpu.bat +if you want to enable the fast fp16 accumulation (faster for fp16 models with slightly less quality): + +run_nvidia_gpu_fast_fp16_accumulation.bat To run it in slow CPU mode: From 7d627f764c2137d816a39adbc358cb28c1718a47 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Mon, 21 Jul 2025 03:58:35 +0800 Subject: [PATCH 041/243] Update template to 0.1.39 (#8981) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index a7e44095f..8f6a6d112 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.37 +comfyui-workflow-templates==0.1.39 comfyui-embedded-docs==0.2.4 torch torchsde From 9a470e073e2742d4edd6e7ea1ce28d861a77d9c4 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Mon, 21 Jul 2025 14:05:43 -0400 Subject: [PATCH 042/243] ComfyUI version 0.3.45 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 7981fbaca..180ecaf8a 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.44" +__version__ = "0.3.45" diff --git a/pyproject.toml b/pyproject.toml index 96ead2157..b1d6d9df6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.44" +version = "0.3.45" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From 54a45b996729b361ea12f473de760e481dcf1f0a Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 21 Jul 2025 11:19:14 -0700 Subject: [PATCH 043/243] Replace torchaudio.load with pyav. 
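The diff that follows swaps torchaudio.load for a small PyAV-based decoder. As a rough usage sketch of the helper it introduces (the file path is a placeholder; the real function is the module-level load() added to comfy_extras/nodes_audio.py):

from comfy_extras.nodes_audio import load  # PyAV-backed helper added in this patch

# "song.flac" is an example path; anything PyAV can demux should work here.
waveform, sample_rate = load("song.flac")
print(waveform.shape, waveform.dtype)  # (channels, samples), float32 after f32_pcm()
print(sample_rate)                     # sample rate reported by the audio stream

# LoadAudio then wraps this in the AUDIO dict format the audio nodes expect:
audio = {"waveform": waveform.unsqueeze(0), "sample_rate": sample_rate}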
(#8989) --- comfy_extras/nodes_audio.py | 58 ++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py index 8cd647846..38697240e 100644 --- a/comfy_extras/nodes_audio.py +++ b/comfy_extras/nodes_audio.py @@ -278,6 +278,62 @@ class PreviewAudio(SaveAudio): "hidden": {"prompt": "PROMPT", "extra_pnginfo": "EXTRA_PNGINFO"}, } +def f32_pcm(wav: torch.Tensor) -> torch.Tensor: + """Convert audio to float 32 bits PCM format.""" + if wav.dtype.is_floating_point: + return wav + elif wav.dtype == torch.int16: + return wav.float() / (2 ** 15) + elif wav.dtype == torch.int32: + return wav.float() / (2 ** 31) + raise ValueError(f"Unsupported wav dtype: {wav.dtype}") + +def load(filepath: str, frame_offset: int = 0, num_frames: int = -1) -> tuple[torch.Tensor, int]: + with av.open(filepath) as af: + if not af.streams.audio: + raise ValueError("No audio stream found in the file.") + + stream = af.streams.audio[0] + sr = stream.codec_context.sample_rate + n_channels = stream.channels + + seek_time = frame_offset / sr if frame_offset > 0 else 0.0 + duration = num_frames / sr if num_frames > 0 else -1.0 + + sample_offset = int(sr * seek_time) + num_samples = int(sr * duration) if duration >= 0 else -1 + + # Small negative offset for MP3 artifacts, NOTE: this is LLM code so idk if it's actually necessary' + seek_sec = max(0, seek_time - 0.1) if filepath.lower().endswith('.mp3') else seek_time + af.seek(int(seek_sec / stream.time_base), stream=stream) + + frames = [] + length = 0 + for frame in af.decode(streams=stream.index): + current_offset = int(frame.rate * frame.pts * frame.time_base) + strip = max(0, sample_offset - current_offset) + + buf = torch.from_numpy(frame.to_ndarray()) + if buf.shape[0] != n_channels: + buf = buf.view(-1, n_channels).t() + + buf = buf[:, strip:] + frames.append(buf) + length += buf.shape[1] + + if num_samples > 0 and length >= num_samples: + break + + if not frames: + raise ValueError("No audio frames decoded.") + + wav = torch.cat(frames, dim=1) + if num_samples > 0: + wav = wav[:, :num_samples] + + wav = f32_pcm(wav) + return wav, sr + class LoadAudio: @classmethod def INPUT_TYPES(s): @@ -292,7 +348,7 @@ class LoadAudio: def load(self, audio): audio_path = folder_paths.get_annotated_filepath(audio) - waveform, sample_rate = torchaudio.load(audio_path) + waveform, sample_rate = load(audio_path) audio = {"waveform": waveform.unsqueeze(0), "sample_rate": sample_rate} return (audio, ) From 5249e45a1c7d91656ebefdebe3815005ec3d39d7 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 21 Jul 2025 12:23:41 -0700 Subject: [PATCH 044/243] Add hidream e1.1 example to readme. 
(#8990) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 0e021a687..d004364ee 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,7 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith - Image Editing Models - [Omnigen 2](https://comfyanonymous.github.io/ComfyUI_examples/omnigen/) - [Flux Kontext](https://comfyanonymous.github.io/ComfyUI_examples/flux/#flux-kontext-image-editing-model) + - [HiDream E1.1](https://comfyanonymous.github.io/ComfyUI_examples/hidream/#hidream-e11) - Video Models - [Stable Video Diffusion](https://comfyanonymous.github.io/ComfyUI_examples/video/) - [Mochi](https://comfyanonymous.github.io/ComfyUI_examples/mochi/) From 0aa1c58b04b27311c6ba38b1d9949e7e20037d00 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 21 Jul 2025 13:48:25 -0700 Subject: [PATCH 045/243] This is not needed. (#8991) --- comfy_extras/nodes_audio.py | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py index 38697240e..a90b31779 100644 --- a/comfy_extras/nodes_audio.py +++ b/comfy_extras/nodes_audio.py @@ -288,7 +288,7 @@ def f32_pcm(wav: torch.Tensor) -> torch.Tensor: return wav.float() / (2 ** 31) raise ValueError(f"Unsupported wav dtype: {wav.dtype}") -def load(filepath: str, frame_offset: int = 0, num_frames: int = -1) -> tuple[torch.Tensor, int]: +def load(filepath: str) -> tuple[torch.Tensor, int]: with av.open(filepath) as af: if not af.streams.audio: raise ValueError("No audio stream found in the file.") @@ -297,40 +297,20 @@ def load(filepath: str, frame_offset: int = 0, num_frames: int = -1) -> tuple[to sr = stream.codec_context.sample_rate n_channels = stream.channels - seek_time = frame_offset / sr if frame_offset > 0 else 0.0 - duration = num_frames / sr if num_frames > 0 else -1.0 - - sample_offset = int(sr * seek_time) - num_samples = int(sr * duration) if duration >= 0 else -1 - - # Small negative offset for MP3 artifacts, NOTE: this is LLM code so idk if it's actually necessary' - seek_sec = max(0, seek_time - 0.1) if filepath.lower().endswith('.mp3') else seek_time - af.seek(int(seek_sec / stream.time_base), stream=stream) - frames = [] length = 0 for frame in af.decode(streams=stream.index): - current_offset = int(frame.rate * frame.pts * frame.time_base) - strip = max(0, sample_offset - current_offset) - buf = torch.from_numpy(frame.to_ndarray()) if buf.shape[0] != n_channels: buf = buf.view(-1, n_channels).t() - buf = buf[:, strip:] frames.append(buf) length += buf.shape[1] - if num_samples > 0 and length >= num_samples: - break - if not frames: raise ValueError("No audio frames decoded.") wav = torch.cat(frames, dim=1) - if num_samples > 0: - wav = wav[:, :num_samples] - wav = f32_pcm(wav) return wav, sr From 5ac9ec214ba3ef1632701416f27948a57ec60919 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 22 Jul 2025 01:07:51 -0700 Subject: [PATCH 046/243] Try to fix line endings workflow. 
(#9001) --- .github/workflows/check-line-endings.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/check-line-endings.yml b/.github/workflows/check-line-endings.yml index f20dca565..03b3e3ced 100644 --- a/.github/workflows/check-line-endings.yml +++ b/.github/workflows/check-line-endings.yml @@ -17,6 +17,7 @@ jobs: - name: Check for Windows line endings (CRLF) run: | # Get the list of changed files in the PR + git merge origin/${{ github.base_ref }} --no-edit CHANGED_FILES=$(git diff --name-only origin/${{ github.base_ref }}..HEAD) # Flag to track if CRLF is found From 255f1398638b265a47d0e74fb4759fe6cfc3b3d4 Mon Sep 17 00:00:00 2001 From: Simon Lui <502929+simonlui@users.noreply.github.com> Date: Tue, 22 Jul 2025 12:20:09 -0700 Subject: [PATCH 047/243] Add xpu version for async offload and some other things. (#9004) --- comfy/model_management.py | 41 +++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 816caf18f..ab1e9bf3a 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -101,7 +101,7 @@ if args.directml is not None: lowvram_available = False #TODO: need to find a way to get free memory in directml before this can be enabled by default. try: - import intel_extension_for_pytorch as ipex + import intel_extension_for_pytorch as ipex # noqa: F401 _ = torch.xpu.device_count() xpu_available = xpu_available or torch.xpu.is_available() except: @@ -186,8 +186,12 @@ def get_total_memory(dev=None, torch_total_too=False): elif is_intel_xpu(): stats = torch.xpu.memory_stats(dev) mem_reserved = stats['reserved_bytes.all.current'] + if torch_version_numeric < (2, 6): + mem_total_xpu = torch.xpu.get_device_properties(dev).total_memory + else: + _, mem_total_xpu = torch.xpu.mem_get_info(dev) mem_total_torch = mem_reserved - mem_total = torch.xpu.get_device_properties(dev).total_memory + mem_total = mem_total_xpu elif is_ascend_npu(): stats = torch.npu.memory_stats(dev) mem_reserved = stats['reserved_bytes.all.current'] @@ -929,7 +933,7 @@ def device_supports_non_blocking(device): if is_device_mps(device): return False #pytorch bug? 
mps doesn't support non blocking if is_intel_xpu(): - return False + return True if args.deterministic: #TODO: figure out why deterministic breaks non blocking from gpu to cpu (previews) return False if directml_enabled: @@ -968,6 +972,8 @@ def get_offload_stream(device): stream_counter = (stream_counter + 1) % len(ss) if is_device_cuda(device): ss[stream_counter].wait_stream(torch.cuda.current_stream()) + elif is_device_xpu(device): + ss[stream_counter].wait_stream(torch.xpu.current_stream()) stream_counters[device] = stream_counter return s elif is_device_cuda(device): @@ -979,6 +985,15 @@ def get_offload_stream(device): stream_counter = (stream_counter + 1) % len(ss) stream_counters[device] = stream_counter return s + elif is_device_xpu(device): + ss = [] + for k in range(NUM_STREAMS): + ss.append(torch.xpu.Stream(device=device, priority=0)) + STREAMS[device] = ss + s = ss[stream_counter] + stream_counter = (stream_counter + 1) % len(ss) + stream_counters[device] = stream_counter + return s return None def sync_stream(device, stream): @@ -986,6 +1001,8 @@ def sync_stream(device, stream): return if is_device_cuda(device): torch.cuda.current_stream().wait_stream(stream) + elif is_device_xpu(device): + torch.xpu.current_stream().wait_stream(stream) def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, stream=None): if device is None or weight.device == device: @@ -1092,8 +1109,11 @@ def get_free_memory(dev=None, torch_free_too=False): stats = torch.xpu.memory_stats(dev) mem_active = stats['active_bytes.all.current'] mem_reserved = stats['reserved_bytes.all.current'] + if torch_version_numeric < (2, 6): + mem_free_xpu = torch.xpu.get_device_properties(dev).total_memory - mem_reserved + else: + mem_free_xpu, _ = torch.xpu.mem_get_info(dev) mem_free_torch = mem_reserved - mem_active - mem_free_xpu = torch.xpu.get_device_properties(dev).total_memory - mem_reserved mem_free_total = mem_free_xpu + mem_free_torch elif is_ascend_npu(): stats = torch.npu.memory_stats(dev) @@ -1142,6 +1162,9 @@ def is_device_cpu(device): def is_device_mps(device): return is_device_type(device, 'mps') +def is_device_xpu(device): + return is_device_type(device, 'xpu') + def is_device_cuda(device): return is_device_type(device, 'cuda') @@ -1173,7 +1196,10 @@ def should_use_fp16(device=None, model_params=0, prioritize_performance=True, ma return False if is_intel_xpu(): - return True + if torch_version_numeric < (2, 3): + return True + else: + return torch.xpu.get_device_properties(device).has_fp16 if is_ascend_npu(): return True @@ -1236,7 +1262,10 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma return False if is_intel_xpu(): - return True + if torch_version_numeric < (2, 6): + return True + else: + return torch.xpu.get_device_capability(device)['has_bfloat16_conversions'] if is_ascend_npu(): return True From 5ad33787dee43d36f8d054c590818b3153b55370 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 23 Jul 2025 11:20:49 -0700 Subject: [PATCH 048/243] Add default device argument. 
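A quick worked example of what the new flag changes, mirroring the main.py hunk below: with --default-device the chosen id is moved to the front of CUDA_VISIBLE_DEVICES / HIP_VISIBLE_DEVICES instead of hiding the other GPUs (the device id here is made up for illustration):

# Reproduces the ordering logic from the main.py hunk in this patch.
default_dev = 2  # e.g. python main.py --default-device 2
devices = list(range(32))
devices.remove(default_dev)
devices.insert(0, default_dev)
print(",".join(map(str, devices)))  # "2,0,1,3,4,...,31": GPU 2 becomes cuda:0, the rest stay visible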
(#9023) --- comfy/cli_args.py | 3 ++- comfy/model_management.py | 1 + main.py | 9 +++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/comfy/cli_args.py b/comfy/cli_args.py index ef0d4337e..0d760d524 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -49,7 +49,8 @@ parser.add_argument("--temp-directory", type=str, default=None, help="Set the Co parser.add_argument("--input-directory", type=str, default=None, help="Set the ComfyUI input directory. Overrides --base-directory.") parser.add_argument("--auto-launch", action="store_true", help="Automatically launch ComfyUI in the default browser.") parser.add_argument("--disable-auto-launch", action="store_true", help="Disable auto launching the browser.") -parser.add_argument("--cuda-device", type=int, default=None, metavar="DEVICE_ID", help="Set the id of the cuda device this instance will use.") +parser.add_argument("--cuda-device", type=int, default=None, metavar="DEVICE_ID", help="Set the id of the cuda device this instance will use. All other devices will not be visible.") +parser.add_argument("--default-device", type=int, default=None, metavar="DEFAULT_DEVICE_ID", help="Set the id of the default device, all other devices will stay visible.") cm_group = parser.add_mutually_exclusive_group() cm_group.add_argument("--cuda-malloc", action="store_true", help="Enable cudaMallocAsync (enabled by default for torch 2.0 and up).") cm_group.add_argument("--disable-cuda-malloc", action="store_true", help="Disable cudaMallocAsync.") diff --git a/comfy/model_management.py b/comfy/model_management.py index ab1e9bf3a..346673895 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -880,6 +880,7 @@ def vae_dtype(device=None, allowed_dtypes=[]): return d # NOTE: bfloat16 seems to work on AMD for the VAE but is extremely slow in some cases compared to fp32 + # slowness still a problem on pytorch nightly 2.9.0.dev20250720+rocm6.4 tested on RDNA3 if d == torch.bfloat16 and (not is_amd()) and should_use_bf16(device): return d diff --git a/main.py b/main.py index 2b4ffafd4..e8ca8152a 100644 --- a/main.py +++ b/main.py @@ -115,6 +115,15 @@ if os.name == "nt": logging.getLogger("xformers").addFilter(lambda record: 'A matching Triton is not available' not in record.getMessage()) if __name__ == "__main__": + if args.default_device is not None: + default_dev = args.default_device + devices = list(range(32)) + devices.remove(default_dev) + devices.insert(0, default_dev) + devices = ','.join(map(str, devices)) + os.environ['CUDA_VISIBLE_DEVICES'] = str(devices) + os.environ['HIP_VISIBLE_DEVICES'] = str(devices) + if args.cuda_device is not None: os.environ['CUDA_VISIBLE_DEVICES'] = str(args.cuda_device) os.environ['HIP_VISIBLE_DEVICES'] = str(args.cuda_device) From 39dda1d40d1f2f18ccda8ade860932d0b8a07af4 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 23 Jul 2025 15:10:59 -0700 Subject: [PATCH 049/243] Fix xpu function not implemented. 
(#9026) --- comfy/model_management.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 346673895..746b063ed 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -186,10 +186,7 @@ def get_total_memory(dev=None, torch_total_too=False): elif is_intel_xpu(): stats = torch.xpu.memory_stats(dev) mem_reserved = stats['reserved_bytes.all.current'] - if torch_version_numeric < (2, 6): - mem_total_xpu = torch.xpu.get_device_properties(dev).total_memory - else: - _, mem_total_xpu = torch.xpu.mem_get_info(dev) + mem_total_xpu = torch.xpu.get_device_properties(dev).total_memory mem_total_torch = mem_reserved mem_total = mem_total_xpu elif is_ascend_npu(): From a86a58c308c2423e86054462a8c9f1125536a034 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 23 Jul 2025 15:18:20 -0700 Subject: [PATCH 050/243] Fix xpu function not implemented p2. (#9027) --- comfy/model_management.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 746b063ed..42873d09b 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1107,10 +1107,7 @@ def get_free_memory(dev=None, torch_free_too=False): stats = torch.xpu.memory_stats(dev) mem_active = stats['active_bytes.all.current'] mem_reserved = stats['reserved_bytes.all.current'] - if torch_version_numeric < (2, 6): - mem_free_xpu = torch.xpu.get_device_properties(dev).total_memory - mem_reserved - else: - mem_free_xpu, _ = torch.xpu.mem_get_info(dev) + mem_free_xpu = torch.xpu.get_device_properties(dev).total_memory - mem_reserved mem_free_torch = mem_reserved - mem_active mem_free_total = mem_free_xpu + mem_free_torch elif is_ascend_npu(): From d3504e1778c0cc8992b04fe30dc0fae239c13713 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 23 Jul 2025 16:21:29 -0700 Subject: [PATCH 051/243] Enable pytorch attention by default for gfx1201 on torch 2.8 (#9029) --- comfy/model_management.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 42873d09b..e8b9b5c81 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -308,7 +308,10 @@ try: logging.info("ROCm version: {}".format(rocm_version)) if args.use_split_cross_attention == False and args.use_quad_cross_attention == False: if torch_version_numeric >= (2, 7): # works on 2.6 but doesn't actually seem to improve much - if any((a in arch) for a in ["gfx90a", "gfx942", "gfx1100", "gfx1101", "gfx1151"]): # TODO: more arches, TODO: gfx1201 and gfx950 + if any((a in arch) for a in ["gfx90a", "gfx942", "gfx1100", "gfx1101", "gfx1151"]): # TODO: more arches, TODO: gfx950 + ENABLE_PYTORCH_ATTENTION = True + if torch_version_numeric >= (2, 8): + if any((a in arch) for a in ["gfx1201"]): ENABLE_PYTORCH_ATTENTION = True if torch_version_numeric >= (2, 7) and rocm_version >= (6, 4): if any((a in arch) for a in ["gfx1201", "gfx942", "gfx950"]): # TODO: more arches From e78d2304966b6265fa2320b4d87dca534ea15642 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 23 Jul 2025 16:37:43 -0700 Subject: [PATCH 052/243] Only enable cuda malloc on cuda torch. 
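The hunk below keys the default off the local version label in the torch wheel string, so cudaMallocAsync is only auto-enabled on CUDA builds. A small illustration (version strings are examples, not taken from this repo):

for version in ("2.4.1+cu124", "2.4.1+rocm6.1", "2.7.0+xpu", "2.4.1"):
    enable_by_default = int(version[0]) >= 2 and "+cu" in version
    print(version, "->", enable_by_default)  # only the +cu build passes the new check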
(#9031) --- cuda_malloc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cuda_malloc.py b/cuda_malloc.py index eb2857c5f..c1d9ae3ca 100644 --- a/cuda_malloc.py +++ b/cuda_malloc.py @@ -74,7 +74,8 @@ if not args.cuda_malloc: module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) version = module.__version__ - if int(version[0]) >= 2: #enable by default for torch version 2.0 and up + + if int(version[0]) >= 2 and "+cu" in version: #enable by default for torch version 2.0 and up only on cuda torch args.cuda_malloc = cuda_malloc_supported() except: pass From e729a5cc1157bc0ece7daae9583c3a5a3ba95fbb Mon Sep 17 00:00:00 2001 From: chaObserv <154517000+chaObserv@users.noreply.github.com> Date: Thu, 24 Jul 2025 07:47:05 +0800 Subject: [PATCH 053/243] Separate denoised and noise estimation in Euler CFG++ (#9008) This will change their behavior with the sampling CONST type. It also combines euler_cfg_pp and euler_ancestral_cfg_pp into one main function. --- comfy/k_diffusion/sampling.py | 64 +++++++++++++++++------------------ 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/comfy/k_diffusion/sampling.py b/comfy/k_diffusion/sampling.py index 2ed415b1f..a2bc492fd 100644 --- a/comfy/k_diffusion/sampling.py +++ b/comfy/k_diffusion/sampling.py @@ -1210,39 +1210,21 @@ def sample_deis(model, x, sigmas, extra_args=None, callback=None, disable=None, return x_next -@torch.no_grad() -def sample_euler_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None): - extra_args = {} if extra_args is None else extra_args - - temp = [0] - def post_cfg_function(args): - temp[0] = args["uncond_denoised"] - return args["denoised"] - - model_options = extra_args.get("model_options", {}).copy() - extra_args["model_options"] = comfy.model_patcher.set_model_options_post_cfg_function(model_options, post_cfg_function, disable_cfg1_optimization=True) - - s_in = x.new_ones([x.shape[0]]) - for i in trange(len(sigmas) - 1, disable=disable): - sigma_hat = sigmas[i] - denoised = model(x, sigma_hat * s_in, **extra_args) - d = to_d(x, sigma_hat, temp[0]) - if callback is not None: - callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigma_hat, 'denoised': denoised}) - # Euler method - x = denoised + d * sigmas[i + 1] - return x - @torch.no_grad() def sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None): - """Ancestral sampling with Euler method steps.""" + """Ancestral sampling with Euler method steps (CFG++).""" extra_args = {} if extra_args is None else extra_args seed = extra_args.get("seed", None) noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler - temp = [0] + model_sampling = model.inner_model.model_patcher.get_model_object("model_sampling") + lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling) + + uncond_denoised = None + def post_cfg_function(args): - temp[0] = args["uncond_denoised"] + nonlocal uncond_denoised + uncond_denoised = args["uncond_denoised"] return args["denoised"] model_options = extra_args.get("model_options", {}).copy() @@ -1251,15 +1233,33 @@ def sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=No s_in = x.new_ones([x.shape[0]]) for i in trange(len(sigmas) - 1, disable=disable): denoised = model(x, sigmas[i] * s_in, **extra_args) - sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1], eta=eta) if callback is not None: callback({'x': x, 'i': i, 
'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised}) - d = to_d(x, sigmas[i], temp[0]) - # Euler method - x = denoised + d * sigma_down - if sigmas[i + 1] > 0: - x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up + if sigmas[i + 1] == 0: + # Denoising step + x = denoised + else: + alpha_s = sigmas[i] * lambda_fn(sigmas[i]).exp() + alpha_t = sigmas[i + 1] * lambda_fn(sigmas[i + 1]).exp() + d = to_d(x, sigmas[i], alpha_s * uncond_denoised) # to noise + + # DDIM stochastic sampling + sigma_down, sigma_up = get_ancestral_step(sigmas[i] / alpha_s, sigmas[i + 1] / alpha_t, eta=eta) + sigma_down = alpha_t * sigma_down + + # Euler method + x = alpha_t * denoised + sigma_down * d + if eta > 0 and s_noise > 0: + x = x + alpha_t * noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up return x + + +@torch.no_grad() +def sample_euler_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None): + """Euler method steps (CFG++).""" + return sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=0.0, s_noise=0.0, noise_sampler=None) + + @torch.no_grad() def sample_dpmpp_2s_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None): """Ancestral sampling with DPM-Solver++(2S) second-order steps.""" From eb2f78b4e09b1970e2fc51fc5d2e062f1a826399 Mon Sep 17 00:00:00 2001 From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com> Date: Thu, 24 Jul 2025 08:57:27 +0800 Subject: [PATCH 054/243] [Training Node] algo support, grad acc, optional grad ckpt (#9015) * Add factorization utils for lokr * Add lokr train impl * Add loha train impl * Add adapter map for algo selection * Add optional grad ckpt and algo selection * Update __init__.py * correct key name for loha * Use custom fwd/bwd func and better init for loha * Support gradient accumulation * Fix bugs of loha * use more stable init * Add OFT training * linting --- comfy/weight_adapter/__init__.py | 13 ++- comfy/weight_adapter/base.py | 40 +++++++++ comfy/weight_adapter/loha.py | 134 ++++++++++++++++++++++++++++++- comfy/weight_adapter/lokr.py | 86 +++++++++++++++++++- comfy/weight_adapter/oft.py | 67 +++++++++++++++- comfy_extras/nodes_train.py | 47 ++++++++--- 6 files changed, 372 insertions(+), 15 deletions(-) diff --git a/comfy/weight_adapter/__init__.py b/comfy/weight_adapter/__init__.py index 560b82be3..b40f920e4 100644 --- a/comfy/weight_adapter/__init__.py +++ b/comfy/weight_adapter/__init__.py @@ -15,9 +15,20 @@ adapters: list[type[WeightAdapterBase]] = [ OFTAdapter, BOFTAdapter, ] +adapter_maps: dict[str, type[WeightAdapterBase]] = { + "LoRA": LoRAAdapter, + "LoHa": LoHaAdapter, + "LoKr": LoKrAdapter, + "OFT": OFTAdapter, + ## We disable not implemented algo for now + # "GLoRA": GLoRAAdapter, + # "BOFT": BOFTAdapter, +} + __all__ = [ "WeightAdapterBase", "WeightAdapterTrainBase", - "adapters" + "adapters", + "adapter_maps", ] + [a.__name__ for a in adapters] diff --git a/comfy/weight_adapter/base.py b/comfy/weight_adapter/base.py index b5c7db423..43644b106 100644 --- a/comfy/weight_adapter/base.py +++ b/comfy/weight_adapter/base.py @@ -133,3 +133,43 @@ def tucker_weight_from_conv(up, down, mid): def tucker_weight(wa, wb, t): temp = torch.einsum("i j ..., j r -> i r ...", t, wb) return torch.einsum("i j ..., i r -> r j ...", temp, wa) + + +def factorization(dimension: int, factor: int = -1) -> tuple[int, int]: + """ + return a tuple of two value of input dimension 
decomposed by the number closest to factor + second value is higher or equal than first value. + + examples) + factor + -1 2 4 8 16 ... + 127 -> 1, 127 127 -> 1, 127 127 -> 1, 127 127 -> 1, 127 127 -> 1, 127 + 128 -> 8, 16 128 -> 2, 64 128 -> 4, 32 128 -> 8, 16 128 -> 8, 16 + 250 -> 10, 25 250 -> 2, 125 250 -> 2, 125 250 -> 5, 50 250 -> 10, 25 + 360 -> 8, 45 360 -> 2, 180 360 -> 4, 90 360 -> 8, 45 360 -> 12, 30 + 512 -> 16, 32 512 -> 2, 256 512 -> 4, 128 512 -> 8, 64 512 -> 16, 32 + 1024 -> 32, 32 1024 -> 2, 512 1024 -> 4, 256 1024 -> 8, 128 1024 -> 16, 64 + """ + + if factor > 0 and (dimension % factor) == 0 and dimension >= factor**2: + m = factor + n = dimension // factor + if m > n: + n, m = m, n + return m, n + if factor < 0: + factor = dimension + m, n = 1, dimension + length = m + n + while m < n: + new_m = m + 1 + while dimension % new_m != 0: + new_m += 1 + new_n = dimension // new_m + if new_m + new_n > length or new_m > factor: + break + else: + m, n = new_m, new_n + if m > n: + n, m = m, n + return m, n diff --git a/comfy/weight_adapter/loha.py b/comfy/weight_adapter/loha.py index ce79abad5..55c97a3af 100644 --- a/comfy/weight_adapter/loha.py +++ b/comfy/weight_adapter/loha.py @@ -3,7 +3,120 @@ from typing import Optional import torch import comfy.model_management -from .base import WeightAdapterBase, weight_decompose +from .base import WeightAdapterBase, WeightAdapterTrainBase, weight_decompose + + +class HadaWeight(torch.autograd.Function): + @staticmethod + def forward(ctx, w1u, w1d, w2u, w2d, scale=torch.tensor(1)): + ctx.save_for_backward(w1d, w1u, w2d, w2u, scale) + diff_weight = ((w1u @ w1d) * (w2u @ w2d)) * scale + return diff_weight + + @staticmethod + def backward(ctx, grad_out): + (w1d, w1u, w2d, w2u, scale) = ctx.saved_tensors + grad_out = grad_out * scale + temp = grad_out * (w2u @ w2d) + grad_w1u = temp @ w1d.T + grad_w1d = w1u.T @ temp + + temp = grad_out * (w1u @ w1d) + grad_w2u = temp @ w2d.T + grad_w2d = w2u.T @ temp + + del temp + return grad_w1u, grad_w1d, grad_w2u, grad_w2d, None + + +class HadaWeightTucker(torch.autograd.Function): + @staticmethod + def forward(ctx, t1, w1u, w1d, t2, w2u, w2d, scale=torch.tensor(1)): + ctx.save_for_backward(t1, w1d, w1u, t2, w2d, w2u, scale) + + rebuild1 = torch.einsum("i j ..., j r, i p -> p r ...", t1, w1d, w1u) + rebuild2 = torch.einsum("i j ..., j r, i p -> p r ...", t2, w2d, w2u) + + return rebuild1 * rebuild2 * scale + + @staticmethod + def backward(ctx, grad_out): + (t1, w1d, w1u, t2, w2d, w2u, scale) = ctx.saved_tensors + grad_out = grad_out * scale + + temp = torch.einsum("i j ..., j r -> i r ...", t2, w2d) + rebuild = torch.einsum("i j ..., i r -> r j ...", temp, w2u) + + grad_w = rebuild * grad_out + del rebuild + + grad_w1u = torch.einsum("r j ..., i j ... -> r i", temp, grad_w) + grad_temp = torch.einsum("i j ..., i r -> r j ...", grad_w, w1u.T) + del grad_w, temp + + grad_w1d = torch.einsum("i r ..., i j ... -> r j", t1, grad_temp) + grad_t1 = torch.einsum("i j ..., j r -> i r ...", grad_temp, w1d.T) + del grad_temp + + temp = torch.einsum("i j ..., j r -> i r ...", t1, w1d) + rebuild = torch.einsum("i j ..., i r -> r j ...", temp, w1u) + + grad_w = rebuild * grad_out + del rebuild + + grad_w2u = torch.einsum("r j ..., i j ... -> r i", temp, grad_w) + grad_temp = torch.einsum("i j ..., i r -> r j ...", grad_w, w2u.T) + del grad_w, temp + + grad_w2d = torch.einsum("i r ..., i j ... 
-> r j", t2, grad_temp) + grad_t2 = torch.einsum("i j ..., j r -> i r ...", grad_temp, w2d.T) + del grad_temp + return grad_t1, grad_w1u, grad_w1d, grad_t2, grad_w2u, grad_w2d, None + + +class LohaDiff(WeightAdapterTrainBase): + def __init__(self, weights): + super().__init__() + # Unpack weights tuple from LoHaAdapter + w1a, w1b, alpha, w2a, w2b, t1, t2, _ = weights + + # Create trainable parameters + self.hada_w1_a = torch.nn.Parameter(w1a) + self.hada_w1_b = torch.nn.Parameter(w1b) + self.hada_w2_a = torch.nn.Parameter(w2a) + self.hada_w2_b = torch.nn.Parameter(w2b) + + self.use_tucker = False + if t1 is not None and t2 is not None: + self.use_tucker = True + self.hada_t1 = torch.nn.Parameter(t1) + self.hada_t2 = torch.nn.Parameter(t2) + else: + # Keep the attributes for consistent access + self.hada_t1 = None + self.hada_t2 = None + + # Store rank and non-trainable alpha + self.rank = w1b.shape[0] + self.alpha = torch.nn.Parameter(torch.tensor(alpha), requires_grad=False) + + def __call__(self, w): + org_dtype = w.dtype + + scale = self.alpha / self.rank + if self.use_tucker: + diff_weight = HadaWeightTucker.apply(self.hada_t1, self.hada_w1_a, self.hada_w1_b, self.hada_t2, self.hada_w2_a, self.hada_w2_b, scale) + else: + diff_weight = HadaWeight.apply(self.hada_w1_a, self.hada_w1_b, self.hada_w2_a, self.hada_w2_b, scale) + + # Add the scaled difference to the original weight + weight = w.to(diff_weight) + diff_weight.reshape(w.shape) + + return weight.to(org_dtype) + + def passive_memory_usage(self): + """Calculates memory usage of the trainable parameters.""" + return sum(param.numel() * param.element_size() for param in self.parameters()) class LoHaAdapter(WeightAdapterBase): @@ -13,6 +126,25 @@ class LoHaAdapter(WeightAdapterBase): self.loaded_keys = loaded_keys self.weights = weights + @classmethod + def create_train(cls, weight, rank=1, alpha=1.0): + out_dim = weight.shape[0] + in_dim = weight.shape[1:].numel() + mat1 = torch.empty(out_dim, rank, device=weight.device, dtype=weight.dtype) + mat2 = torch.empty(rank, in_dim, device=weight.device, dtype=weight.dtype) + torch.nn.init.normal_(mat1, 0.1) + torch.nn.init.constant_(mat2, 0.0) + mat3 = torch.empty(out_dim, rank, device=weight.device, dtype=weight.dtype) + mat4 = torch.empty(rank, in_dim, device=weight.device, dtype=weight.dtype) + torch.nn.init.normal_(mat3, 0.1) + torch.nn.init.normal_(mat4, 0.01) + return LohaDiff( + (mat1, mat2, alpha, mat3, mat4, None, None, None) + ) + + def to_train(self): + return LohaDiff(self.weights) + @classmethod def load( cls, diff --git a/comfy/weight_adapter/lokr.py b/comfy/weight_adapter/lokr.py index 51233db2d..49b0be55f 100644 --- a/comfy/weight_adapter/lokr.py +++ b/comfy/weight_adapter/lokr.py @@ -3,7 +3,77 @@ from typing import Optional import torch import comfy.model_management -from .base import WeightAdapterBase, weight_decompose +from .base import ( + WeightAdapterBase, + WeightAdapterTrainBase, + weight_decompose, + factorization, +) + + +class LokrDiff(WeightAdapterTrainBase): + def __init__(self, weights): + super().__init__() + (lokr_w1, lokr_w2, alpha, lokr_w1_a, lokr_w1_b, lokr_w2_a, lokr_w2_b, lokr_t2, dora_scale) = weights + self.use_tucker = False + if lokr_w1_a is not None: + _, rank_a = lokr_w1_a.shape[0], lokr_w1_a.shape[1] + rank_a, _ = lokr_w1_b.shape[0], lokr_w1_b.shape[1] + self.lokr_w1_a = torch.nn.Parameter(lokr_w1_a) + self.lokr_w1_b = torch.nn.Parameter(lokr_w1_b) + self.w1_rebuild = True + self.ranka = rank_a + + if lokr_w2_a is not None: + _, rank_b = 
lokr_w2_a.shape[0], lokr_w2_a.shape[1] + rank_b, _ = lokr_w2_b.shape[0], lokr_w2_b.shape[1] + self.lokr_w2_a = torch.nn.Parameter(lokr_w2_a) + self.lokr_w2_b = torch.nn.Parameter(lokr_w2_b) + if lokr_t2 is not None: + self.use_tucker = True + self.lokr_t2 = torch.nn.Parameter(lokr_t2) + self.w2_rebuild = True + self.rankb = rank_b + + if lokr_w1 is not None: + self.lokr_w1 = torch.nn.Parameter(lokr_w1) + self.w1_rebuild = False + + if lokr_w2 is not None: + self.lokr_w2 = torch.nn.Parameter(lokr_w2) + self.w2_rebuild = False + + self.alpha = torch.nn.Parameter(torch.tensor(alpha), requires_grad=False) + + @property + def w1(self): + if self.w1_rebuild: + return (self.lokr_w1_a @ self.lokr_w1_b) * (self.alpha / self.ranka) + else: + return self.lokr_w1 + + @property + def w2(self): + if self.w2_rebuild: + if self.use_tucker: + w2 = torch.einsum( + 'i j k l, j r, i p -> p r k l', + self.lokr_t2, + self.lokr_w2_b, + self.lokr_w2_a + ) + else: + w2 = self.lokr_w2_a @ self.lokr_w2_b + return w2 * (self.alpha / self.rankb) + else: + return self.lokr_w2 + + def __call__(self, w): + diff = torch.kron(self.w1, self.w2) + return w + diff.reshape(w.shape).to(w) + + def passive_memory_usage(self): + return sum(param.numel() * param.element_size() for param in self.parameters()) class LoKrAdapter(WeightAdapterBase): @@ -13,6 +83,20 @@ class LoKrAdapter(WeightAdapterBase): self.loaded_keys = loaded_keys self.weights = weights + @classmethod + def create_train(cls, weight, rank=1, alpha=1.0): + out_dim = weight.shape[0] + in_dim = weight.shape[1:].numel() + out1, out2 = factorization(out_dim, rank) + in1, in2 = factorization(in_dim, rank) + mat1 = torch.empty(out1, in1, device=weight.device, dtype=weight.dtype) + mat2 = torch.empty(out2, in2, device=weight.device, dtype=weight.dtype) + torch.nn.init.kaiming_uniform_(mat2, a=5**0.5) + torch.nn.init.constant_(mat1, 0.0) + return LokrDiff( + (mat1, mat2, alpha, None, None, None, None, None, None) + ) + @classmethod def load( cls, diff --git a/comfy/weight_adapter/oft.py b/comfy/weight_adapter/oft.py index 25009eca3..9d4982083 100644 --- a/comfy/weight_adapter/oft.py +++ b/comfy/weight_adapter/oft.py @@ -3,7 +3,58 @@ from typing import Optional import torch import comfy.model_management -from .base import WeightAdapterBase, weight_decompose +from .base import WeightAdapterBase, WeightAdapterTrainBase, weight_decompose, factorization + + +class OFTDiff(WeightAdapterTrainBase): + def __init__(self, weights): + super().__init__() + # Unpack weights tuple from LoHaAdapter + blocks, rescale, alpha, _ = weights + + # Create trainable parameters + self.oft_blocks = torch.nn.Parameter(blocks) + if rescale is not None: + self.rescale = torch.nn.Parameter(rescale) + self.rescaled = True + else: + self.rescaled = False + self.block_num, self.block_size, _ = blocks.shape + self.constraint = float(alpha) + self.alpha = torch.nn.Parameter(torch.tensor(alpha), requires_grad=False) + + def __call__(self, w): + org_dtype = w.dtype + I = torch.eye(self.block_size, device=self.oft_blocks.device) + + ## generate r + # for Q = -Q^T + q = self.oft_blocks - self.oft_blocks.transpose(1, 2) + normed_q = q + if self.constraint: + q_norm = torch.norm(q) + 1e-8 + if q_norm > self.constraint: + normed_q = q * self.constraint / q_norm + # use float() to prevent unsupported type + r = (I + normed_q) @ (I - normed_q).float().inverse() + + ## Apply chunked matmul on weight + _, *shape = w.shape + org_weight = w.to(dtype=r.dtype) + org_weight = org_weight.unflatten(0, (self.block_num, 
self.block_size)) + # Init R=0, so add I on it to ensure the output of step0 is original model output + weight = torch.einsum( + "k n m, k n ... -> k m ...", + r, + org_weight, + ).flatten(0, 1) + if self.rescaled: + weight = self.rescale * weight + return weight.to(org_dtype) + + def passive_memory_usage(self): + """Calculates memory usage of the trainable parameters.""" + return sum(param.numel() * param.element_size() for param in self.parameters()) class OFTAdapter(WeightAdapterBase): @@ -13,6 +64,18 @@ class OFTAdapter(WeightAdapterBase): self.loaded_keys = loaded_keys self.weights = weights + @classmethod + def create_train(cls, weight, rank=1, alpha=1.0): + out_dim = weight.shape[0] + block_size, block_num = factorization(out_dim, rank) + block = torch.zeros(block_num, block_size, block_size, device=weight.device, dtype=weight.dtype) + return OFTDiff( + (block, None, alpha, None) + ) + + def to_train(self): + return OFTDiff(self.weights) + @classmethod def load( cls, @@ -60,6 +123,8 @@ class OFTAdapter(WeightAdapterBase): blocks = v[0] rescale = v[1] alpha = v[2] + if alpha is None: + alpha = 0 dora_scale = v[3] blocks = comfy.model_management.cast_to_device(blocks, weight.device, intermediate_dtype) diff --git a/comfy_extras/nodes_train.py b/comfy_extras/nodes_train.py index 3d05fdab5..c3aaaee9b 100644 --- a/comfy_extras/nodes_train.py +++ b/comfy_extras/nodes_train.py @@ -20,7 +20,7 @@ import folder_paths import node_helpers from comfy.cli_args import args from comfy.comfy_types.node_typing import IO -from comfy.weight_adapter import adapters +from comfy.weight_adapter import adapters, adapter_maps def make_batch_extra_option_dict(d, indicies, full_size=None): @@ -39,13 +39,13 @@ def make_batch_extra_option_dict(d, indicies, full_size=None): class TrainSampler(comfy.samplers.Sampler): - - def __init__(self, loss_fn, optimizer, loss_callback=None, batch_size=1, total_steps=1, seed=0, training_dtype=torch.bfloat16): + def __init__(self, loss_fn, optimizer, loss_callback=None, batch_size=1, grad_acc=1, total_steps=1, seed=0, training_dtype=torch.bfloat16): self.loss_fn = loss_fn self.optimizer = optimizer self.loss_callback = loss_callback self.batch_size = batch_size self.total_steps = total_steps + self.grad_acc = grad_acc self.seed = seed self.training_dtype = training_dtype @@ -92,8 +92,9 @@ class TrainSampler(comfy.samplers.Sampler): self.loss_callback(loss.item()) pbar.set_postfix({"loss": f"{loss.item():.4f}"}) - self.optimizer.step() - self.optimizer.zero_grad() + if (i+1) % self.grad_acc == 0: + self.optimizer.step() + self.optimizer.zero_grad() torch.cuda.empty_cache() return torch.zeros_like(latent_image) @@ -419,6 +420,16 @@ class TrainLoraNode: "tooltip": "The batch size to use for training.", }, ), + "grad_accumulation_steps": ( + IO.INT, + { + "default": 1, + "min": 1, + "max": 1024, + "step": 1, + "tooltip": "The number of gradient accumulation steps to use for training.", + } + ), "steps": ( IO.INT, { @@ -478,6 +489,17 @@ class TrainLoraNode: ["bf16", "fp32"], {"default": "bf16", "tooltip": "The dtype to use for lora."}, ), + "algorithm": ( + list(adapter_maps.keys()), + {"default": list(adapter_maps.keys())[0], "tooltip": "The algorithm to use for training."}, + ), + "gradient_checkpointing": ( + IO.BOOLEAN, + { + "default": True, + "tooltip": "Use gradient checkpointing for training.", + } + ), "existing_lora": ( folder_paths.get_filename_list("loras") + ["[None]"], { @@ -501,6 +523,7 @@ class TrainLoraNode: positive, batch_size, steps, + grad_accumulation_steps, 
learning_rate, rank, optimizer, @@ -508,6 +531,8 @@ class TrainLoraNode: seed, training_dtype, lora_dtype, + algorithm, + gradient_checkpointing, existing_lora, ): mp = model.clone() @@ -558,10 +583,8 @@ class TrainLoraNode: if existing_adapter is not None: break else: - # If no existing adapter found, use LoRA - # We will add algo option in the future existing_adapter = None - adapter_cls = adapters[0] + adapter_cls = adapter_maps[algorithm] if existing_adapter is not None: train_adapter = existing_adapter.to_train().to(lora_dtype) @@ -615,8 +638,9 @@ class TrainLoraNode: criterion = torch.nn.SmoothL1Loss() # setup models - for m in find_all_highest_child_module_with_forward(mp.model.diffusion_model): - patch(m) + if gradient_checkpointing: + for m in find_all_highest_child_module_with_forward(mp.model.diffusion_model): + patch(m) mp.model.requires_grad_(False) comfy.model_management.load_models_gpu([mp], memory_required=1e20, force_full_load=True) @@ -629,7 +653,8 @@ class TrainLoraNode: optimizer, loss_callback=loss_callback, batch_size=batch_size, - total_steps=steps, + grad_acc=grad_accumulation_steps, + total_steps=steps*grad_accumulation_steps, seed=seed, training_dtype=dtype ) From 0ccc88b03fbe190135e24ac04612565f8f0756b4 Mon Sep 17 00:00:00 2001 From: honglyua Date: Fri, 25 Jul 2025 01:57:36 +0800 Subject: [PATCH 055/243] Support Iluvatar CoreX (#8585) * Support Iluvatar CoreX Co-authored-by: mingjiang.li --- README.md | 7 +++++++ comfy/model_management.py | 23 ++++++++++++++++++++++- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d004364ee..a148623cd 100644 --- a/README.md +++ b/README.md @@ -294,6 +294,13 @@ For models compatible with Cambricon Extension for PyTorch (torch_mlu). Here's a 2. Next, install the PyTorch(torch_mlu) following the instructions on the [Installation](https://www.cambricon.com/docs/sdk_1.15.0/cambricon_pytorch_1.17.0/user_guide_1.9/index.html) 3. Launch ComfyUI by running `python main.py` +#### Iluvatar Corex + +For models compatible with Iluvatar Extension for PyTorch. Here's a step-by-step guide tailored to your platform and installation method: + +1. Install the Iluvatar Corex Toolkit by adhering to the platform-specific instructions on the [Installation](https://support.iluvatar.com/#/DocumentCentre?id=1&nameCenter=2&productId=520117912052801536) +2. 
Launch ComfyUI by running `python main.py` + # Running ```python main.py``` diff --git a/comfy/model_management.py b/comfy/model_management.py index e8b9b5c81..9add54ceb 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -128,6 +128,11 @@ try: except: mlu_available = False +try: + ixuca_available = hasattr(torch, "corex") +except: + ixuca_available = False + if args.cpu: cpu_state = CPUState.CPU @@ -151,6 +156,12 @@ def is_mlu(): return True return False +def is_ixuca(): + global ixuca_available + if ixuca_available: + return True + return False + def get_torch_device(): global directml_enabled global cpu_state @@ -289,7 +300,7 @@ try: if torch_version_numeric[0] >= 2: if ENABLE_PYTORCH_ATTENTION == False and args.use_split_cross_attention == False and args.use_quad_cross_attention == False: ENABLE_PYTORCH_ATTENTION = True - if is_intel_xpu() or is_ascend_npu() or is_mlu(): + if is_intel_xpu() or is_ascend_npu() or is_mlu() or is_ixuca(): if args.use_split_cross_attention == False and args.use_quad_cross_attention == False: ENABLE_PYTORCH_ATTENTION = True except: @@ -1045,6 +1056,8 @@ def xformers_enabled(): return False if is_mlu(): return False + if is_ixuca(): + return False if directml_enabled: return False return XFORMERS_IS_AVAILABLE @@ -1080,6 +1093,8 @@ def pytorch_attention_flash_attention(): return True if is_amd(): return True #if you have pytorch attention enabled on AMD it probably supports at least mem efficient attention + if is_ixuca(): + return True return False def force_upcast_attention_dtype(): @@ -1205,6 +1220,9 @@ def should_use_fp16(device=None, model_params=0, prioritize_performance=True, ma if is_mlu(): return True + if is_ixuca(): + return True + if torch.version.hip: return True @@ -1268,6 +1286,9 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma if is_ascend_npu(): return True + if is_ixuca(): + return True + if is_amd(): arch = torch.cuda.get_device_properties(device).gcnArchName if any((a in arch) for a in ["gfx1030", "gfx1031", "gfx1010", "gfx1011", "gfx1012", "gfx906", "gfx900", "gfx803"]): # RDNA2 and older don't support bf16 From d03ae077b4330f58e7caba53ff94e7fd58d0dc7d Mon Sep 17 00:00:00 2001 From: SHIVANSH GUPTA <121501003+shivansh-gupta4@users.noreply.github.com> Date: Thu, 24 Jul 2025 23:35:54 +0530 Subject: [PATCH 056/243] Added parameter required_frontend_version in the /system_stats API response (#8875) * Added the parameter required_frontend_version in the /system_stats api response * Update server.py * Created a function get_required_frontend_version and wrote tests for it * Refactored the function to return currently installed frontend pacakage version * Moved required_frontend to a new function and imported that in server.py * Corrected test cases using mocking techniques * Corrected files to comply with ruff formatting --- app/frontend_management.py | 47 +++++++++++++++++--- server.py | 2 + tests-unit/app_test/frontend_manager_test.py | 35 ++++++++++++++- 3 files changed, 77 insertions(+), 7 deletions(-) diff --git a/app/frontend_management.py b/app/frontend_management.py index 001ebbecb..0bee73685 100644 --- a/app/frontend_management.py +++ b/app/frontend_management.py @@ -29,18 +29,48 @@ def frontend_install_warning_message(): This error is happening because the ComfyUI frontend is no longer shipped as part of the main repo but as a pip package instead. 
""".strip() +def parse_version(version: str) -> tuple[int, int, int]: + return tuple(map(int, version.split("."))) + +def is_valid_version(version: str) -> bool: + """Validate if a string is a valid semantic version (X.Y.Z format).""" + pattern = r"^(\d+)\.(\d+)\.(\d+)$" + return bool(re.match(pattern, version)) + +def get_installed_frontend_version(): + """Get the currently installed frontend package version.""" + frontend_version_str = version("comfyui-frontend-package") + return frontend_version_str + +def get_required_frontend_version(): + """Get the required frontend version from requirements.txt.""" + try: + with open(requirements_path, "r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if line.startswith("comfyui-frontend-package=="): + version_str = line.split("==")[-1] + if not is_valid_version(version_str): + logging.error(f"Invalid version format in requirements.txt: {version_str}") + return None + return version_str + logging.error("comfyui-frontend-package not found in requirements.txt") + return None + except FileNotFoundError: + logging.error("requirements.txt not found. Cannot determine required frontend version.") + return None + except Exception as e: + logging.error(f"Error reading requirements.txt: {e}") + return None def check_frontend_version(): """Check if the frontend version is up to date.""" - def parse_version(version: str) -> tuple[int, int, int]: - return tuple(map(int, version.split("."))) - try: - frontend_version_str = version("comfyui-frontend-package") + frontend_version_str = get_installed_frontend_version() frontend_version = parse_version(frontend_version_str) - with open(requirements_path, "r", encoding="utf-8") as f: - required_frontend = parse_version(f.readline().split("=")[-1]) + required_frontend_str = get_required_frontend_version() + required_frontend = parse_version(required_frontend_str) if frontend_version < required_frontend: app.logger.log_startup_warning( f""" @@ -168,6 +198,11 @@ def download_release_asset_zip(release: Release, destination_path: str) -> None: class FrontendManager: CUSTOM_FRONTENDS_ROOT = str(Path(__file__).parents[1] / "web_custom_versions") + @classmethod + def get_required_frontend_version(cls) -> str: + """Get the required frontend package version.""" + return get_required_frontend_version() + @classmethod def default_frontend_path(cls) -> str: try: diff --git a/server.py b/server.py index 71a58f0fa..f4de0079b 100644 --- a/server.py +++ b/server.py @@ -553,6 +553,7 @@ class PromptServer(): ram_free = comfy.model_management.get_free_memory(cpu_device) vram_total, torch_vram_total = comfy.model_management.get_total_memory(device, torch_total_too=True) vram_free, torch_vram_free = comfy.model_management.get_free_memory(device, torch_free_too=True) + required_frontend_version = FrontendManager.get_required_frontend_version() system_stats = { "system": { @@ -560,6 +561,7 @@ class PromptServer(): "ram_total": ram_total, "ram_free": ram_free, "comfyui_version": __version__, + "required_frontend_version": required_frontend_version, "python_version": sys.version, "pytorch_version": comfy.model_management.torch_version, "embedded_python": os.path.split(os.path.split(sys.executable)[0])[1] == "python_embeded", diff --git a/tests-unit/app_test/frontend_manager_test.py b/tests-unit/app_test/frontend_manager_test.py index ce67df6c6..ce43ac564 100644 --- a/tests-unit/app_test/frontend_manager_test.py +++ b/tests-unit/app_test/frontend_manager_test.py @@ -1,7 +1,7 @@ import argparse import pytest from 
requests.exceptions import HTTPError -from unittest.mock import patch +from unittest.mock import patch, mock_open from app.frontend_management import ( FrontendManager, @@ -172,3 +172,36 @@ def test_init_frontend_fallback_on_error(): # Assert assert frontend_path == "/default/path" mock_check.assert_called_once() + + +def test_get_frontend_version(): + # Arrange + expected_version = "1.25.0" + mock_requirements_content = """torch +torchsde +comfyui-frontend-package==1.25.0 +other-package==1.0.0 +numpy""" + + # Act + with patch("builtins.open", mock_open(read_data=mock_requirements_content)): + version = FrontendManager.get_required_frontend_version() + + # Assert + assert version == expected_version + + +def test_get_frontend_version_invalid_semver(): + # Arrange + mock_requirements_content = """torch +torchsde +comfyui-frontend-package==1.29.3.75 +other-package==1.0.0 +numpy""" + + # Act + with patch("builtins.open", mock_open(read_data=mock_requirements_content)): + version = FrontendManager.get_required_frontend_version() + + # Assert + assert version is None From 69cb57b3426b08a82e7fb713b0b48c23725f3da7 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 24 Jul 2025 12:06:25 -0700 Subject: [PATCH 057/243] Print xpu device name. (#9035) --- comfy/model_management.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/comfy/model_management.py b/comfy/model_management.py index 9add54ceb..232d363aa 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -392,6 +392,8 @@ def get_torch_device_name(device): except: allocator_backend = "" return "{} {} : {}".format(device, torch.cuda.get_device_name(device), allocator_backend) + elif device.type == "xpu": + return "{} {}".format(device, torch.xpu.get_device_name(device)) else: return "{}".format(device.type) elif is_intel_xpu(): From 4293e4da214f77a3fde97c15f0691307e61bc18d Mon Sep 17 00:00:00 2001 From: Eugene Fairley Date: Thu, 24 Jul 2025 17:59:19 -0700 Subject: [PATCH 058/243] Add WAN ATI support (#8874) * Add WAN ATI support * Fixes * Fix length * Remove extra functions * Fix * Fix * Ruff fix * Remove torch.no_grad * Add batch trajectory logic * Scale inputs before and after motion patch * Batch image/trajectory * Ruff fix * Clean up --- comfy/utils.py | 20 +++ comfy_extras/nodes_wan.py | 305 +++++++++++++++++++++++++++++++++++++- 2 files changed, 324 insertions(+), 1 deletion(-) diff --git a/comfy/utils.py b/comfy/utils.py index 9c076a0e0..fab28cf08 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -698,6 +698,26 @@ def resize_to_batch_size(tensor, batch_size): return output +def resize_list_to_batch_size(l, batch_size): + in_batch_size = len(l) + if in_batch_size == batch_size or in_batch_size == 0: + return l + + if batch_size <= 1: + return l[:batch_size] + + output = [] + if batch_size < in_batch_size: + scale = (in_batch_size - 1) / (batch_size - 1) + for i in range(batch_size): + output.append(l[min(round(i * scale), in_batch_size - 1)]) + else: + scale = in_batch_size / batch_size + for i in range(batch_size): + output.append(l[min(math.floor((i + 0.5) * scale), in_batch_size - 1)]) + + return output + def convert_sd_to(state_dict, dtype): keys = list(state_dict.keys()) for k in keys: diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index d6097a104..d71908f31 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -1,3 +1,4 @@ +import math import nodes import node_helpers import torch @@ -5,7 +6,9 @@ import 
comfy.model_management import comfy.utils import comfy.latent_formats import comfy.clip_vision - +import json +import numpy as np +from typing import Tuple class WanImageToVideo: @classmethod @@ -383,7 +386,307 @@ class WanPhantomSubjectToVideo: out_latent["samples"] = latent return (positive, cond2, negative, out_latent) +def parse_json_tracks(tracks): + """Parse JSON track data into a standardized format""" + tracks_data = [] + try: + # If tracks is a string, try to parse it as JSON + if isinstance(tracks, str): + parsed = json.loads(tracks.replace("'", '"')) + tracks_data.extend(parsed) + else: + # If tracks is a list of strings, parse each one + for track_str in tracks: + parsed = json.loads(track_str.replace("'", '"')) + tracks_data.append(parsed) + + # Check if we have a single track (dict with x,y) or a list of tracks + if tracks_data and isinstance(tracks_data[0], dict) and 'x' in tracks_data[0]: + # Single track detected, wrap it in a list + tracks_data = [tracks_data] + elif tracks_data and isinstance(tracks_data[0], list) and tracks_data[0] and isinstance(tracks_data[0][0], dict) and 'x' in tracks_data[0][0]: + # Already a list of tracks, nothing to do + pass + else: + # Unexpected format + pass + + except json.JSONDecodeError: + tracks_data = [] + return tracks_data + +def process_tracks(tracks_np: np.ndarray, frame_size: Tuple[int, int], num_frames, quant_multi: int = 8, **kwargs): + # tracks: shape [t, h, w, 3] => samples align with 24 fps, model trained with 16 fps. + # frame_size: tuple (W, H) + tracks = torch.from_numpy(tracks_np).float() + + if tracks.shape[1] == 121: + tracks = torch.permute(tracks, (1, 0, 2, 3)) + + tracks, visibles = tracks[..., :2], tracks[..., 2:3] + + short_edge = min(*frame_size) + + frame_center = torch.tensor([*frame_size]).type_as(tracks) / 2 + tracks = tracks - frame_center + + tracks = tracks / short_edge * 2 + + visibles = visibles * 2 - 1 + + trange = torch.linspace(-1, 1, tracks.shape[0]).view(-1, 1, 1, 1).expand(*visibles.shape) + + out_ = torch.cat([trange, tracks, visibles], dim=-1).view(121, -1, 4) + + out_0 = out_[:1] + + out_l = out_[1:] # 121 => 120 | 1 + a = 120 // math.gcd(120, num_frames) + b = num_frames // math.gcd(120, num_frames) + out_l = torch.repeat_interleave(out_l, b, dim=0)[1::a] # 120 => 120 * b => 120 * b / a == F + + final_result = torch.cat([out_0, out_l], dim=0) + + return final_result + +FIXED_LENGTH = 121 +def pad_pts(tr): + """Convert list of {x,y} to (FIXED_LENGTH,1,3) array, padding/truncating.""" + pts = np.array([[p['x'], p['y'], 1] for p in tr], dtype=np.float32) + n = pts.shape[0] + if n < FIXED_LENGTH: + pad = np.zeros((FIXED_LENGTH - n, 3), dtype=np.float32) + pts = np.vstack((pts, pad)) + else: + pts = pts[:FIXED_LENGTH] + return pts.reshape(FIXED_LENGTH, 1, 3) + +def ind_sel(target: torch.Tensor, ind: torch.Tensor, dim: int = 1): + """Index selection utility function""" + assert ( + len(ind.shape) > dim + ), "Index must have the target dim, but get dim: %d, ind shape: %s" % (dim, str(ind.shape)) + + target = target.expand( + *tuple( + [ind.shape[k] if target.shape[k] == 1 else -1 for k in range(dim)] + + [ + -1, + ] + * (len(target.shape) - dim) + ) + ) + + ind_pad = ind + + if len(target.shape) > dim + 1: + for _ in range(len(target.shape) - (dim + 1)): + ind_pad = ind_pad.unsqueeze(-1) + ind_pad = ind_pad.expand(*(-1,) * (dim + 1), *target.shape[(dim + 1) : :]) + + return torch.gather(target, dim=dim, index=ind_pad) + +def merge_final(vert_attr: torch.Tensor, weight: torch.Tensor, vert_assign: 
torch.Tensor): + """Merge vertex attributes with weights""" + target_dim = len(vert_assign.shape) - 1 + if len(vert_attr.shape) == 2: + assert vert_attr.shape[0] > vert_assign.max() + new_shape = [1] * target_dim + list(vert_attr.shape) + tensor = vert_attr.reshape(new_shape) + sel_attr = ind_sel(tensor, vert_assign.type(torch.long), dim=target_dim) + else: + assert vert_attr.shape[1] > vert_assign.max() + new_shape = [vert_attr.shape[0]] + [1] * (target_dim - 1) + list(vert_attr.shape[1:]) + tensor = vert_attr.reshape(new_shape) + sel_attr = ind_sel(tensor, vert_assign.type(torch.long), dim=target_dim) + + final_attr = torch.sum(sel_attr * weight.unsqueeze(-1), dim=-2) + return final_attr + + +def _patch_motion_single( + tracks: torch.FloatTensor, # (B, T, N, 4) + vid: torch.FloatTensor, # (C, T, H, W) + temperature: float, + vae_divide: tuple, + topk: int, +): + """Apply motion patching based on tracks""" + _, T, H, W = vid.shape + N = tracks.shape[2] + _, tracks_xy, visible = torch.split( + tracks, [1, 2, 1], dim=-1 + ) # (B, T, N, 2) | (B, T, N, 1) + tracks_n = tracks_xy / torch.tensor([W / min(H, W), H / min(H, W)], device=tracks_xy.device) + tracks_n = tracks_n.clamp(-1, 1) + visible = visible.clamp(0, 1) + + xx = torch.linspace(-W / min(H, W), W / min(H, W), W) + yy = torch.linspace(-H / min(H, W), H / min(H, W), H) + + grid = torch.stack(torch.meshgrid(yy, xx, indexing="ij")[::-1], dim=-1).to( + tracks_xy.device + ) + + tracks_pad = tracks_xy[:, 1:] + visible_pad = visible[:, 1:] + + visible_align = visible_pad.view(T - 1, 4, *visible_pad.shape[2:]).sum(1) + tracks_align = (tracks_pad * visible_pad).view(T - 1, 4, *tracks_pad.shape[2:]).sum( + 1 + ) / (visible_align + 1e-5) + dist_ = ( + (tracks_align[:, None, None] - grid[None, :, :, None]).pow(2).sum(-1) + ) # T, H, W, N + weight = torch.exp(-dist_ * temperature) * visible_align.clamp(0, 1).view( + T - 1, 1, 1, N + ) + vert_weight, vert_index = torch.topk( + weight, k=min(topk, weight.shape[-1]), dim=-1 + ) + + grid_mode = "bilinear" + point_feature = torch.nn.functional.grid_sample( + vid.permute(1, 0, 2, 3)[:1], + tracks_n[:, :1].type(vid.dtype), + mode=grid_mode, + padding_mode="zeros", + align_corners=False, + ) + point_feature = point_feature.squeeze(0).squeeze(1).permute(1, 0) # N, C=16 + + out_feature = merge_final(point_feature, vert_weight, vert_index).permute(3, 0, 1, 2) # T - 1, H, W, C => C, T - 1, H, W + out_weight = vert_weight.sum(-1) # T - 1, H, W + + # out feature -> already soft weighted + mix_feature = out_feature + vid[:, 1:] * (1 - out_weight.clamp(0, 1)) + + out_feature_full = torch.cat([vid[:, :1], mix_feature], dim=1) # C, T, H, W + out_mask_full = torch.cat([torch.ones_like(out_weight[:1]), out_weight], dim=0) # T, H, W + + return out_mask_full[None].expand(vae_divide[0], -1, -1, -1), out_feature_full + + +def patch_motion( + tracks: torch.FloatTensor, # (B, TB, T, N, 4) + vid: torch.FloatTensor, # (C, T, H, W) + temperature: float = 220.0, + vae_divide: tuple = (4, 16), + topk: int = 2, +): + B = len(tracks) + + # Process each batch separately + out_masks = [] + out_features = [] + + for b in range(B): + mask, feature = _patch_motion_single( + tracks[b], # (T, N, 4) + vid[b], # (C, T, H, W) + temperature, + vae_divide, + topk + ) + out_masks.append(mask) + out_features.append(feature) + + # Stack results: (B, C, T, H, W) + out_mask_full = torch.stack(out_masks, dim=0) + out_feature_full = torch.stack(out_features, dim=0) + + return out_mask_full, out_feature_full + +class WanTrackToVideo: + @classmethod 
+ def INPUT_TYPES(s): + return {"required": { + "positive": ("CONDITIONING", ), + "negative": ("CONDITIONING", ), + "vae": ("VAE", ), + "tracks": ("STRING", {"multiline": True, "default": "[]"}), + "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), + "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), + "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), + "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), + "temperature": ("FLOAT", {"default": 220.0, "min": 1.0, "max": 1000.0, "step": 0.1}), + "topk": ("INT", {"default": 2, "min": 1, "max": 10}), + "start_image": ("IMAGE", ), + }, + "optional": { + "clip_vision_output": ("CLIP_VISION_OUTPUT", ), + }} + + RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") + RETURN_NAMES = ("positive", "negative", "latent") + FUNCTION = "encode" + + CATEGORY = "conditioning/video_models" + + def encode(self, positive, negative, vae, tracks, width, height, length, batch_size, + temperature, topk, start_image=None, clip_vision_output=None): + + tracks_data = parse_json_tracks(tracks) + + if not tracks_data: + return WanImageToVideo().encode(positive, negative, vae, width, height, length, batch_size, start_image=start_image, clip_vision_output=clip_vision_output) + + latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], + device=comfy.model_management.intermediate_device()) + + if isinstance(tracks_data[0][0], dict): + tracks_data = [tracks_data] + + processed_tracks = [] + for batch in tracks_data: + arrs = [] + for track in batch: + pts = pad_pts(track) + arrs.append(pts) + + tracks_np = np.stack(arrs, axis=0) + processed_tracks.append(process_tracks(tracks_np, (width, height), length - 1).unsqueeze(0)) + + if start_image is not None: + start_image = comfy.utils.common_upscale(start_image[:batch_size].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + videos = torch.ones((start_image.shape[0], length, height, width, start_image.shape[-1]), device=start_image.device, dtype=start_image.dtype) * 0.5 + for i in range(start_image.shape[0]): + videos[i, 0] = start_image[i] + + latent_videos = [] + videos = comfy.utils.resize_to_batch_size(videos, batch_size) + for i in range(batch_size): + latent_videos += [vae.encode(videos[i, :, :, :, :3])] + y = torch.cat(latent_videos, dim=0) + + # Scale latent since patch_motion is non-linear + y = comfy.latent_formats.Wan21().process_in(y) + + processed_tracks = comfy.utils.resize_list_to_batch_size(processed_tracks, batch_size) + res = patch_motion( + processed_tracks, y, temperature=temperature, topk=topk, vae_divide=(4, 16) + ) + + mask, concat_latent_image = res + concat_latent_image = comfy.latent_formats.Wan21().process_out(concat_latent_image) + mask = -mask + 1.0 # Invert mask to match expected format + positive = node_helpers.conditioning_set_values(positive, + {"concat_mask": mask, + "concat_latent_image": concat_latent_image}) + negative = node_helpers.conditioning_set_values(negative, + {"concat_mask": mask, + "concat_latent_image": concat_latent_image}) + + if clip_vision_output is not None: + positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output}) + negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output}) + + out_latent = {} + out_latent["samples"] = latent + return (positive, negative, out_latent) + NODE_CLASS_MAPPINGS = { + "WanTrackToVideo": 
WanTrackToVideo, "WanImageToVideo": WanImageToVideo, "WanFunControlToVideo": WanFunControlToVideo, "WanFunInpaintToVideo": WanFunInpaintToVideo, From e6e5d33b351fc5ed8334d74dac77b283ecea8708 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 25 Jul 2025 01:58:28 -0700 Subject: [PATCH 059/243] Remove useless code. (#9041) This is only needed on old pytorch 2.0 and older. --- comfy/ldm/wan/vae.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/comfy/ldm/wan/vae.py b/comfy/ldm/wan/vae.py index a8ebc5ec6..a83c6edfd 100644 --- a/comfy/ldm/wan/vae.py +++ b/comfy/ldm/wan/vae.py @@ -52,15 +52,6 @@ class RMS_norm(nn.Module): x, dim=(1 if self.channel_first else -1)) * self.scale * self.gamma.to(x) + (self.bias.to(x) if self.bias is not None else 0) -class Upsample(nn.Upsample): - - def forward(self, x): - """ - Fix bfloat16 support for nearest neighbor interpolation. - """ - return super().forward(x.float()).type_as(x) - - class Resample(nn.Module): def __init__(self, dim, mode): @@ -73,11 +64,11 @@ class Resample(nn.Module): # layers if mode == 'upsample2d': self.resample = nn.Sequential( - Upsample(scale_factor=(2., 2.), mode='nearest-exact'), + nn.Upsample(scale_factor=(2., 2.), mode='nearest-exact'), ops.Conv2d(dim, dim // 2, 3, padding=1)) elif mode == 'upsample3d': self.resample = nn.Sequential( - Upsample(scale_factor=(2., 2.), mode='nearest-exact'), + nn.Upsample(scale_factor=(2., 2.), mode='nearest-exact'), ops.Conv2d(dim, dim // 2, 3, padding=1)) self.time_conv = CausalConv3d( dim, dim * 2, (3, 1, 1), padding=(1, 0, 0)) From 93bc2f8e4d5dace2328b861579df24f91684e27e Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Sat, 26 Jul 2025 01:24:23 +0800 Subject: [PATCH 060/243] Update template to 0.1.40 (#9048) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8f6a6d112..33a59b4be 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.39 +comfyui-workflow-templates==0.1.40 comfyui-embedded-docs==0.2.4 torch torchsde From c0207b473fa9ad413fad6d5658449356e39758cc Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 25 Jul 2025 14:25:08 -0700 Subject: [PATCH 061/243] Fix issue with line endings github workflow. 
(#9053) --- .github/workflows/check-line-endings.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/check-line-endings.yml b/.github/workflows/check-line-endings.yml index 03b3e3ced..eeb594d6c 100644 --- a/.github/workflows/check-line-endings.yml +++ b/.github/workflows/check-line-endings.yml @@ -17,8 +17,7 @@ jobs: - name: Check for Windows line endings (CRLF) run: | # Get the list of changed files in the PR - git merge origin/${{ github.base_ref }} --no-edit - CHANGED_FILES=$(git diff --name-only origin/${{ github.base_ref }}..HEAD) + CHANGED_FILES=$(git diff --name-only ${{ github.event.pull_request.base.sha }}..${{ github.event.pull_request.head.sha }}) # Flag to track if CRLF is found CRLF_FOUND=false From c60467a148c2b6f7b9fe47725362361f2de9ae50 Mon Sep 17 00:00:00 2001 From: Thor-ATX Date: Sat, 26 Jul 2025 09:27:03 +1200 Subject: [PATCH 062/243] Update negative prompt for Moonvalley nodes (#9038) Co-authored-by: thorsten --- comfy_api_nodes/nodes_moonvalley.py | 359 +++++++++++++++++----------- 1 file changed, 224 insertions(+), 135 deletions(-) diff --git a/comfy_api_nodes/nodes_moonvalley.py b/comfy_api_nodes/nodes_moonvalley.py index 6e937411c..057021efa 100644 --- a/comfy_api_nodes/nodes_moonvalley.py +++ b/comfy_api_nodes/nodes_moonvalley.py @@ -2,7 +2,11 @@ import logging from typing import Any, Callable, Optional, TypeVar import random import torch -from comfy_api_nodes.util.validation_utils import get_image_dimensions, validate_image_dimensions, validate_video_dimensions +from comfy_api_nodes.util.validation_utils import ( + get_image_dimensions, + validate_image_dimensions, + validate_video_dimensions, +) from comfy_api_nodes.apis import ( @@ -10,7 +14,7 @@ from comfy_api_nodes.apis import ( MoonvalleyTextToVideoInferenceParams, MoonvalleyVideoToVideoInferenceParams, MoonvalleyVideoToVideoRequest, - MoonvalleyPromptResponse + MoonvalleyPromptResponse, ) from comfy_api_nodes.apis.client import ( ApiEndpoint, @@ -54,20 +58,26 @@ MAX_VIDEO_SIZE = 1024 * 1024 * 1024 # 1 GB max for in-memory video processing MOONVALLEY_MAREY_MAX_PROMPT_LENGTH = 5000 R = TypeVar("R") + + class MoonvalleyApiError(Exception): """Base exception for Moonvalley API errors.""" + pass + def is_valid_task_creation_response(response: MoonvalleyPromptResponse) -> bool: """Verifies that the initial response contains a task ID.""" return bool(response.id) + def validate_task_creation_response(response) -> None: if not is_valid_task_creation_response(response): error_msg = f"Moonvalley Marey API: Initial request failed. 
Code: {response.code}, Message: {response.message}, Data: {response}" logging.error(error_msg) raise MoonvalleyApiError(error_msg) + def get_video_from_response(response): video = response.output_url logging.info( @@ -102,16 +112,17 @@ def poll_until_finished( poll_interval=16.0, failed_statuses=["error"], status_extractor=lambda response: ( - response.status - if response and response.status - else None + response.status if response and response.status else None ), auth_kwargs=auth_kwargs, result_url_extractor=result_url_extractor, node_id=node_id, ).execute() -def validate_prompts(prompt:str, negative_prompt: str, max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH): + +def validate_prompts( + prompt: str, negative_prompt: str, max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH +): """Verifies that the prompt isn't empty and that neither prompt is too long.""" if not prompt: raise ValueError("Positive prompt is empty") @@ -123,16 +134,15 @@ def validate_prompts(prompt:str, negative_prompt: str, max_length=MOONVALLEY_MAR ) return True + def validate_input_media(width, height, with_frame_conditioning, num_frames_in=None): - # inference validation - # T = num_frames - # in all cases, the following must be true: T divisible by 16 and H,W by 8. in addition... - # with image conditioning: H*W must be divisible by 8192 - # without image conditioning: T divisible by 32 - if num_frames_in and not num_frames_in % 16 == 0 : - return False, ( - "The input video total frame count must be divisible by 16!" - ) + # inference validation + # T = num_frames + # in all cases, the following must be true: T divisible by 16 and H,W by 8. in addition... + # with image conditioning: H*W must be divisible by 8192 + # without image conditioning: T divisible by 32 + if num_frames_in and not num_frames_in % 16 == 0: + return False, ("The input video total frame count must be divisible by 16!") if height % 8 != 0 or width % 8 != 0: return False, ( @@ -146,13 +156,13 @@ def validate_input_media(width, height, with_frame_conditioning, num_frames_in=N "divisible by 8192 for frame conditioning" ) else: - if num_frames_in and not num_frames_in % 32 == 0 : - return False, ( - "The input video total frame count must be divisible by 32!" 
- ) + if num_frames_in and not num_frames_in % 32 == 0: + return False, ("The input video total frame count must be divisible by 32!") -def validate_input_image(image: torch.Tensor, with_frame_conditioning: bool=False) -> None: +def validate_input_image( + image: torch.Tensor, with_frame_conditioning: bool = False +) -> None: """ Validates the input image adheres to the expectations of the API: - The image resolution should not be less than 300*300px @@ -160,10 +170,15 @@ def validate_input_image(image: torch.Tensor, with_frame_conditioning: bool=Fals """ height, width = get_image_dimensions(image) - validate_input_media(width, height, with_frame_conditioning ) - validate_image_dimensions(image, min_width=300, min_height=300, max_height=MAX_HEIGHT, max_width=MAX_WIDTH) + validate_input_media(width, height, with_frame_conditioning) + validate_image_dimensions( + image, min_width=300, min_height=300, max_height=MAX_HEIGHT, max_width=MAX_WIDTH + ) -def validate_input_video(video: VideoInput, num_frames_out: int, with_frame_conditioning: bool=False): + +def validate_input_video( + video: VideoInput, num_frames_out: int, with_frame_conditioning: bool = False +): try: width, height = video.get_dimensions() except Exception as e: @@ -171,7 +186,13 @@ def validate_input_video(video: VideoInput, num_frames_out: int, with_frame_cond raise ValueError(f"Cannot get video dimensions: {e}") from e validate_input_media(width, height, with_frame_conditioning) - validate_video_dimensions(video, min_width=MIN_VID_WIDTH, min_height=MIN_VID_HEIGHT, max_width=MAX_VID_WIDTH, max_height=MAX_VID_HEIGHT) + validate_video_dimensions( + video, + min_width=MIN_VID_WIDTH, + min_height=MIN_VID_HEIGHT, + max_width=MAX_VID_WIDTH, + max_height=MAX_VID_HEIGHT, + ) trimmed_video = validate_input_video_length(video, num_frames_out) return trimmed_video @@ -180,22 +201,29 @@ def validate_input_video(video: VideoInput, num_frames_out: int, with_frame_cond def validate_input_video_length(video: VideoInput, num_frames: int): if video.get_duration() > 60: - raise MoonvalleyApiError("Input Video lenth should be less than 1min. Please trim.") + raise MoonvalleyApiError( + "Input Video lenth should be less than 1min. Please trim." + ) if num_frames == 128: - if video.get_duration() < 5: - raise MoonvalleyApiError("Input Video length is less than 5s. Please use a video longer than or equal to 5s.") - if video.get_duration() > 5: - # trim video to 5s - video = trim_video(video, 5) + if video.get_duration() < 5: + raise MoonvalleyApiError( + "Input Video length is less than 5s. Please use a video longer than or equal to 5s." + ) + if video.get_duration() > 5: + # trim video to 5s + video = trim_video(video, 5) if num_frames == 256: if video.get_duration() < 10: - raise MoonvalleyApiError("Input Video length is less than 10s. Please use a video longer than or equal to 10s.") + raise MoonvalleyApiError( + "Input Video length is less than 10s. Please use a video longer than or equal to 10s." 
+ ) if video.get_duration() > 10: # trim video to 10s video = trim_video(video, 10) return video + def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: """ Returns a new VideoInput object trimmed from the beginning to the specified duration, @@ -219,8 +247,8 @@ def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: input_source = video.get_stream_source() # Open containers - input_container = av.open(input_source, mode='r') - output_container = av.open(output_buffer, mode='w', format='mp4') + input_container = av.open(input_source, mode="r") + output_container = av.open(output_buffer, mode="w", format="mp4") # Set up output streams for re-encoding video_stream = None @@ -230,22 +258,32 @@ def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: logging.info(f"Found stream: type={stream.type}, class={type(stream)}") if isinstance(stream, av.VideoStream): # Create output video stream with same parameters - video_stream = output_container.add_stream('h264', rate=stream.average_rate) + video_stream = output_container.add_stream( + "h264", rate=stream.average_rate + ) video_stream.width = stream.width video_stream.height = stream.height - video_stream.pix_fmt = 'yuv420p' - logging.info(f"Added video stream: {stream.width}x{stream.height} @ {stream.average_rate}fps") + video_stream.pix_fmt = "yuv420p" + logging.info( + f"Added video stream: {stream.width}x{stream.height} @ {stream.average_rate}fps" + ) elif isinstance(stream, av.AudioStream): # Create output audio stream with same parameters - audio_stream = output_container.add_stream('aac', rate=stream.sample_rate) + audio_stream = output_container.add_stream( + "aac", rate=stream.sample_rate + ) audio_stream.sample_rate = stream.sample_rate audio_stream.layout = stream.layout - logging.info(f"Added audio stream: {stream.sample_rate}Hz, {stream.channels} channels") + logging.info( + f"Added audio stream: {stream.sample_rate}Hz, {stream.channels} channels" + ) # Calculate target frame count that's divisible by 32 fps = input_container.streams.video[0].average_rate estimated_frames = int(duration_sec * fps) - target_frames = (estimated_frames // 32) * 32 # Round down to nearest multiple of 32 + target_frames = ( + estimated_frames // 32 + ) * 32 # Round down to nearest multiple of 32 if target_frames == 0: raise ValueError("Video too short: need at least 32 frames for Moonvalley") @@ -268,7 +306,9 @@ def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: for packet in video_stream.encode(): output_container.mux(packet) - logging.info(f"Encoded {frame_count} video frames (target: {target_frames})") + logging.info( + f"Encoded {frame_count} video frames (target: {target_frames})" + ) # Decode and re-encode audio frames if audio_stream: @@ -292,7 +332,6 @@ def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: output_container.close() input_container.close() - # Return as VideoFromFile using the buffer output_buffer.seek(0) return VideoFromFile(output_buffer) @@ -305,6 +344,7 @@ def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: output_container.close() raise RuntimeError(f"Failed to trim video: {str(e)}") from e + # --- BaseMoonvalleyVideoNode --- class BaseMoonvalleyVideoNode: def parseWidthHeightFromRes(self, resolution: str): @@ -328,7 +368,7 @@ class BaseMoonvalleyVideoNode: "Motion Transfer": "motion_control", "Canny": "canny_control", "Pose Transfer": "pose_control", - "Depth": "depth_control" + "Depth": "depth_control", } if value in control_map: 
return control_map[value] @@ -355,31 +395,63 @@ class BaseMoonvalleyVideoNode: return { "required": { "prompt": model_field_to_node_input( - IO.STRING, MoonvalleyTextToVideoRequest, "prompt_text", - multiline=True + IO.STRING, + MoonvalleyTextToVideoRequest, + "prompt_text", + multiline=True, ), "negative_prompt": model_field_to_node_input( IO.STRING, MoonvalleyTextToVideoInferenceParams, "negative_prompt", multiline=True, - default="gopro, bright, contrast, static, overexposed, bright, vignette, artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, flare, saturation, distorted, warped, wide angle, contrast, saturated, vibrant, glowing, cross dissolve, texture, videogame, saturation, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, blown out, horrible, blurry, worst quality, bad, transition, dissolve, cross-dissolve, melt, fade in, fade out, wobbly, weird, low quality, plastic, stock footage, video camera, boring, static", + default="low-poly, flat shader, bad rigging, stiff animation, uncanny eyes, low-quality textures, looping glitch, cheap effect, overbloom, bloom spam, default lighting, game asset, stiff face, ugly specular, AI artifacts", ), - - "resolution": (IO.COMBO, { - "options": ["16:9 (1920 x 1080)", - "9:16 (1080 x 1920)", - "1:1 (1152 x 1152)", - "4:3 (1440 x 1080)", - "3:4 (1080 x 1440)", - "21:9 (2560 x 1080)"], + "resolution": ( + IO.COMBO, + { + "options": [ + "16:9 (1920 x 1080)", + "9:16 (1080 x 1920)", + "1:1 (1152 x 1152)", + "4:3 (1440 x 1080)", + "3:4 (1080 x 1440)", + "21:9 (2560 x 1080)", + ], "default": "16:9 (1920 x 1080)", "tooltip": "Resolution of the output video", - }), + }, + ), # "length": (IO.COMBO,{"options":['5s','10s'], "default": '5s'}), - "prompt_adherence": model_field_to_node_input(IO.FLOAT,MoonvalleyTextToVideoInferenceParams,"guidance_scale",default=7.0, step=1, min=1, max=20), - "seed": model_field_to_node_input(IO.INT,MoonvalleyTextToVideoInferenceParams, "seed", default=random.randint(0, 2**32 - 1), min=0, max=4294967295, step=1, display="number", tooltip="Random seed value", control_after_generate=True), - "steps": model_field_to_node_input(IO.INT, MoonvalleyTextToVideoInferenceParams, "steps", default=100, min=1, max=100), + "prompt_adherence": model_field_to_node_input( + IO.FLOAT, + MoonvalleyTextToVideoInferenceParams, + "guidance_scale", + default=7.0, + step=1, + min=1, + max=20, + ), + "seed": model_field_to_node_input( + IO.INT, + MoonvalleyTextToVideoInferenceParams, + "seed", + default=random.randint(0, 2**32 - 1), + min=0, + max=4294967295, + step=1, + display="number", + tooltip="Random seed value", + control_after_generate=True, + ), + "steps": model_field_to_node_input( + IO.INT, + MoonvalleyTextToVideoInferenceParams, + "steps", + default=100, + min=1, + max=100, + ), }, "hidden": { "auth_token": "AUTH_TOKEN_COMFY_ORG", @@ -393,7 +465,7 @@ class BaseMoonvalleyVideoNode: "image_url", tooltip="The reference image used to generate the video", ), - } + }, } RETURN_TYPES = ("STRING",) @@ -404,6 +476,7 @@ class BaseMoonvalleyVideoNode: def generate(self, **kwargs): return None + # --- MoonvalleyImg2VideoNode --- class MoonvalleyImg2VideoNode(BaseMoonvalleyVideoNode): @@ -415,43 +488,46 @@ class MoonvalleyImg2VideoNode(BaseMoonvalleyVideoNode): RETURN_NAMES = ("video",) DESCRIPTION = "Moonvalley Marey Image to Video Node" - def generate(self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs): + def generate( + self, prompt, negative_prompt, unique_id: Optional[str] = 
None, **kwargs + ): image = kwargs.get("image", None) - if (image is None): + if image is None: raise MoonvalleyApiError("image is required") total_frames = get_total_frames_from_length() - validate_input_image(image,True) + validate_input_image(image, True) validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) width_height = self.parseWidthHeightFromRes(kwargs.get("resolution")) - inference_params=MoonvalleyTextToVideoInferenceParams( - negative_prompt=negative_prompt, - steps=kwargs.get("steps"), - seed=kwargs.get("seed"), - guidance_scale=kwargs.get("prompt_adherence"), - num_frames=total_frames, - width=width_height.get("width"), - height=width_height.get("height"), - use_negative_prompts=True - ) + inference_params = MoonvalleyTextToVideoInferenceParams( + negative_prompt=negative_prompt, + steps=kwargs.get("steps"), + seed=kwargs.get("seed"), + guidance_scale=kwargs.get("prompt_adherence"), + num_frames=total_frames, + width=width_height.get("width"), + height=width_height.get("height"), + use_negative_prompts=True, + ) """Upload image to comfy backend to have a URL available for further processing""" # Get MIME type from tensor - assuming PNG format for image tensors mime_type = "image/png" - image_url = upload_images_to_comfyapi(image, max_images=1, auth_kwargs=kwargs, mime_type=mime_type)[0] + image_url = upload_images_to_comfyapi( + image, max_images=1, auth_kwargs=kwargs, mime_type=mime_type + )[0] request = MoonvalleyTextToVideoRequest( - image_url=image_url, - prompt_text=prompt, - inference_params=inference_params - ) + image_url=image_url, prompt_text=prompt, inference_params=inference_params + ) initial_operation = SynchronousOperation( - endpoint=ApiEndpoint(path=API_IMG2VIDEO_ENDPOINT, - method=HttpMethod.POST, - request_model=MoonvalleyTextToVideoRequest, - response_model=MoonvalleyPromptResponse - ), + endpoint=ApiEndpoint( + path=API_IMG2VIDEO_ENDPOINT, + method=HttpMethod.POST, + request_model=MoonvalleyTextToVideoRequest, + response_model=MoonvalleyPromptResponse, + ), request=request, auth_kwargs=kwargs, ) @@ -463,7 +539,8 @@ class MoonvalleyImg2VideoNode(BaseMoonvalleyVideoNode): task_id, auth_kwargs=kwargs, node_id=unique_id ) video = download_url_to_video_output(final_response.output_url) - return (video, ) + return (video,) + # --- MoonvalleyVid2VidNode --- class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): @@ -479,38 +556,46 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): if param in input_types["optional"]: del input_types["optional"][param] input_types["optional"] = { - "video": (IO.VIDEO, {"default": "", "multiline": False, "tooltip": "The reference video used to generate the output video. Input a 5s video for 128 frames and a 10s video for 256 frames. Longer videos will be trimmed automatically."}), - "control_type": ( - ["Motion Transfer", "Pose Transfer"], - {"default": "Motion Transfer"}, - ), - "motion_intensity": ( - "INT", - { - "default": 100, - "step": 1, - "min": 0, - "max": 100, - "tooltip": "Only used if control_type is 'Motion Transfer'", - }, - ) - } + "video": ( + IO.VIDEO, + { + "default": "", + "multiline": False, + "tooltip": "The reference video used to generate the output video. Input a 5s video for 128 frames and a 10s video for 256 frames. 
Longer videos will be trimmed automatically.", + }, + ), + "control_type": ( + ["Motion Transfer", "Pose Transfer"], + {"default": "Motion Transfer"}, + ), + "motion_intensity": ( + "INT", + { + "default": 100, + "step": 1, + "min": 0, + "max": 100, + "tooltip": "Only used if control_type is 'Motion Transfer'", + }, + ), + } return input_types RETURN_TYPES = ("VIDEO",) RETURN_NAMES = ("video",) - def generate(self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs): + def generate( + self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs + ): video = kwargs.get("video") num_frames = get_total_frames_from_length() - if not video : + if not video: raise MoonvalleyApiError("video is required") - """Validate video input""" - video_url="" + video_url = "" if video: validated_video = validate_input_video(video, num_frames, False) video_url = upload_video_to_comfyapi(validated_video, auth_kwargs=kwargs) @@ -520,29 +605,30 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): """Validate prompts and inference input""" validate_prompts(prompt, negative_prompt) - inference_params=MoonvalleyVideoToVideoInferenceParams( + inference_params = MoonvalleyVideoToVideoInferenceParams( negative_prompt=negative_prompt, steps=kwargs.get("steps"), seed=kwargs.get("seed"), guidance_scale=kwargs.get("prompt_adherence"), - control_params={'motion_intensity': motion_intensity} + control_params={"motion_intensity": motion_intensity}, ) control = self.parseControlParameter(control_type) request = MoonvalleyVideoToVideoRequest( - control_type=control, - video_url=video_url, - prompt_text=prompt, - inference_params=inference_params - ) + control_type=control, + video_url=video_url, + prompt_text=prompt, + inference_params=inference_params, + ) initial_operation = SynchronousOperation( - endpoint=ApiEndpoint(path=API_VIDEO2VIDEO_ENDPOINT, - method=HttpMethod.POST, - request_model=MoonvalleyVideoToVideoRequest, - response_model=MoonvalleyPromptResponse - ), + endpoint=ApiEndpoint( + path=API_VIDEO2VIDEO_ENDPOINT, + method=HttpMethod.POST, + request_model=MoonvalleyVideoToVideoRequest, + response_model=MoonvalleyPromptResponse, + ), request=request, auth_kwargs=kwargs, ) @@ -556,7 +642,8 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): video = download_url_to_video_output(final_response.output_url) - return (video, ) + return (video,) + # --- MoonvalleyTxt2VideoNode --- class MoonvalleyTxt2VideoNode(BaseMoonvalleyVideoNode): @@ -575,31 +662,33 @@ class MoonvalleyTxt2VideoNode(BaseMoonvalleyVideoNode): del input_types["optional"][param] return input_types - def generate(self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs): + def generate( + self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs + ): validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) width_height = self.parseWidthHeightFromRes(kwargs.get("resolution")) num_frames = get_total_frames_from_length() - inference_params=MoonvalleyTextToVideoInferenceParams( - negative_prompt=negative_prompt, - steps=kwargs.get("steps"), - seed=kwargs.get("seed"), - guidance_scale=kwargs.get("prompt_adherence"), - num_frames=num_frames, - width=width_height.get("width"), - height=width_height.get("height"), - ) + inference_params = MoonvalleyTextToVideoInferenceParams( + negative_prompt=negative_prompt, + steps=kwargs.get("steps"), + seed=kwargs.get("seed"), + guidance_scale=kwargs.get("prompt_adherence"), + num_frames=num_frames, + 
width=width_height.get("width"), + height=width_height.get("height"), + ) request = MoonvalleyTextToVideoRequest( - prompt_text=prompt, - inference_params=inference_params - ) + prompt_text=prompt, inference_params=inference_params + ) initial_operation = SynchronousOperation( - endpoint=ApiEndpoint(path=API_TXT2VIDEO_ENDPOINT, - method=HttpMethod.POST, - request_model=MoonvalleyTextToVideoRequest, - response_model=MoonvalleyPromptResponse - ), + endpoint=ApiEndpoint( + path=API_TXT2VIDEO_ENDPOINT, + method=HttpMethod.POST, + request_model=MoonvalleyTextToVideoRequest, + response_model=MoonvalleyPromptResponse, + ), request=request, auth_kwargs=kwargs, ) @@ -612,8 +701,7 @@ class MoonvalleyTxt2VideoNode(BaseMoonvalleyVideoNode): ) video = download_url_to_video_output(final_response.output_url) - return (video, ) - + return (video,) NODE_CLASS_MAPPINGS = { @@ -629,6 +717,7 @@ NODE_DISPLAY_NAME_MAPPINGS = { # "MoonvalleyVideo2VideoNode": "Moonvalley Marey Video to Video", } + def get_total_frames_from_length(length="5s"): # if length == '5s': # return 128 From b850d9a8bb2c99989fe79d1fded26ab5c103c7b2 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 25 Jul 2025 18:25:45 -0700 Subject: [PATCH 063/243] Add map_function to get_history. (#9056) --- execution.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/execution.py b/execution.py index c3a62f1cb..8a9663a7d 100644 --- a/execution.py +++ b/execution.py @@ -1097,7 +1097,7 @@ class PromptQueue: return True return False - def get_history(self, prompt_id=None, max_items=None, offset=-1): + def get_history(self, prompt_id=None, max_items=None, offset=-1, map_function=None): with self.mutex: if prompt_id is None: out = {} @@ -1106,13 +1106,21 @@ class PromptQueue: offset = len(self.history) - max_items for k in self.history: if i >= offset: - out[k] = self.history[k] + p = self.history[k] + if map_function is not None: + p = map_function(p) + out[k] = p if max_items is not None and len(out) >= max_items: break i += 1 return out elif prompt_id in self.history: - return {prompt_id: copy.deepcopy(self.history[prompt_id])} + p = self.history[prompt_id] + if map_function is None: + p = copy.deepcopy(p) + else: + p = map_function(p) + return {prompt_id: p} else: return {} From 0621d73a9c56fdc9e79aad87ed260135639bca50 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 26 Jul 2025 01:44:19 -0700 Subject: [PATCH 064/243] Remove useless code. (#9059) --- comfy/ldm/wan/vae.py | 41 ----------------------------------------- 1 file changed, 41 deletions(-) diff --git a/comfy/ldm/wan/vae.py b/comfy/ldm/wan/vae.py index a83c6edfd..6b07840fc 100644 --- a/comfy/ldm/wan/vae.py +++ b/comfy/ldm/wan/vae.py @@ -148,29 +148,6 @@ class Resample(nn.Module): feat_idx[0] += 1 return x - def init_weight(self, conv): - conv_weight = conv.weight - nn.init.zeros_(conv_weight) - c1, c2, t, h, w = conv_weight.size() - one_matrix = torch.eye(c1, c2) - init_matrix = one_matrix - nn.init.zeros_(conv_weight) - #conv_weight.data[:,:,-1,1,1] = init_matrix * 0.5 - conv_weight.data[:, :, 1, 0, 0] = init_matrix #* 0.5 - conv.weight.data.copy_(conv_weight) - nn.init.zeros_(conv.bias.data) - - def init_weight2(self, conv): - conv_weight = conv.weight.data - nn.init.zeros_(conv_weight) - c1, c2, t, h, w = conv_weight.size() - init_matrix = torch.eye(c1 // 2, c2) - #init_matrix = repeat(init_matrix, 'o ... 
-> (o 2) ...').permute(1,0,2).contiguous().reshape(c1,c2) - conv_weight[:c1 // 2, :, -1, 0, 0] = init_matrix - conv_weight[c1 // 2:, :, -1, 0, 0] = init_matrix - conv.weight.data.copy_(conv_weight) - nn.init.zeros_(conv.bias.data) - class ResidualBlock(nn.Module): @@ -485,12 +462,6 @@ class WanVAE(nn.Module): self.decoder = Decoder3d(dim, z_dim, dim_mult, num_res_blocks, attn_scales, self.temperal_upsample, dropout) - def forward(self, x): - mu, log_var = self.encode(x) - z = self.reparameterize(mu, log_var) - x_recon = self.decode(z) - return x_recon, mu, log_var - def encode(self, x): self.clear_cache() ## cache @@ -536,18 +507,6 @@ class WanVAE(nn.Module): self.clear_cache() return out - def reparameterize(self, mu, log_var): - std = torch.exp(0.5 * log_var) - eps = torch.randn_like(std) - return eps * std + mu - - def sample(self, imgs, deterministic=False): - mu, log_var = self.encode(imgs) - if deterministic: - return mu - std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0)) - return mu + std * torch.randn_like(std) - def clear_cache(self): self._conv_num = count_conv3d(self.decoder) self._conv_idx = [0] From 1ef70fcde4b84e8cd743c4f1fd9cdce24bbadbad Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Sun, 27 Jul 2025 05:25:33 +0800 Subject: [PATCH 065/243] Fix the broken link (#9060) --- comfy_api_nodes/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_api_nodes/README.md b/comfy_api_nodes/README.md index 64a389cc1..f56d6c860 100644 --- a/comfy_api_nodes/README.md +++ b/comfy_api_nodes/README.md @@ -2,7 +2,7 @@ ## Introduction -Below are a collection of nodes that work by calling external APIs. More information available in our [docs](https://docs.comfy.org/tutorials/api-nodes/overview#api-nodes). +Below are a collection of nodes that work by calling external APIs. More information available in our [docs](https://docs.comfy.org/tutorials/api-nodes/overview). ## Development From 78672d0ee6d20d8269f324474643e5cc00f1c348 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 27 Jul 2025 04:42:58 -0700 Subject: [PATCH 066/243] Small readme update. (#9071) --- README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index a148623cd..8a15136aa 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith ## Features - Nodes/graph/flowchart interface to experiment and create complex Stable Diffusion workflows without needing to code anything. - Image Models - - SD1.x, SD2.x, + - SD1.x, SD2.x ([unCLIP](https://comfyanonymous.github.io/ComfyUI_examples/unclip/)) - [SDXL](https://comfyanonymous.github.io/ComfyUI_examples/sdxl/), [SDXL Turbo](https://comfyanonymous.github.io/ComfyUI_examples/sdturbo/) - [Stable Cascade](https://comfyanonymous.github.io/ComfyUI_examples/stable_cascade/) - [SD3 and SD3.5](https://comfyanonymous.github.io/ComfyUI_examples/sd3/) @@ -84,9 +84,9 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith - [Hunyuan3D 2.0](https://docs.comfy.org/tutorials/3d/hunyuan3D-2) - Asynchronous Queue system - Many optimizations: Only re-executes the parts of the workflow that changes between executions. -- Smart memory management: can automatically run models on GPUs with as low as 1GB vram. +- Smart memory management: can automatically run large models on GPUs with as low as 1GB vram with smart offloading. 
- Works even if you don't have a GPU with: ```--cpu``` (slow) -- Can load ckpt, safetensors and diffusers models/checkpoints. Standalone VAEs and CLIP models. +- Can load ckpt and safetensors: All in one checkpoints or standalone diffusion models, VAEs and CLIP models. - Safe loading of ckpt, pt, pth, etc.. files. - Embeddings/Textual inversion - [Loras (regular, locon and loha)](https://comfyanonymous.github.io/ComfyUI_examples/lora/) @@ -98,7 +98,6 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith - [Inpainting](https://comfyanonymous.github.io/ComfyUI_examples/inpaint/) with both regular and inpainting models. - [ControlNet and T2I-Adapter](https://comfyanonymous.github.io/ComfyUI_examples/controlnet/) - [Upscale Models (ESRGAN, ESRGAN variants, SwinIR, Swin2SR, etc...)](https://comfyanonymous.github.io/ComfyUI_examples/upscale_models/) -- [unCLIP Models](https://comfyanonymous.github.io/ComfyUI_examples/unclip/) - [GLIGEN](https://comfyanonymous.github.io/ComfyUI_examples/gligen/) - [Model Merging](https://comfyanonymous.github.io/ComfyUI_examples/model_merging/) - [LCM models and Loras](https://comfyanonymous.github.io/ComfyUI_examples/lcm/) From e6d9f6274494c5ac96295deb1bea54de50189059 Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Sun, 27 Jul 2025 16:51:36 -0700 Subject: [PATCH 067/243] Add Moonvalley Marey V2V node with updated input validation (#9069) * [moonvalley] Update V2V node to match API specification - Add exact resolution validation for supported resolutions (1920x1080, 1080x1920, 1152x1152, 1536x1152, 1152x1536) - Change frame count validation from divisible by 32 to 16 - Add MP4 container format validation - Remove internal parameters (steps, guidance_scale) from V2V inference params - Update video duration handling to support only 5 seconds (auto-trim if longer) - Add motion_intensity parameter (0-100) for Motion Transfer control type - Add get_container_format() method to VideoInput classes * update negative prompt --- comfy_api/input/video_types.py | 13 ++ comfy_api/input_impl/video_types.py | 12 ++ comfy_api_nodes/nodes_moonvalley.py | 225 +++++++++++++++------------- 3 files changed, 145 insertions(+), 105 deletions(-) diff --git a/comfy_api/input/video_types.py b/comfy_api/input/video_types.py index bb936e0a4..5d95dc507 100644 --- a/comfy_api/input/video_types.py +++ b/comfy_api/input/video_types.py @@ -2,6 +2,7 @@ from __future__ import annotations from abc import ABC, abstractmethod from typing import Optional, Union import io +import av from comfy_api.util import VideoContainer, VideoCodec, VideoComponents class VideoInput(ABC): @@ -70,3 +71,15 @@ class VideoInput(ABC): components = self.get_components() frame_count = components.images.shape[0] return float(frame_count / components.frame_rate) + + def get_container_format(self) -> str: + """ + Returns the container format of the video (e.g., 'mp4', 'mov', 'avi'). 
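+
+        Note: when reading, PyAV may report a composite demuxer name rather
+        than a single token (an MP4 file typically comes back as
+        'mov,mp4,m4a,3gp,3g2,mj2'), so callers should match against the full
+        returned string rather than assuming 'mp4'.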
+ + Returns: + Container format as string + """ + # Default implementation - subclasses should override for better performance + source = self.get_stream_source() + with av.open(source, mode="r") as container: + return container.format.name diff --git a/comfy_api/input_impl/video_types.py b/comfy_api/input_impl/video_types.py index 9ae818f4e..91e7c1bfa 100644 --- a/comfy_api/input_impl/video_types.py +++ b/comfy_api/input_impl/video_types.py @@ -121,6 +121,18 @@ class VideoFromFile(VideoInput): raise ValueError(f"Could not determine duration for file '{self.__file}'") + def get_container_format(self) -> str: + """ + Returns the container format of the video (e.g., 'mp4', 'mov', 'avi'). + + Returns: + Container format as string + """ + if isinstance(self.__file, io.BytesIO): + self.__file.seek(0) + with av.open(self.__file, mode='r') as container: + return container.format.name + def get_components_internal(self, container: InputContainer) -> VideoComponents: # Get video frames frames = [] diff --git a/comfy_api_nodes/nodes_moonvalley.py b/comfy_api_nodes/nodes_moonvalley.py index 057021efa..789fcef02 100644 --- a/comfy_api_nodes/nodes_moonvalley.py +++ b/comfy_api_nodes/nodes_moonvalley.py @@ -5,7 +5,6 @@ import torch from comfy_api_nodes.util.validation_utils import ( get_image_dimensions, validate_image_dimensions, - validate_video_dimensions, ) @@ -176,54 +175,76 @@ def validate_input_image( ) -def validate_input_video( - video: VideoInput, num_frames_out: int, with_frame_conditioning: bool = False -): +def validate_video_to_video_input(video: VideoInput) -> VideoInput: + """ + Validates and processes video input for Moonvalley Video-to-Video generation. + + Args: + video: Input video to validate + + Returns: + Validated and potentially trimmed video + + Raises: + ValueError: If video doesn't meet requirements + MoonvalleyApiError: If video duration is too short + """ + width, height = _get_video_dimensions(video) + _validate_video_dimensions(width, height) + _validate_container_format(video) + + return _validate_and_trim_duration(video) + + +def _get_video_dimensions(video: VideoInput) -> tuple[int, int]: + """Extracts video dimensions with error handling.""" try: - width, height = video.get_dimensions() + return video.get_dimensions() except Exception as e: logging.error("Error getting dimensions of video: %s", e) raise ValueError(f"Cannot get video dimensions: {e}") from e - validate_input_media(width, height, with_frame_conditioning) - validate_video_dimensions( - video, - min_width=MIN_VID_WIDTH, - min_height=MIN_VID_HEIGHT, - max_width=MAX_VID_WIDTH, - max_height=MAX_VID_HEIGHT, - ) - trimmed_video = validate_input_video_length(video, num_frames_out) - return trimmed_video +def _validate_video_dimensions(width: int, height: int) -> None: + """Validates video dimensions meet Moonvalley V2V requirements.""" + supported_resolutions = { + (1920, 1080), (1080, 1920), (1152, 1152), + (1536, 1152), (1152, 1536) + } + + if (width, height) not in supported_resolutions: + supported_list = ', '.join([f'{w}x{h}' for w, h in sorted(supported_resolutions)]) + raise ValueError(f"Resolution {width}x{height} not supported. Supported: {supported_list}") -def validate_input_video_length(video: VideoInput, num_frames: int): +def _validate_container_format(video: VideoInput) -> None: + """Validates video container format is MP4.""" + container_format = video.get_container_format() + if container_format not in ['mp4', 'mov,mp4,m4a,3gp,3g2,mj2']: + raise ValueError(f"Only MP4 container format supported. 
Got: {container_format}") - if video.get_duration() > 60: - raise MoonvalleyApiError( - "Input Video lenth should be less than 1min. Please trim." - ) - if num_frames == 128: - if video.get_duration() < 5: - raise MoonvalleyApiError( - "Input Video length is less than 5s. Please use a video longer than or equal to 5s." - ) - if video.get_duration() > 5: - # trim video to 5s - video = trim_video(video, 5) - if num_frames == 256: - if video.get_duration() < 10: - raise MoonvalleyApiError( - "Input Video length is less than 10s. Please use a video longer than or equal to 10s." - ) - if video.get_duration() > 10: - # trim video to 10s - video = trim_video(video, 10) +def _validate_and_trim_duration(video: VideoInput) -> VideoInput: + """Validates video duration and trims to 5 seconds if needed.""" + duration = video.get_duration() + _validate_minimum_duration(duration) + return _trim_if_too_long(video, duration) + + +def _validate_minimum_duration(duration: float) -> None: + """Ensures video is at least 5 seconds long.""" + if duration < 5: + raise MoonvalleyApiError("Input video must be at least 5 seconds long.") + + +def _trim_if_too_long(video: VideoInput, duration: float) -> VideoInput: + """Trims video to 5 seconds if longer.""" + if duration > 5: + return trim_video(video, 5) return video + def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: """ Returns a new VideoInput object trimmed from the beginning to the specified duration, @@ -278,15 +299,13 @@ def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: f"Added audio stream: {stream.sample_rate}Hz, {stream.channels} channels" ) - # Calculate target frame count that's divisible by 32 + # Calculate target frame count that's divisible by 16 fps = input_container.streams.video[0].average_rate estimated_frames = int(duration_sec * fps) - target_frames = ( - estimated_frames // 32 - ) * 32 # Round down to nearest multiple of 32 + target_frames = (estimated_frames // 16) * 16 # Round down to nearest multiple of 16 if target_frames == 0: - raise ValueError("Video too short: need at least 32 frames for Moonvalley") + raise ValueError("Video too short: need at least 16 frames for Moonvalley") frame_count = 0 audio_frame_count = 0 @@ -353,8 +372,8 @@ class BaseMoonvalleyVideoNode: "16:9 (1920 x 1080)": {"width": 1920, "height": 1080}, "9:16 (1080 x 1920)": {"width": 1080, "height": 1920}, "1:1 (1152 x 1152)": {"width": 1152, "height": 1152}, - "4:3 (1440 x 1080)": {"width": 1440, "height": 1080}, - "3:4 (1080 x 1440)": {"width": 1080, "height": 1440}, + "4:3 (1536 x 1152)": {"width": 1536, "height": 1152}, + "3:4 (1152 x 1536)": {"width": 1152, "height": 1536}, "21:9 (2560 x 1080)": {"width": 2560, "height": 1080}, } if resolution in res_map: @@ -494,7 +513,6 @@ class MoonvalleyImg2VideoNode(BaseMoonvalleyVideoNode): image = kwargs.get("image", None) if image is None: raise MoonvalleyApiError("image is required") - total_frames = get_total_frames_from_length() validate_input_image(image, True) validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) @@ -505,7 +523,7 @@ class MoonvalleyImg2VideoNode(BaseMoonvalleyVideoNode): steps=kwargs.get("steps"), seed=kwargs.get("seed"), guidance_scale=kwargs.get("prompt_adherence"), - num_frames=total_frames, + num_frames=128, width=width_height.get("width"), height=width_height.get("height"), use_negative_prompts=True, @@ -549,39 +567,45 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): @classmethod def INPUT_TYPES(cls): - input_types = 
super().INPUT_TYPES() - for param in ["resolution", "image"]: - if param in input_types["required"]: - del input_types["required"][param] - if param in input_types["optional"]: - del input_types["optional"][param] - input_types["optional"] = { - "video": ( - IO.VIDEO, - { - "default": "", - "multiline": False, - "tooltip": "The reference video used to generate the output video. Input a 5s video for 128 frames and a 10s video for 256 frames. Longer videos will be trimmed automatically.", - }, - ), - "control_type": ( - ["Motion Transfer", "Pose Transfer"], - {"default": "Motion Transfer"}, - ), - "motion_intensity": ( - "INT", - { - "default": 100, - "step": 1, - "min": 0, - "max": 100, - "tooltip": "Only used if control_type is 'Motion Transfer'", - }, - ), + return { + "required": { + "prompt": model_field_to_node_input( + IO.STRING, MoonvalleyVideoToVideoRequest, "prompt_text", + multiline=True + ), + "negative_prompt": model_field_to_node_input( + IO.STRING, + MoonvalleyVideoToVideoInferenceParams, + "negative_prompt", + multiline=True, + default="low-poly, flat shader, bad rigging, stiff animation, uncanny eyes, low-quality textures, looping glitch, cheap effect, overbloom, bloom spam, default lighting, game asset, stiff face, ugly specular, AI artifacts" + ), + "seed": model_field_to_node_input(IO.INT,MoonvalleyVideoToVideoInferenceParams, "seed", default=random.randint(0, 2**32 - 1), min=0, max=4294967295, step=1, display="number", tooltip="Random seed value", control_after_generate=True), + }, + "hidden": { + "auth_token": "AUTH_TOKEN_COMFY_ORG", + "comfy_api_key": "API_KEY_COMFY_ORG", + "unique_id": "UNIQUE_ID", + }, + "optional": { + "video": (IO.VIDEO, {"default": "", "multiline": False, "tooltip": "The reference video used to generate the output video. Must be at least 5 seconds long. Videos longer than 5s will be automatically trimmed. 
Only MP4 format supported."}), + "control_type": ( + ["Motion Transfer", "Pose Transfer"], + {"default": "Motion Transfer"}, + ), + "motion_intensity": ( + "INT", + { + "default": 100, + "step": 1, + "min": 0, + "max": 100, + "tooltip": "Only used if control_type is 'Motion Transfer'", + }, + ) + } } - return input_types - RETURN_TYPES = ("VIDEO",) RETURN_NAMES = ("video",) @@ -589,15 +613,13 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs ): video = kwargs.get("video") - num_frames = get_total_frames_from_length() if not video: raise MoonvalleyApiError("video is required") - """Validate video input""" video_url = "" if video: - validated_video = validate_input_video(video, num_frames, False) + validated_video = validate_video_to_video_input(video) video_url = upload_video_to_comfyapi(validated_video, auth_kwargs=kwargs) control_type = kwargs.get("control_type") @@ -605,12 +627,16 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): """Validate prompts and inference input""" validate_prompts(prompt, negative_prompt) - inference_params = MoonvalleyVideoToVideoInferenceParams( + + # Only include motion_intensity for Motion Transfer + control_params = {} + if control_type == "Motion Transfer" and motion_intensity is not None: + control_params['motion_intensity'] = motion_intensity + + inference_params=MoonvalleyVideoToVideoInferenceParams( negative_prompt=negative_prompt, - steps=kwargs.get("steps"), seed=kwargs.get("seed"), - guidance_scale=kwargs.get("prompt_adherence"), - control_params={"motion_intensity": motion_intensity}, + control_params=control_params ) control = self.parseControlParameter(control_type) @@ -667,17 +693,16 @@ class MoonvalleyTxt2VideoNode(BaseMoonvalleyVideoNode): ): validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) width_height = self.parseWidthHeightFromRes(kwargs.get("resolution")) - num_frames = get_total_frames_from_length() - inference_params = MoonvalleyTextToVideoInferenceParams( - negative_prompt=negative_prompt, - steps=kwargs.get("steps"), - seed=kwargs.get("seed"), - guidance_scale=kwargs.get("prompt_adherence"), - num_frames=num_frames, - width=width_height.get("width"), - height=width_height.get("height"), - ) + inference_params=MoonvalleyTextToVideoInferenceParams( + negative_prompt=negative_prompt, + steps=kwargs.get("steps"), + seed=kwargs.get("seed"), + guidance_scale=kwargs.get("prompt_adherence"), + num_frames=128, + width=width_height.get("width"), + height=width_height.get("height"), + ) request = MoonvalleyTextToVideoRequest( prompt_text=prompt, inference_params=inference_params ) @@ -707,22 +732,12 @@ class MoonvalleyTxt2VideoNode(BaseMoonvalleyVideoNode): NODE_CLASS_MAPPINGS = { "MoonvalleyImg2VideoNode": MoonvalleyImg2VideoNode, "MoonvalleyTxt2VideoNode": MoonvalleyTxt2VideoNode, - # "MoonvalleyVideo2VideoNode": MoonvalleyVideo2VideoNode, + "MoonvalleyVideo2VideoNode": MoonvalleyVideo2VideoNode, } NODE_DISPLAY_NAME_MAPPINGS = { "MoonvalleyImg2VideoNode": "Moonvalley Marey Image to Video", "MoonvalleyTxt2VideoNode": "Moonvalley Marey Text to Video", - # "MoonvalleyVideo2VideoNode": "Moonvalley Marey Video to Video", + "MoonvalleyVideo2VideoNode": "Moonvalley Marey Video to Video", } - - -def get_total_frames_from_length(length="5s"): - # if length == '5s': - # return 128 - # elif length == '10s': - # return 256 - return 128 - # else: - # raise MoonvalleyApiError("length is required") From 
d0210fe2e5df25b329926e20e3be32451fd5b841 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Mon, 28 Jul 2025 19:55:02 +0800 Subject: [PATCH 068/243] Update template to 0.1.41 (#9079) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 33a59b4be..14a085a2c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.40 +comfyui-workflow-templates==0.1.41 comfyui-embedded-docs==0.2.4 torch torchsde From a88788dce6b0d7b5e2876c7cd0121b45e80f4ad8 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 28 Jul 2025 05:00:23 -0700 Subject: [PATCH 069/243] Wan 2.2 support. (#9080) --- comfy/latent_formats.py | 76 ++++ comfy/ldm/wan/model.py | 16 +- comfy/ldm/wan/vae2_2.py | 726 ++++++++++++++++++++++++++++++++++++++ comfy/model_base.py | 30 +- comfy/model_detection.py | 2 + comfy/sd.py | 36 +- comfy/supported_models.py | 15 +- comfy_extras/nodes_wan.py | 44 +++ 8 files changed, 926 insertions(+), 19 deletions(-) create mode 100644 comfy/ldm/wan/vae2_2.py diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py index 82d9f9bb8..caf4991fc 100644 --- a/comfy/latent_formats.py +++ b/comfy/latent_formats.py @@ -457,6 +457,82 @@ class Wan21(LatentFormat): latents_std = self.latents_std.to(latent.device, latent.dtype) return latent * latents_std / self.scale_factor + latents_mean +class Wan22(Wan21): + latent_channels = 48 + latent_dimensions = 3 + + latent_rgb_factors = [ + [ 0.0119, 0.0103, 0.0046], + [-0.1062, -0.0504, 0.0165], + [ 0.0140, 0.0409, 0.0491], + [-0.0813, -0.0677, 0.0607], + [ 0.0656, 0.0851, 0.0808], + [ 0.0264, 0.0463, 0.0912], + [ 0.0295, 0.0326, 0.0590], + [-0.0244, -0.0270, 0.0025], + [ 0.0443, -0.0102, 0.0288], + [-0.0465, -0.0090, -0.0205], + [ 0.0359, 0.0236, 0.0082], + [-0.0776, 0.0854, 0.1048], + [ 0.0564, 0.0264, 0.0561], + [ 0.0006, 0.0594, 0.0418], + [-0.0319, -0.0542, -0.0637], + [-0.0268, 0.0024, 0.0260], + [ 0.0539, 0.0265, 0.0358], + [-0.0359, -0.0312, -0.0287], + [-0.0285, -0.1032, -0.1237], + [ 0.1041, 0.0537, 0.0622], + [-0.0086, -0.0374, -0.0051], + [ 0.0390, 0.0670, 0.2863], + [ 0.0069, 0.0144, 0.0082], + [ 0.0006, -0.0167, 0.0079], + [ 0.0313, -0.0574, -0.0232], + [-0.1454, -0.0902, -0.0481], + [ 0.0714, 0.0827, 0.0447], + [-0.0304, -0.0574, -0.0196], + [ 0.0401, 0.0384, 0.0204], + [-0.0758, -0.0297, -0.0014], + [ 0.0568, 0.1307, 0.1372], + [-0.0055, -0.0310, -0.0380], + [ 0.0239, -0.0305, 0.0325], + [-0.0663, -0.0673, -0.0140], + [-0.0416, -0.0047, -0.0023], + [ 0.0166, 0.0112, -0.0093], + [-0.0211, 0.0011, 0.0331], + [ 0.1833, 0.1466, 0.2250], + [-0.0368, 0.0370, 0.0295], + [-0.3441, -0.3543, -0.2008], + [-0.0479, -0.0489, -0.0420], + [-0.0660, -0.0153, 0.0800], + [-0.0101, 0.0068, 0.0156], + [-0.0690, -0.0452, -0.0927], + [-0.0145, 0.0041, 0.0015], + [ 0.0421, 0.0451, 0.0373], + [ 0.0504, -0.0483, -0.0356], + [-0.0837, 0.0168, 0.0055] + ] + + latent_rgb_factors_bias = [0.0317, -0.0878, -0.1388] + + def __init__(self): + self.scale_factor = 1.0 + self.latents_mean = torch.tensor([ + -0.2289, -0.0052, -0.1323, -0.2339, -0.2799, 0.0174, 0.1838, 0.1557, + -0.1382, 0.0542, 0.2813, 0.0891, 0.1570, -0.0098, 0.0375, -0.1825, + -0.2246, -0.1207, -0.0698, 0.5109, 0.2665, -0.2108, -0.2158, 0.2502, + -0.2055, -0.0322, 0.1109, 0.1567, -0.0729, 0.0899, -0.2799, -0.1230, + -0.0313, -0.1649, 0.0117, 0.0723, -0.2839, -0.2083, -0.0520, 0.3748, + 0.0152, 0.1957, 0.1433, 
-0.2944, 0.3573, -0.0548, -0.1681, -0.0667, + ]).view(1, self.latent_channels, 1, 1, 1) + self.latents_std = torch.tensor([ + 0.4765, 1.0364, 0.4514, 1.1677, 0.5313, 0.4990, 0.4818, 0.5013, + 0.8158, 1.0344, 0.5894, 1.0901, 0.6885, 0.6165, 0.8454, 0.4978, + 0.5759, 0.3523, 0.7135, 0.6804, 0.5833, 1.4146, 0.8986, 0.5659, + 0.7069, 0.5338, 0.4889, 0.4917, 0.4069, 0.4999, 0.6866, 0.4093, + 0.5709, 0.6065, 0.6415, 0.4944, 0.5726, 1.2042, 0.5458, 1.6887, + 0.3971, 1.0600, 0.3943, 0.5537, 0.5444, 0.4089, 0.7468, 0.7744 + ]).view(1, self.latent_channels, 1, 1, 1) + class Hunyuan3Dv2(LatentFormat): latent_channels = 64 latent_dimensions = 1 diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index 1d6edb354..b9e47e9f7 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -201,8 +201,10 @@ class WanAttentionBlock(nn.Module): freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2] """ # assert e.dtype == torch.float32 - - e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device) + e).chunk(6, dim=1) + if e.ndim < 4: + e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device) + e).chunk(6, dim=1) + else: + e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device).unsqueeze(0) + e).unbind(2) # assert e[0].dtype == torch.float32 # self-attention @@ -325,7 +327,10 @@ class Head(nn.Module): e(Tensor): Shape [B, C] """ # assert e.dtype == torch.float32 - e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device) + e.unsqueeze(1)).chunk(2, dim=1) + if e.ndim < 3: + e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device) + e.unsqueeze(1)).chunk(2, dim=1) + else: + e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device).unsqueeze(0) + e.unsqueeze(2)).unbind(2) x = (self.head(self.norm(x) * (1 + e[1]) + e[0])) return x @@ -506,8 +511,9 @@ class WanModel(torch.nn.Module): # time embeddings e = self.time_embedding( - sinusoidal_embedding_1d(self.freq_dim, t).to(dtype=x[0].dtype)) - e0 = self.time_projection(e).unflatten(1, (6, self.dim)) + sinusoidal_embedding_1d(self.freq_dim, t.flatten()).to(dtype=x[0].dtype)) + e = e.reshape(t.shape[0], -1, e.shape[-1]) + e0 = self.time_projection(e).unflatten(2, (6, self.dim)) # context context = self.text_embedding(context) diff --git a/comfy/ldm/wan/vae2_2.py b/comfy/ldm/wan/vae2_2.py new file mode 100644 index 000000000..c2c150e10 --- /dev/null +++ b/comfy/ldm/wan/vae2_2.py @@ -0,0 +1,726 @@ +# original version: https://github.com/Wan-Video/Wan2.2/blob/main/wan/modules/vae2_2.py +# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. 
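Stepping back to the Wan22 latent format added in latent_formats.py above: it reuses Wan21's per-channel normalization, only with 48 channels and scale_factor = 1.0. A minimal sketch of what process_out (whose expression is shown above) and its inverse amount to for a 5D video latent; process_in is written here as the assumed inverse of process_out, since that method is not visible in this hunk, and the statistics are stand-ins:

import torch

latent_channels = 48
scale_factor = 1.0
# Stand-in statistics shaped like the real buffers: (1, C, 1, 1, 1) so they
# broadcast over (B, C, T, H, W) video latents.
latents_mean = torch.randn(1, latent_channels, 1, 1, 1) * 0.1
latents_std = torch.rand(1, latent_channels, 1, 1, 1) + 0.5

def process_out(latent):
    # Same expression as the Wan21 method shown above (de-normalize).
    return latent * latents_std / scale_factor + latents_mean

def process_in(latent):
    # Assumed inverse of process_out; not copied from the source.
    return (latent - latents_mean) * scale_factor / latents_std

x = torch.randn(2, latent_channels, 13, 44, 80)  # B, C, T, H, W
assert torch.allclose(process_out(process_in(x)), x, atol=1e-5)
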
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange +from .vae import AttentionBlock, CausalConv3d, RMS_norm + +import comfy.ops +ops = comfy.ops.disable_weight_init + +CACHE_T = 2 + + +class Resample(nn.Module): + + def __init__(self, dim, mode): + assert mode in ( + "none", + "upsample2d", + "upsample3d", + "downsample2d", + "downsample3d", + ) + super().__init__() + self.dim = dim + self.mode = mode + + # layers + if mode == "upsample2d": + self.resample = nn.Sequential( + nn.Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"), + ops.Conv2d(dim, dim, 3, padding=1), + ) + elif mode == "upsample3d": + self.resample = nn.Sequential( + nn.Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"), + ops.Conv2d(dim, dim, 3, padding=1), + # ops.Conv2d(dim, dim//2, 3, padding=1) + ) + self.time_conv = CausalConv3d( + dim, dim * 2, (3, 1, 1), padding=(1, 0, 0)) + elif mode == "downsample2d": + self.resample = nn.Sequential( + nn.ZeroPad2d((0, 1, 0, 1)), + ops.Conv2d(dim, dim, 3, stride=(2, 2))) + elif mode == "downsample3d": + self.resample = nn.Sequential( + nn.ZeroPad2d((0, 1, 0, 1)), + ops.Conv2d(dim, dim, 3, stride=(2, 2))) + self.time_conv = CausalConv3d( + dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0)) + else: + self.resample = nn.Identity() + + def forward(self, x, feat_cache=None, feat_idx=[0]): + b, c, t, h, w = x.size() + if self.mode == "upsample3d": + if feat_cache is not None: + idx = feat_idx[0] + if feat_cache[idx] is None: + feat_cache[idx] = "Rep" + feat_idx[0] += 1 + else: + cache_x = x[:, :, -CACHE_T:, :, :].clone() + if (cache_x.shape[2] < 2 and feat_cache[idx] is not None and + feat_cache[idx] != "Rep"): + # cache last frame of last two chunk + cache_x = torch.cat( + [ + feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to( + cache_x.device), + cache_x, + ], + dim=2, + ) + if (cache_x.shape[2] < 2 and feat_cache[idx] is not None and + feat_cache[idx] == "Rep"): + cache_x = torch.cat( + [ + torch.zeros_like(cache_x).to(cache_x.device), + cache_x + ], + dim=2, + ) + if feat_cache[idx] == "Rep": + x = self.time_conv(x) + else: + x = self.time_conv(x, feat_cache[idx]) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + x = x.reshape(b, 2, c, t, h, w) + x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]), + 3) + x = x.reshape(b, c, t * 2, h, w) + t = x.shape[2] + x = rearrange(x, "b c t h w -> (b t) c h w") + x = self.resample(x) + x = rearrange(x, "(b t) c h w -> b c t h w", t=t) + + if self.mode == "downsample3d": + if feat_cache is not None: + idx = feat_idx[0] + if feat_cache[idx] is None: + feat_cache[idx] = x.clone() + feat_idx[0] += 1 + else: + cache_x = x[:, :, -1:, :, :].clone() + x = self.time_conv( + torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2)) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + return x + + +class ResidualBlock(nn.Module): + + def __init__(self, in_dim, out_dim, dropout=0.0): + super().__init__() + self.in_dim = in_dim + self.out_dim = out_dim + + # layers + self.residual = nn.Sequential( + RMS_norm(in_dim, images=False), + nn.SiLU(), + CausalConv3d(in_dim, out_dim, 3, padding=1), + RMS_norm(out_dim, images=False), + nn.SiLU(), + nn.Dropout(dropout), + CausalConv3d(out_dim, out_dim, 3, padding=1), + ) + self.shortcut = ( + CausalConv3d(in_dim, out_dim, 1) + if in_dim != out_dim else nn.Identity()) + + def forward(self, x, feat_cache=None, feat_idx=[0]): + h = self.shortcut(x) + for layer in self.residual: + if isinstance(layer, CausalConv3d) and feat_cache is not None: + idx = 
feat_idx[0] + cache_x = x[:, :, -CACHE_T:, :, :].clone() + if cache_x.shape[2] < 2 and feat_cache[idx] is not None: + # cache last frame of last two chunk + cache_x = torch.cat( + [ + feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to( + cache_x.device), + cache_x, + ], + dim=2, + ) + x = layer(x, feat_cache[idx]) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + else: + x = layer(x) + return x + h + + +def patchify(x, patch_size): + if patch_size == 1: + return x + if x.dim() == 4: + x = rearrange( + x, "b c (h q) (w r) -> b (c r q) h w", q=patch_size, r=patch_size) + elif x.dim() == 5: + x = rearrange( + x, + "b c f (h q) (w r) -> b (c r q) f h w", + q=patch_size, + r=patch_size, + ) + else: + raise ValueError(f"Invalid input shape: {x.shape}") + + return x + + +def unpatchify(x, patch_size): + if patch_size == 1: + return x + + if x.dim() == 4: + x = rearrange( + x, "b (c r q) h w -> b c (h q) (w r)", q=patch_size, r=patch_size) + elif x.dim() == 5: + x = rearrange( + x, + "b (c r q) f h w -> b c f (h q) (w r)", + q=patch_size, + r=patch_size, + ) + return x + + +class AvgDown3D(nn.Module): + + def __init__( + self, + in_channels, + out_channels, + factor_t, + factor_s=1, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.factor_t = factor_t + self.factor_s = factor_s + self.factor = self.factor_t * self.factor_s * self.factor_s + + assert in_channels * self.factor % out_channels == 0 + self.group_size = in_channels * self.factor // out_channels + + def forward(self, x: torch.Tensor) -> torch.Tensor: + pad_t = (self.factor_t - x.shape[2] % self.factor_t) % self.factor_t + pad = (0, 0, 0, 0, pad_t, 0) + x = F.pad(x, pad) + B, C, T, H, W = x.shape + x = x.view( + B, + C, + T // self.factor_t, + self.factor_t, + H // self.factor_s, + self.factor_s, + W // self.factor_s, + self.factor_s, + ) + x = x.permute(0, 1, 3, 5, 7, 2, 4, 6).contiguous() + x = x.view( + B, + C * self.factor, + T // self.factor_t, + H // self.factor_s, + W // self.factor_s, + ) + x = x.view( + B, + self.out_channels, + self.group_size, + T // self.factor_t, + H // self.factor_s, + W // self.factor_s, + ) + x = x.mean(dim=2) + return x + + +class DupUp3D(nn.Module): + + def __init__( + self, + in_channels: int, + out_channels: int, + factor_t, + factor_s=1, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + + self.factor_t = factor_t + self.factor_s = factor_s + self.factor = self.factor_t * self.factor_s * self.factor_s + + assert out_channels * self.factor % in_channels == 0 + self.repeats = out_channels * self.factor // in_channels + + def forward(self, x: torch.Tensor, first_chunk=False) -> torch.Tensor: + x = x.repeat_interleave(self.repeats, dim=1) + x = x.view( + x.size(0), + self.out_channels, + self.factor_t, + self.factor_s, + self.factor_s, + x.size(2), + x.size(3), + x.size(4), + ) + x = x.permute(0, 1, 5, 2, 6, 3, 7, 4).contiguous() + x = x.view( + x.size(0), + self.out_channels, + x.size(2) * self.factor_t, + x.size(4) * self.factor_s, + x.size(6) * self.factor_s, + ) + if first_chunk: + x = x[:, :, self.factor_t - 1:, :, :] + return x + + +class Down_ResidualBlock(nn.Module): + + def __init__(self, + in_dim, + out_dim, + dropout, + mult, + temperal_downsample=False, + down_flag=False): + super().__init__() + + # Shortcut path with downsample + self.avg_shortcut = AvgDown3D( + in_dim, + out_dim, + factor_t=2 if temperal_downsample else 1, + factor_s=2 if down_flag else 1, + ) + + # Main path with residual blocks and 
downsample + downsamples = [] + for _ in range(mult): + downsamples.append(ResidualBlock(in_dim, out_dim, dropout)) + in_dim = out_dim + + # Add the final downsample block + if down_flag: + mode = "downsample3d" if temperal_downsample else "downsample2d" + downsamples.append(Resample(out_dim, mode=mode)) + + self.downsamples = nn.Sequential(*downsamples) + + def forward(self, x, feat_cache=None, feat_idx=[0]): + x_copy = x.clone() + for module in self.downsamples: + x = module(x, feat_cache, feat_idx) + + return x + self.avg_shortcut(x_copy) + + +class Up_ResidualBlock(nn.Module): + + def __init__(self, + in_dim, + out_dim, + dropout, + mult, + temperal_upsample=False, + up_flag=False): + super().__init__() + # Shortcut path with upsample + if up_flag: + self.avg_shortcut = DupUp3D( + in_dim, + out_dim, + factor_t=2 if temperal_upsample else 1, + factor_s=2 if up_flag else 1, + ) + else: + self.avg_shortcut = None + + # Main path with residual blocks and upsample + upsamples = [] + for _ in range(mult): + upsamples.append(ResidualBlock(in_dim, out_dim, dropout)) + in_dim = out_dim + + # Add the final upsample block + if up_flag: + mode = "upsample3d" if temperal_upsample else "upsample2d" + upsamples.append(Resample(out_dim, mode=mode)) + + self.upsamples = nn.Sequential(*upsamples) + + def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False): + x_main = x.clone() + for module in self.upsamples: + x_main = module(x_main, feat_cache, feat_idx) + if self.avg_shortcut is not None: + x_shortcut = self.avg_shortcut(x, first_chunk) + return x_main + x_shortcut + else: + return x_main + + +class Encoder3d(nn.Module): + + def __init__( + self, + dim=128, + z_dim=4, + dim_mult=[1, 2, 4, 4], + num_res_blocks=2, + attn_scales=[], + temperal_downsample=[True, True, False], + dropout=0.0, + ): + super().__init__() + self.dim = dim + self.z_dim = z_dim + self.dim_mult = dim_mult + self.num_res_blocks = num_res_blocks + self.attn_scales = attn_scales + self.temperal_downsample = temperal_downsample + + # dimensions + dims = [dim * u for u in [1] + dim_mult] + scale = 1.0 + + # init block + self.conv1 = CausalConv3d(12, dims[0], 3, padding=1) + + # downsample blocks + downsamples = [] + for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])): + t_down_flag = ( + temperal_downsample[i] + if i < len(temperal_downsample) else False) + downsamples.append( + Down_ResidualBlock( + in_dim=in_dim, + out_dim=out_dim, + dropout=dropout, + mult=num_res_blocks, + temperal_downsample=t_down_flag, + down_flag=i != len(dim_mult) - 1, + )) + scale /= 2.0 + self.downsamples = nn.Sequential(*downsamples) + + # middle blocks + self.middle = nn.Sequential( + ResidualBlock(out_dim, out_dim, dropout), + AttentionBlock(out_dim), + ResidualBlock(out_dim, out_dim, dropout), + ) + + # # output blocks + self.head = nn.Sequential( + RMS_norm(out_dim, images=False), + nn.SiLU(), + CausalConv3d(out_dim, z_dim, 3, padding=1), + ) + + def forward(self, x, feat_cache=None, feat_idx=[0]): + + if feat_cache is not None: + idx = feat_idx[0] + cache_x = x[:, :, -CACHE_T:, :, :].clone() + if cache_x.shape[2] < 2 and feat_cache[idx] is not None: + cache_x = torch.cat( + [ + feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to( + cache_x.device), + cache_x, + ], + dim=2, + ) + x = self.conv1(x, feat_cache[idx]) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + else: + x = self.conv1(x) + + ## downsamples + for layer in self.downsamples: + if feat_cache is not None: + x = layer(x, feat_cache, feat_idx) + else: + x = layer(x) + + 
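The 12-channel conv1 in Encoder3d above comes from the patchify step: WanVAE.encode (further down) calls patchify(x, patch_size=2), which folds each 2x2 spatial patch into the channel dimension, so 3 RGB channels become 3 * 2 * 2 = 12 and H/W are halved before the conv stack. Combined with the three stride-2 downsample stages selected by dim_mult, this lines up with the 16x spatial factor registered for this VAE later in the series. A small shape check of that rearrange, standalone (torch and einops assumed available, as in the module above):

import torch
from einops import rearrange

patch_size = 2
video = torch.randn(1, 3, 9, 64, 64)  # B, C(RGB), T, H, W

# Same rearrange pattern patchify() uses for 5D inputs.
patched = rearrange(
    video, "b c f (h q) (w r) -> b (c r q) f h w", q=patch_size, r=patch_size
)
print(patched.shape)  # torch.Size([1, 12, 9, 32, 32]): 3 * 2 * 2 = 12 channels into conv1
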
## middle + for layer in self.middle: + if isinstance(layer, ResidualBlock) and feat_cache is not None: + x = layer(x, feat_cache, feat_idx) + else: + x = layer(x) + + ## head + for layer in self.head: + if isinstance(layer, CausalConv3d) and feat_cache is not None: + idx = feat_idx[0] + cache_x = x[:, :, -CACHE_T:, :, :].clone() + if cache_x.shape[2] < 2 and feat_cache[idx] is not None: + cache_x = torch.cat( + [ + feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to( + cache_x.device), + cache_x, + ], + dim=2, + ) + x = layer(x, feat_cache[idx]) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + else: + x = layer(x) + + return x + + +class Decoder3d(nn.Module): + + def __init__( + self, + dim=128, + z_dim=4, + dim_mult=[1, 2, 4, 4], + num_res_blocks=2, + attn_scales=[], + temperal_upsample=[False, True, True], + dropout=0.0, + ): + super().__init__() + self.dim = dim + self.z_dim = z_dim + self.dim_mult = dim_mult + self.num_res_blocks = num_res_blocks + self.attn_scales = attn_scales + self.temperal_upsample = temperal_upsample + + # dimensions + dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]] + # init block + self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1) + + # middle blocks + self.middle = nn.Sequential( + ResidualBlock(dims[0], dims[0], dropout), + AttentionBlock(dims[0]), + ResidualBlock(dims[0], dims[0], dropout), + ) + + # upsample blocks + upsamples = [] + for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])): + t_up_flag = temperal_upsample[i] if i < len( + temperal_upsample) else False + upsamples.append( + Up_ResidualBlock( + in_dim=in_dim, + out_dim=out_dim, + dropout=dropout, + mult=num_res_blocks + 1, + temperal_upsample=t_up_flag, + up_flag=i != len(dim_mult) - 1, + )) + self.upsamples = nn.Sequential(*upsamples) + + # output blocks + self.head = nn.Sequential( + RMS_norm(out_dim, images=False), + nn.SiLU(), + CausalConv3d(out_dim, 12, 3, padding=1), + ) + + def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False): + if feat_cache is not None: + idx = feat_idx[0] + cache_x = x[:, :, -CACHE_T:, :, :].clone() + if cache_x.shape[2] < 2 and feat_cache[idx] is not None: + cache_x = torch.cat( + [ + feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to( + cache_x.device), + cache_x, + ], + dim=2, + ) + x = self.conv1(x, feat_cache[idx]) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + else: + x = self.conv1(x) + + for layer in self.middle: + if isinstance(layer, ResidualBlock) and feat_cache is not None: + x = layer(x, feat_cache, feat_idx) + else: + x = layer(x) + + ## upsamples + for layer in self.upsamples: + if feat_cache is not None: + x = layer(x, feat_cache, feat_idx, first_chunk) + else: + x = layer(x) + + ## head + for layer in self.head: + if isinstance(layer, CausalConv3d) and feat_cache is not None: + idx = feat_idx[0] + cache_x = x[:, :, -CACHE_T:, :, :].clone() + if cache_x.shape[2] < 2 and feat_cache[idx] is not None: + cache_x = torch.cat( + [ + feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to( + cache_x.device), + cache_x, + ], + dim=2, + ) + x = layer(x, feat_cache[idx]) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + else: + x = layer(x) + return x + + +def count_conv3d(model): + count = 0 + for m in model.modules(): + if isinstance(m, CausalConv3d): + count += 1 + return count + + +class WanVAE(nn.Module): + + def __init__( + self, + dim=160, + dec_dim=256, + z_dim=16, + dim_mult=[1, 2, 4, 4], + num_res_blocks=2, + attn_scales=[], + temperal_downsample=[True, True, False], + dropout=0.0, + ): + super().__init__() + self.dim = dim 
+ self.z_dim = z_dim + self.dim_mult = dim_mult + self.num_res_blocks = num_res_blocks + self.attn_scales = attn_scales + self.temperal_downsample = temperal_downsample + self.temperal_upsample = temperal_downsample[::-1] + + # modules + self.encoder = Encoder3d( + dim, + z_dim * 2, + dim_mult, + num_res_blocks, + attn_scales, + self.temperal_downsample, + dropout, + ) + self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1) + self.conv2 = CausalConv3d(z_dim, z_dim, 1) + self.decoder = Decoder3d( + dec_dim, + z_dim, + dim_mult, + num_res_blocks, + attn_scales, + self.temperal_upsample, + dropout, + ) + + def encode(self, x): + self.clear_cache() + x = patchify(x, patch_size=2) + t = x.shape[2] + iter_ = 1 + (t - 1) // 4 + for i in range(iter_): + self._enc_conv_idx = [0] + if i == 0: + out = self.encoder( + x[:, :, :1, :, :], + feat_cache=self._enc_feat_map, + feat_idx=self._enc_conv_idx, + ) + else: + out_ = self.encoder( + x[:, :, 1 + 4 * (i - 1):1 + 4 * i, :, :], + feat_cache=self._enc_feat_map, + feat_idx=self._enc_conv_idx, + ) + out = torch.cat([out, out_], 2) + mu, log_var = self.conv1(out).chunk(2, dim=1) + self.clear_cache() + return mu + + def decode(self, z): + self.clear_cache() + iter_ = z.shape[2] + x = self.conv2(z) + for i in range(iter_): + self._conv_idx = [0] + if i == 0: + out = self.decoder( + x[:, :, i:i + 1, :, :], + feat_cache=self._feat_map, + feat_idx=self._conv_idx, + first_chunk=True, + ) + else: + out_ = self.decoder( + x[:, :, i:i + 1, :, :], + feat_cache=self._feat_map, + feat_idx=self._conv_idx, + ) + out = torch.cat([out, out_], 2) + out = unpatchify(out, patch_size=2) + self.clear_cache() + return out + + def reparameterize(self, mu, log_var): + std = torch.exp(0.5 * log_var) + eps = torch.randn_like(std) + return eps * std + mu + + def sample(self, imgs, deterministic=False): + mu, log_var = self.encode(imgs) + if deterministic: + return mu + std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0)) + return mu + std * torch.randn_like(std) + + def clear_cache(self): + self._conv_num = count_conv3d(self.decoder) + self._conv_idx = [0] + self._feat_map = [None] * self._conv_num + # cache encode + self._enc_conv_num = count_conv3d(self.encoder) + self._enc_conv_idx = [0] + self._enc_feat_map = [None] * self._enc_conv_num diff --git a/comfy/model_base.py b/comfy/model_base.py index 4392355ea..d019b991a 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1097,8 +1097,9 @@ class WAN21(BaseModel): image[:, i: i + 16] = self.process_latent_in(image[:, i: i + 16]) image = utils.resize_to_batch_size(image, noise.shape[0]) - if not self.image_to_video or extra_channels == image.shape[1]: - return image + if extra_channels != image.shape[1] + 4: + if not self.image_to_video or extra_channels == image.shape[1]: + return image if image.shape[1] > (extra_channels - 4): image = image[:, :(extra_channels - 4)] @@ -1182,6 +1183,31 @@ class WAN21_Camera(WAN21): out['camera_conditions'] = comfy.conds.CONDRegular(camera_conditions) return out +class WAN22(BaseModel): + def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=False, device=None): + super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.WanModel) + self.image_to_video = image_to_video + + def extra_conds(self, **kwargs): + out = super().extra_conds(**kwargs) + cross_attn = kwargs.get("cross_attn", None) + if cross_attn is not None: + out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) + + denoise_mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", 
None)) + if denoise_mask is not None: + out["denoise_mask"] = comfy.conds.CONDRegular(denoise_mask) + return out + + def process_timestep(self, timestep, x, denoise_mask=None, **kwargs): + if denoise_mask is None: + return timestep + temp_ts = (torch.mean(denoise_mask[:, :, :, ::2, ::2], dim=1, keepdim=True) * timestep.view([timestep.shape[0]] + [1] * (denoise_mask.ndim - 1))).reshape(timestep.shape[0], -1) + return temp_ts + + def scale_latent_inpaint(self, sigma, noise, latent_image, **kwargs): + return latent_image + class Hunyuan3Dv2(BaseModel): def __init__(self, model_config, model_type=ModelType.FLOW, device=None): super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hunyuan3d.model.Hunyuan3Dv2) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 18232ade3..9fc1f42de 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -346,7 +346,9 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config = {} dit_config["image_model"] = "wan2.1" dim = state_dict['{}head.modulation'.format(key_prefix)].shape[-1] + out_dim = state_dict['{}head.head.weight'.format(key_prefix)].shape[0] // 4 dit_config["dim"] = dim + dit_config["out_dim"] = out_dim dit_config["num_heads"] = dim // 128 dit_config["ffn_dim"] = state_dict['{}blocks.0.ffn.0.weight'.format(key_prefix)].shape[0] dit_config["num_layers"] = count_blocks(state_dict_keys, '{}blocks.'.format(key_prefix) + '{}.') diff --git a/comfy/sd.py b/comfy/sd.py index 8081b167c..e0498e585 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -14,6 +14,7 @@ import comfy.ldm.genmo.vae.model import comfy.ldm.lightricks.vae.causal_video_autoencoder import comfy.ldm.cosmos.vae import comfy.ldm.wan.vae +import comfy.ldm.wan.vae2_2 import comfy.ldm.hunyuan3d.vae import comfy.ldm.ace.vae.music_dcae_pipeline import yaml @@ -420,17 +421,30 @@ class VAE: self.memory_used_encode = lambda shape, dtype: (50 * (round((shape[2] + 7) / 8) * 8) * shape[3] * shape[4]) * model_management.dtype_size(dtype) self.working_dtypes = [torch.bfloat16, torch.float32] elif "decoder.middle.0.residual.0.gamma" in sd: - self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 8, 8) - self.upscale_index_formula = (4, 8, 8) - self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 8, 8) - self.downscale_index_formula = (4, 8, 8) - self.latent_dim = 3 - self.latent_channels = 16 - ddconfig = {"dim": 96, "z_dim": self.latent_channels, "dim_mult": [1, 2, 4, 4], "num_res_blocks": 2, "attn_scales": [], "temperal_downsample": [False, True, True], "dropout": 0.0} - self.first_stage_model = comfy.ldm.wan.vae.WanVAE(**ddconfig) - self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32] - self.memory_used_encode = lambda shape, dtype: 6000 * shape[3] * shape[4] * model_management.dtype_size(dtype) - self.memory_used_decode = lambda shape, dtype: 7000 * shape[3] * shape[4] * (8 * 8) * model_management.dtype_size(dtype) + if "decoder.upsamples.0.upsamples.0.residual.2.weight" in sd: # Wan 2.2 VAE + self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 16, 16) + self.upscale_index_formula = (4, 16, 16) + self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 16, 16) + self.downscale_index_formula = (4, 16, 16) + self.latent_dim = 3 + self.latent_channels = 48 + ddconfig = {"dim": 160, "z_dim": self.latent_channels, "dim_mult": [1, 2, 4, 4], "num_res_blocks": 2, "attn_scales": [], "temperal_downsample": [False, True, True], "dropout": 0.0} + self.first_stage_model = comfy.ldm.wan.vae2_2.WanVAE(**ddconfig) 
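The two lambdas in the Wan 2.2 branch of sd.py above encode the VAE's 4x temporal compression (alongside the 16x spatial factor): a pixel-space frame count t maps to floor((t + 3) / 4) latent frames, and decoding maps n latent frames back to 4n - 3 frames. A quick standalone check of that round trip, plain Python with no ComfyUI imports:

import math

def frames_to_latent_frames(t: int) -> int:
    # downscale_ratio above: the first frame maps to one latent frame,
    # then every further 4 frames add one more.
    return max(0, math.floor((t + 3) / 4))

def latent_frames_to_frames(n: int) -> int:
    # upscale_ratio above.
    return max(0, n * 4 - 3)

for t in (1, 49, 81):
    n = frames_to_latent_frames(t)
    print(t, "->", n, "->", latent_frames_to_frames(n))
# 1 -> 1 -> 1, 49 -> 13 -> 49, 81 -> 21 -> 81: lengths of the form 4k + 1 round-trip exactly.
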
+ self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32] + self.memory_used_encode = lambda shape, dtype: 3300 * shape[3] * shape[4] * model_management.dtype_size(dtype) + self.memory_used_decode = lambda shape, dtype: 8000 * shape[3] * shape[4] * (16 * 16) * model_management.dtype_size(dtype) + else: # Wan 2.1 VAE + self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 8, 8) + self.upscale_index_formula = (4, 8, 8) + self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 8, 8) + self.downscale_index_formula = (4, 8, 8) + self.latent_dim = 3 + self.latent_channels = 16 + ddconfig = {"dim": 96, "z_dim": self.latent_channels, "dim_mult": [1, 2, 4, 4], "num_res_blocks": 2, "attn_scales": [], "temperal_downsample": [False, True, True], "dropout": 0.0} + self.first_stage_model = comfy.ldm.wan.vae.WanVAE(**ddconfig) + self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32] + self.memory_used_encode = lambda shape, dtype: 6000 * shape[3] * shape[4] * model_management.dtype_size(dtype) + self.memory_used_decode = lambda shape, dtype: 7000 * shape[3] * shape[4] * (8 * 8) * model_management.dtype_size(dtype) elif "geo_decoder.cross_attn_decoder.ln_1.bias" in sd: self.latent_dim = 1 ln_post = "geo_decoder.ln_post.weight" in sd diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 2ca3857f7..8f3f4652d 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1059,6 +1059,19 @@ class WAN21_Vace(WAN21_T2V): out = model_base.WAN21_Vace(self, image_to_video=False, device=device) return out +class WAN22_T2V(WAN21_T2V): + unet_config = { + "image_model": "wan2.1", + "model_type": "t2v", + "out_dim": 48, + } + + latent_format = latent_formats.Wan22 + + def get_model(self, state_dict, prefix="", device=None): + out = model_base.WAN22(self, image_to_video=True, device=device) + return out + class Hunyuan3Dv2(supported_models_base.BASE): unet_config = { "image_model": "hunyuan3d2", @@ -1217,6 +1230,6 @@ class Omnigen2(supported_models_base.BASE): return supported_models_base.ClipTarget(comfy.text_encoders.omnigen2.Omnigen2Tokenizer, comfy.text_encoders.omnigen2.te(**hunyuan_detect)) -models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma, ACEStep, Omnigen2] +models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma, ACEStep, Omnigen2] models += [SVD_img2vid] diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 
d71908f31..0b92c68ac 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -685,6 +685,49 @@ class WanTrackToVideo: out_latent["samples"] = latent return (positive, negative, out_latent) + +class Wan22ImageToVideoLatent: + @classmethod + def INPUT_TYPES(s): + return {"required": {"vae": ("VAE", ), + "width": ("INT", {"default": 1280, "min": 32, "max": nodes.MAX_RESOLUTION, "step": 32}), + "height": ("INT", {"default": 704, "min": 32, "max": nodes.MAX_RESOLUTION, "step": 32}), + "length": ("INT", {"default": 49, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), + "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), + }, + "optional": {"start_image": ("IMAGE", ), + }} + + + RETURN_TYPES = ("LATENT",) + FUNCTION = "encode" + + CATEGORY = "conditioning/inpaint" + + def encode(self, vae, width, height, length, batch_size, start_image=None): + latent = torch.zeros([1, 48, ((length - 1) // 4) + 1, height // 16, width // 16], device=comfy.model_management.intermediate_device()) + + if start_image is None: + out_latent = {} + out_latent["samples"] = latent + return (out_latent,) + + mask = torch.ones([latent.shape[0], 1, ((length - 1) // 4) + 1, latent.shape[-2], latent.shape[-1]], device=comfy.model_management.intermediate_device()) + + if start_image is not None: + start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + latent_temp = vae.encode(start_image) + latent[:, :, :latent_temp.shape[-3]] = latent_temp + mask[:, :, :latent_temp.shape[-3]] *= 0.0 + + out_latent = {} + latent_format = comfy.latent_formats.Wan22() + latent = latent_format.process_out(latent) * mask + latent * (1.0 - mask) + out_latent["samples"] = latent.repeat((batch_size, ) + (1,) * (latent.ndim - 1)) + out_latent["noise_mask"] = mask.repeat((batch_size, ) + (1,) * (mask.ndim - 1)) + return (out_latent,) + + NODE_CLASS_MAPPINGS = { "WanTrackToVideo": WanTrackToVideo, "WanImageToVideo": WanImageToVideo, @@ -695,4 +738,5 @@ NODE_CLASS_MAPPINGS = { "TrimVideoLatent": TrimVideoLatent, "WanCameraImageToVideo": WanCameraImageToVideo, "WanPhantomSubjectToVideo": WanPhantomSubjectToVideo, + "Wan22ImageToVideoLatent": Wan22ImageToVideoLatent, } From 9f1388c0a38b9b6ebde0cdde904d94d709d3ca82 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 28 Jul 2025 05:01:53 -0700 Subject: [PATCH 070/243] Add wan2.2 to readme. 
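The Wan22ImageToVideoLatent node added above allocates the latent directly from the video geometry: 48 channels, ((length - 1) // 4) + 1 latent frames, and height/16 x width/16 spatial positions, then blends an "empty" latent (process_out of zeros) with the encoded start frames through the noise mask. A worked size check for the node's defaults (1280x704, 49 frames), torch only:

import torch

width, height, length, batch_size = 1280, 704, 49, 1   # the node's defaults
latent_frames = ((length - 1) // 4) + 1                 # 13
latent = torch.zeros([batch_size, 48, latent_frames, height // 16, width // 16])
mask = torch.ones([batch_size, 1, latent_frames, height // 16, width // 16])
print(latent.shape)  # torch.Size([1, 48, 13, 44, 80])

# With a start image, its encoded frames are copied into the front of `latent`,
# the matching mask entries are set to 0, and the node then blends:
#   latent = latent_format.process_out(latent) * mask + latent * (1.0 - mask)
# so process_out(zeros) only fills the positions that still need denoising.
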
(#9081) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 8a15136aa..befc4c006 100644 --- a/README.md +++ b/README.md @@ -77,6 +77,7 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith - [Hunyuan Video](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_video/) - [Nvidia Cosmos](https://comfyanonymous.github.io/ComfyUI_examples/cosmos/) and [Cosmos Predict2](https://comfyanonymous.github.io/ComfyUI_examples/cosmos_predict2/) - [Wan 2.1](https://comfyanonymous.github.io/ComfyUI_examples/wan/) + - [Wan 2.2](https://comfyanonymous.github.io/ComfyUI_examples/wan22/) - Audio Models - [Stable Audio](https://comfyanonymous.github.io/ComfyUI_examples/audio/) - [ACE Step](https://comfyanonymous.github.io/ComfyUI_examples/audio/) From 5d4cc3ba1b412b9acacd37fd23d59e0e1654f83c Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Mon, 28 Jul 2025 08:04:04 -0400 Subject: [PATCH 071/243] ComfyUI 0.3.46 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 180ecaf8a..315710dd2 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.45" +__version__ = "0.3.46" diff --git a/pyproject.toml b/pyproject.toml index b1d6d9df6..59c4c70fb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.45" +version = "0.3.46" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From c60dc4177c16d50da025bda4ac7fd513bb86e699 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 28 Jul 2025 11:48:19 -0700 Subject: [PATCH 072/243] Remove unecessary clones in the wan2.2 VAE. (#9083) --- comfy/ldm/wan/vae2_2.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/comfy/ldm/wan/vae2_2.py b/comfy/ldm/wan/vae2_2.py index c2c150e10..b9c2d1a26 100644 --- a/comfy/ldm/wan/vae2_2.py +++ b/comfy/ldm/wan/vae2_2.py @@ -136,7 +136,7 @@ class ResidualBlock(nn.Module): if in_dim != out_dim else nn.Identity()) def forward(self, x, feat_cache=None, feat_idx=[0]): - h = self.shortcut(x) + old_x = x for layer in self.residual: if isinstance(layer, CausalConv3d) and feat_cache is not None: idx = feat_idx[0] @@ -156,7 +156,7 @@ class ResidualBlock(nn.Module): feat_idx[0] += 1 else: x = layer(x) - return x + h + return x + self.shortcut(old_x) def patchify(x, patch_size): @@ -327,7 +327,7 @@ class Down_ResidualBlock(nn.Module): self.downsamples = nn.Sequential(*downsamples) def forward(self, x, feat_cache=None, feat_idx=[0]): - x_copy = x.clone() + x_copy = x for module in self.downsamples: x = module(x, feat_cache, feat_idx) @@ -369,7 +369,7 @@ class Up_ResidualBlock(nn.Module): self.upsamples = nn.Sequential(*upsamples) def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False): - x_main = x.clone() + x_main = x for module in self.upsamples: x_main = module(x_main, feat_cache, feat_idx) if self.avg_shortcut is not None: From 7d593baf919f468670425c0d9068ead8a3e9b05f Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 29 Jul 2025 01:07:45 -0700 Subject: [PATCH 073/243] Extra reserved vram on large cards on windows. 
(#9093) --- comfy/model_management.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/comfy/model_management.py b/comfy/model_management.py index 232d363aa..9e6149d60 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -529,6 +529,8 @@ WINDOWS = any(platform.win32_ver()) EXTRA_RESERVED_VRAM = 400 * 1024 * 1024 if WINDOWS: EXTRA_RESERVED_VRAM = 600 * 1024 * 1024 #Windows is higher because of the shared vram issue + if total_vram > (15 * 1024): # more extra reserved vram on 16GB+ cards + EXTRA_RESERVED_VRAM += 100 * 1024 * 1024 if args.reserve_vram is not None: EXTRA_RESERVED_VRAM = args.reserve_vram * 1024 * 1024 * 1024 From dca6bdd4fa70770f910ea80e61c72e1f450530d9 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 29 Jul 2025 16:44:18 -0700 Subject: [PATCH 074/243] Make wan2.2 5B i2v take a lot less memory. (#9102) --- comfy/ldm/wan/model.py | 21 ++++++++++++++++----- comfy/model_base.py | 2 +- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index b9e47e9f7..a93a13c86 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -146,6 +146,15 @@ WAN_CROSSATTENTION_CLASSES = { } +def repeat_e(e, x): + repeats = 1 + if e.shape[1] > 1: + repeats = x.shape[1] // e.shape[1] + if repeats == 1: + return e + return torch.repeat_interleave(e, repeats, dim=1) + + class WanAttentionBlock(nn.Module): def __init__(self, @@ -201,6 +210,7 @@ class WanAttentionBlock(nn.Module): freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2] """ # assert e.dtype == torch.float32 + if e.ndim < 4: e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device) + e).chunk(6, dim=1) else: @@ -209,15 +219,15 @@ class WanAttentionBlock(nn.Module): # self-attention y = self.self_attn( - self.norm1(x) * (1 + e[1]) + e[0], + self.norm1(x) * (1 + repeat_e(e[1], x)) + repeat_e(e[0], x), freqs) - x = x + y * e[2] + x = x + y * repeat_e(e[2], x) # cross-attention & ffn x = x + self.cross_attn(self.norm3(x), context, context_img_len=context_img_len) - y = self.ffn(self.norm2(x) * (1 + e[4]) + e[3]) - x = x + y * e[5] + y = self.ffn(self.norm2(x) * (1 + repeat_e(e[4], x)) + repeat_e(e[3], x)) + x = x + y * repeat_e(e[5], x) return x @@ -331,7 +341,8 @@ class Head(nn.Module): e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device) + e.unsqueeze(1)).chunk(2, dim=1) else: e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device).unsqueeze(0) + e.unsqueeze(2)).unbind(2) - x = (self.head(self.norm(x) * (1 + e[1]) + e[0])) + + x = (self.head(self.norm(x) * (1 + repeat_e(e[1], x)) + repeat_e(e[0], x))) return x diff --git a/comfy/model_base.py b/comfy/model_base.py index d019b991a..6b7978949 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1202,7 +1202,7 @@ class WAN22(BaseModel): def process_timestep(self, timestep, x, denoise_mask=None, **kwargs): if denoise_mask is None: return timestep - temp_ts = (torch.mean(denoise_mask[:, :, :, ::2, ::2], dim=1, keepdim=True) * timestep.view([timestep.shape[0]] + [1] * (denoise_mask.ndim - 1))).reshape(timestep.shape[0], -1) + temp_ts = (torch.mean(denoise_mask[:, :, :, :, :], dim=(1, 3, 4), keepdim=True) * timestep.view([timestep.shape[0]] + [1] * (denoise_mask.ndim - 1))).reshape(timestep.shape[0], -1) return temp_ts def scale_latent_inpaint(self, sigma, noise, latent_image, **kwargs): From 2f74e17975696d829af455845c584574bbc85774 Mon Sep 17 00:00:00 
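The repeat_e helper introduced in the patch above ("Make wan2.2 5B i2v take a lot less memory") keeps the modulation signal at per-frame resolution and only expands it to token resolution inside each block: e carries one entry per latent frame (or a single entry in the classic one-timestep case), while x carries frames * tokens_per_frame entries, so repeat_interleave stretches each frame's modulation across that frame's tokens. A small shape demo; the sizes are illustrative:

import torch

def repeat_e(e, x):
    # Same logic as the helper above.
    repeats = 1
    if e.shape[1] > 1:
        repeats = x.shape[1] // e.shape[1]
    if repeats == 1:
        return e
    return torch.repeat_interleave(e, repeats, dim=1)

dim, frames, tokens_per_frame = 8, 13, 40            # illustrative sizes
x = torch.randn(1, frames * tokens_per_frame, dim)   # [B, tokens, dim]
e_single = torch.randn(1, 1, dim)                    # one timestep for the whole video
e_per_frame = torch.randn(1, frames, dim)            # one timestep per latent frame

print(repeat_e(e_single, x).shape)     # torch.Size([1, 1, 8])   (broadcasts as-is)
print(repeat_e(e_per_frame, x).shape)  # torch.Size([1, 520, 8]) (one row per token)
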
2001 From: comfyanonymous Date: Tue, 29 Jul 2025 20:08:25 -0400 Subject: [PATCH 075/243] ComfyUI version 0.3.47 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 315710dd2..20a2e892a 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.46" +__version__ = "0.3.47" diff --git a/pyproject.toml b/pyproject.toml index 59c4c70fb..723c93069 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.46" +version = "0.3.47" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From 0a3d062e0660741146d50f6601e3eeca211d92d5 Mon Sep 17 00:00:00 2001 From: guill Date: Tue, 29 Jul 2025 19:17:22 -0700 Subject: [PATCH 076/243] ComfyAPI Core v0.0.2 (#8962) * ComfyAPI Core v0.0.2 * Respond to PR feedback * Fix Python 3.9 errors * Fix missing backward compatibility proxy * Reorganize types a bit The input types, input impls, and utility types are now all available in the versioned API. See the change in `comfy_extras/nodes_video.py` for an example of their usage. * Remove the need for `--generate-api-stubs` * Fix generated stubs differing by Python version * Fix ruff formatting issues --- comfy_api/generate_api_stubs.py | 86 ++ comfy_api/input/__init__.py | 12 +- comfy_api/input/basic_types.py | 32 +- comfy_api/input/video_types.py | 89 +- comfy_api/input_impl/__init__.py | 4 +- comfy_api/input_impl/video_types.py | 326 +----- comfy_api/internal/__init__.py | 7 + comfy_api/internal/api_registry.py | 39 + comfy_api/internal/async_to_sync.py | 987 ++++++++++++++++++ comfy_api/internal/singleton.py | 33 + comfy_api/latest/__init__.py | 106 ++ comfy_api/latest/_input/__init__.py | 10 + comfy_api/latest/_input/basic_types.py | 42 + comfy_api/latest/_input/video_types.py | 85 ++ comfy_api/latest/_input_impl/__init__.py | 7 + comfy_api/latest/_input_impl/video_types.py | 324 ++++++ comfy_api/latest/_util/__init__.py | 8 + comfy_api/latest/_util/video_types.py | 52 + .../latest/generated/ComfyAPISyncStub.pyi | 20 + comfy_api/util.py | 8 + comfy_api/util/__init__.py | 4 +- comfy_api/util/video_types.py | 61 +- comfy_api/v0_0_1/__init__.py | 42 + .../v0_0_1/generated/ComfyAPISyncStub.pyi | 20 + comfy_api/v0_0_2/__init__.py | 43 + .../v0_0_2/generated/ComfyAPISyncStub.pyi | 20 + comfy_api/version_list.py | 12 + comfy_api_nodes/apis/request_logger.py | 2 + comfy_api_nodes/nodes_gemini.py | 2 + comfy_execution/progress.py | 16 +- comfy_extras/nodes_video.py | 23 +- main.py | 4 +- nodes.py | 36 +- pyproject.toml | 2 +- .../testing_nodes/testing-pack/__init__.py | 4 +- .../testing-pack/api_test_nodes.py | 78 ++ 36 files changed, 2128 insertions(+), 518 deletions(-) create mode 100644 comfy_api/generate_api_stubs.py create mode 100644 comfy_api/internal/__init__.py create mode 100644 comfy_api/internal/api_registry.py create mode 100644 comfy_api/internal/async_to_sync.py create mode 100644 comfy_api/internal/singleton.py create mode 100644 comfy_api/latest/__init__.py create mode 100644 comfy_api/latest/_input/__init__.py create mode 100644 comfy_api/latest/_input/basic_types.py create mode 100644 comfy_api/latest/_input/video_types.py create mode 100644 comfy_api/latest/_input_impl/__init__.py create mode 100644 comfy_api/latest/_input_impl/video_types.py create mode 100644 
comfy_api/latest/_util/__init__.py create mode 100644 comfy_api/latest/_util/video_types.py create mode 100644 comfy_api/latest/generated/ComfyAPISyncStub.pyi create mode 100644 comfy_api/util.py create mode 100644 comfy_api/v0_0_1/__init__.py create mode 100644 comfy_api/v0_0_1/generated/ComfyAPISyncStub.pyi create mode 100644 comfy_api/v0_0_2/__init__.py create mode 100644 comfy_api/v0_0_2/generated/ComfyAPISyncStub.pyi create mode 100644 comfy_api/version_list.py create mode 100644 tests/inference/testing_nodes/testing-pack/api_test_nodes.py diff --git a/comfy_api/generate_api_stubs.py b/comfy_api/generate_api_stubs.py new file mode 100644 index 000000000..604a7eced --- /dev/null +++ b/comfy_api/generate_api_stubs.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 +""" +Script to generate .pyi stub files for the synchronous API wrappers. +This allows generating stubs without running the full ComfyUI application. +""" + +import os +import sys +import logging +import importlib + +# Add ComfyUI to path so we can import modules +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from comfy_api.internal.async_to_sync import AsyncToSyncConverter +from comfy_api.version_list import supported_versions + + +def generate_stubs_for_module(module_name: str) -> None: + """Generate stub files for a specific module that exports ComfyAPI and ComfyAPISync.""" + try: + # Import the module + module = importlib.import_module(module_name) + + # Check if module has ComfyAPISync (the sync wrapper) + if hasattr(module, "ComfyAPISync"): + # Module already has a sync class + api_class = getattr(module, "ComfyAPI", None) + sync_class = getattr(module, "ComfyAPISync") + + if api_class: + # Generate the stub file + AsyncToSyncConverter.generate_stub_file(api_class, sync_class) + logging.info(f"Generated stub file for {module_name}") + else: + logging.warning( + f"Module {module_name} has ComfyAPISync but no ComfyAPI" + ) + + elif hasattr(module, "ComfyAPI"): + # Module only has async API, need to create sync wrapper first + from comfy_api.internal.async_to_sync import create_sync_class + + api_class = getattr(module, "ComfyAPI") + sync_class = create_sync_class(api_class) + + # Generate the stub file + AsyncToSyncConverter.generate_stub_file(api_class, sync_class) + logging.info(f"Generated stub file for {module_name}") + else: + logging.warning( + f"Module {module_name} does not export ComfyAPI or ComfyAPISync" + ) + + except Exception as e: + logging.error(f"Failed to generate stub for {module_name}: {e}") + import traceback + + traceback.print_exc() + + +def main(): + """Main function to generate all API stub files.""" + logging.basicConfig(level=logging.INFO) + + logging.info("Starting stub generation...") + + # Dynamically get module names from supported_versions + api_modules = [] + for api_class in supported_versions: + # Extract module name from the class + module_name = api_class.__module__ + if module_name not in api_modules: + api_modules.append(module_name) + + logging.info(f"Found {len(api_modules)} API modules: {api_modules}") + + # Generate stubs for each module + for module_name in api_modules: + generate_stubs_for_module(module_name) + + logging.info("Stub generation complete!") + + +if __name__ == "__main__": + main() diff --git a/comfy_api/input/__init__.py b/comfy_api/input/__init__.py index 66667946f..68ff78270 100644 --- a/comfy_api/input/__init__.py +++ b/comfy_api/input/__init__.py @@ -1,8 +1,16 @@ -from .basic_types import ImageInput, AudioInput -from .video_types 
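The generate_api_stubs.py script added above walks every module referenced by supported_versions; its per-module logic reduces to: import the module, get or synthesize the sync wrapper, then write the .pyi. A hedged sketch of doing that for a single module by hand; the module name "comfy_api.latest" is inferred from this patch's file list and should be treated as illustrative:

import importlib

from comfy_api.internal.async_to_sync import AsyncToSyncConverter, create_sync_class

# "comfy_api.latest" is an assumption here; the real script derives module
# names from the classes in comfy_api.version_list.supported_versions.
module = importlib.import_module("comfy_api.latest")
api_class = getattr(module, "ComfyAPI")
sync_class = getattr(module, "ComfyAPISync", None) or create_sync_class(api_class)

# Presumably produces the generated/ComfyAPISyncStub.pyi files added in this patch.
AsyncToSyncConverter.generate_stub_file(api_class, sync_class)
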
import VideoInput +# This file only exists for backwards compatibility. +from comfy_api.latest._input import ( + ImageInput, + AudioInput, + MaskInput, + LatentInput, + VideoInput, +) __all__ = [ "ImageInput", "AudioInput", + "MaskInput", + "LatentInput", "VideoInput", ] diff --git a/comfy_api/input/basic_types.py b/comfy_api/input/basic_types.py index 033fb7e27..5eadce86a 100644 --- a/comfy_api/input/basic_types.py +++ b/comfy_api/input/basic_types.py @@ -1,20 +1,14 @@ -import torch -from typing import TypedDict - -ImageInput = torch.Tensor -""" -An image in format [B, H, W, C] where B is the batch size, C is the number of channels, -""" - -class AudioInput(TypedDict): - """ - TypedDict representing audio input. - """ - - waveform: torch.Tensor - """ - Tensor in the format [B, C, T] where B is the batch size, C is the number of channels, - """ - - sample_rate: int +# This file only exists for backwards compatibility. +from comfy_api.latest._input.basic_types import ( + ImageInput, + AudioInput, + MaskInput, + LatentInput, +) +__all__ = [ + "ImageInput", + "AudioInput", + "MaskInput", + "LatentInput", +] diff --git a/comfy_api/input/video_types.py b/comfy_api/input/video_types.py index 5d95dc507..9ace78cbc 100644 --- a/comfy_api/input/video_types.py +++ b/comfy_api/input/video_types.py @@ -1,85 +1,6 @@ -from __future__ import annotations -from abc import ABC, abstractmethod -from typing import Optional, Union -import io -import av -from comfy_api.util import VideoContainer, VideoCodec, VideoComponents +# This file only exists for backwards compatibility. +from comfy_api.latest._input.video_types import VideoInput -class VideoInput(ABC): - """ - Abstract base class for video input types. - """ - - @abstractmethod - def get_components(self) -> VideoComponents: - """ - Abstract method to get the video components (images, audio, and frame rate). - - Returns: - VideoComponents containing images, audio, and frame rate - """ - pass - - @abstractmethod - def save_to( - self, - path: str, - format: VideoContainer = VideoContainer.AUTO, - codec: VideoCodec = VideoCodec.AUTO, - metadata: Optional[dict] = None - ): - """ - Abstract method to save the video input to a file. - """ - pass - - def get_stream_source(self) -> Union[str, io.BytesIO]: - """ - Get a streamable source for the video. This allows processing without - loading the entire video into memory. - - Returns: - Either a file path (str) or a BytesIO object that can be opened with av. - - Default implementation creates a BytesIO buffer, but subclasses should - override this for better performance when possible. - """ - buffer = io.BytesIO() - self.save_to(buffer) - buffer.seek(0) - return buffer - - # Provide a default implementation, but subclasses can provide optimized versions - # if possible. - def get_dimensions(self) -> tuple[int, int]: - """ - Returns the dimensions of the video input. - - Returns: - Tuple of (width, height) - """ - components = self.get_components() - return components.images.shape[2], components.images.shape[1] - - def get_duration(self) -> float: - """ - Returns the duration of the video in seconds. - - Returns: - Duration in seconds - """ - components = self.get_components() - frame_count = components.images.shape[0] - return float(frame_count / components.frame_rate) - - def get_container_format(self) -> str: - """ - Returns the container format of the video (e.g., 'mp4', 'mov', 'avi'). 
- - Returns: - Container format as string - """ - # Default implementation - subclasses should override for better performance - source = self.get_stream_source() - with av.open(source, mode="r") as container: - return container.format.name +__all__ = [ + "VideoInput", +] diff --git a/comfy_api/input_impl/__init__.py b/comfy_api/input_impl/__init__.py index 02901b8b9..b78ff0c08 100644 --- a/comfy_api/input_impl/__init__.py +++ b/comfy_api/input_impl/__init__.py @@ -1,7 +1,7 @@ -from .video_types import VideoFromFile, VideoFromComponents +# This file only exists for backwards compatibility. +from comfy_api.latest._input_impl import VideoFromFile, VideoFromComponents __all__ = [ - # Implementations "VideoFromFile", "VideoFromComponents", ] diff --git a/comfy_api/input_impl/video_types.py b/comfy_api/input_impl/video_types.py index 91e7c1bfa..bd2e56ad5 100644 --- a/comfy_api/input_impl/video_types.py +++ b/comfy_api/input_impl/video_types.py @@ -1,324 +1,2 @@ -from __future__ import annotations -from av.container import InputContainer -from av.subtitles.stream import SubtitleStream -from fractions import Fraction -from typing import Optional -from comfy_api.input import AudioInput -import av -import io -import json -import numpy as np -import torch -from comfy_api.input import VideoInput -from comfy_api.util import VideoContainer, VideoCodec, VideoComponents - - -def container_to_output_format(container_format: str | None) -> str | None: - """ - A container's `format` may be a comma-separated list of formats. - E.g., iso container's `format` may be `mov,mp4,m4a,3gp,3g2,mj2`. - However, writing to a file/stream with `av.open` requires a single format, - or `None` to auto-detect. - """ - if not container_format: - return None # Auto-detect - - if "," not in container_format: - return container_format - - formats = container_format.split(",") - return formats[0] - - -def get_open_write_kwargs( - dest: str | io.BytesIO, container_format: str, to_format: str | None -) -> dict: - """Get kwargs for writing a `VideoFromFile` to a file/stream with `av.open`""" - open_kwargs = { - "mode": "w", - # If isobmff, preserve custom metadata tags (workflow, prompt, extra_pnginfo) - "options": {"movflags": "use_metadata_tags"}, - } - - is_write_to_buffer = isinstance(dest, io.BytesIO) - if is_write_to_buffer: - # Set output format explicitly, since it cannot be inferred from file extension - if to_format == VideoContainer.AUTO: - to_format = container_format.lower() - elif isinstance(to_format, str): - to_format = to_format.lower() - open_kwargs["format"] = container_to_output_format(to_format) - - return open_kwargs - - -class VideoFromFile(VideoInput): - """ - Class representing video input from a file. - """ - - def __init__(self, file: str | io.BytesIO): - """ - Initialize the VideoFromFile object based off of either a path on disk or a BytesIO object - containing the file contents. - """ - self.__file = file - - def get_stream_source(self) -> str | io.BytesIO: - """ - Return the underlying file source for efficient streaming. - This avoids unnecessary memory copies when the source is already a file path. - """ - if isinstance(self.__file, io.BytesIO): - self.__file.seek(0) - return self.__file - - def get_dimensions(self) -> tuple[int, int]: - """ - Returns the dimensions of the video input. 
- - Returns: - Tuple of (width, height) - """ - if isinstance(self.__file, io.BytesIO): - self.__file.seek(0) # Reset the BytesIO object to the beginning - with av.open(self.__file, mode='r') as container: - for stream in container.streams: - if stream.type == 'video': - assert isinstance(stream, av.VideoStream) - return stream.width, stream.height - raise ValueError(f"No video stream found in file '{self.__file}'") - - def get_duration(self) -> float: - """ - Returns the duration of the video in seconds. - - Returns: - Duration in seconds - """ - if isinstance(self.__file, io.BytesIO): - self.__file.seek(0) - with av.open(self.__file, mode="r") as container: - if container.duration is not None: - return float(container.duration / av.time_base) - - # Fallback: calculate from frame count and frame rate - video_stream = next( - (s for s in container.streams if s.type == "video"), None - ) - if video_stream and video_stream.frames and video_stream.average_rate: - return float(video_stream.frames / video_stream.average_rate) - - # Last resort: decode frames to count them - if video_stream and video_stream.average_rate: - frame_count = 0 - container.seek(0) - for packet in container.demux(video_stream): - for _ in packet.decode(): - frame_count += 1 - if frame_count > 0: - return float(frame_count / video_stream.average_rate) - - raise ValueError(f"Could not determine duration for file '{self.__file}'") - - def get_container_format(self) -> str: - """ - Returns the container format of the video (e.g., 'mp4', 'mov', 'avi'). - - Returns: - Container format as string - """ - if isinstance(self.__file, io.BytesIO): - self.__file.seek(0) - with av.open(self.__file, mode='r') as container: - return container.format.name - - def get_components_internal(self, container: InputContainer) -> VideoComponents: - # Get video frames - frames = [] - for frame in container.decode(video=0): - img = frame.to_ndarray(format='rgb24') # shape: (H, W, 3) - img = torch.from_numpy(img) / 255.0 # shape: (H, W, 3) - frames.append(img) - - images = torch.stack(frames) if len(frames) > 0 else torch.zeros(0, 3, 0, 0) - - # Get frame rate - video_stream = next(s for s in container.streams if s.type == 'video') - frame_rate = Fraction(video_stream.average_rate) if video_stream and video_stream.average_rate else Fraction(1) - - # Get audio if available - audio = None - try: - container.seek(0) # Reset the container to the beginning - for stream in container.streams: - if stream.type != 'audio': - continue - assert isinstance(stream, av.AudioStream) - audio_frames = [] - for packet in container.demux(stream): - for frame in packet.decode(): - assert isinstance(frame, av.AudioFrame) - audio_frames.append(frame.to_ndarray()) # shape: (channels, samples) - if len(audio_frames) > 0: - audio_data = np.concatenate(audio_frames, axis=1) # shape: (channels, total_samples) - audio_tensor = torch.from_numpy(audio_data).unsqueeze(0) # shape: (1, channels, total_samples) - audio = AudioInput({ - "waveform": audio_tensor, - "sample_rate": int(stream.sample_rate) if stream.sample_rate else 1, - }) - except StopIteration: - pass # No audio stream - - metadata = container.metadata - return VideoComponents(images=images, audio=audio, frame_rate=frame_rate, metadata=metadata) - - def get_components(self) -> VideoComponents: - if isinstance(self.__file, io.BytesIO): - self.__file.seek(0) # Reset the BytesIO object to the beginning - with av.open(self.__file, mode='r') as container: - return self.get_components_internal(container) - raise 
ValueError(f"No video stream found in file '{self.__file}'") - - def save_to( - self, - path: str | io.BytesIO, - format: VideoContainer = VideoContainer.AUTO, - codec: VideoCodec = VideoCodec.AUTO, - metadata: Optional[dict] = None - ): - if isinstance(self.__file, io.BytesIO): - self.__file.seek(0) # Reset the BytesIO object to the beginning - with av.open(self.__file, mode='r') as container: - container_format = container.format.name - video_encoding = container.streams.video[0].codec.name if len(container.streams.video) > 0 else None - reuse_streams = True - if format != VideoContainer.AUTO and format not in container_format.split(","): - reuse_streams = False - if codec != VideoCodec.AUTO and codec != video_encoding and video_encoding is not None: - reuse_streams = False - - if not reuse_streams: - components = self.get_components_internal(container) - video = VideoFromComponents(components) - return video.save_to( - path, - format=format, - codec=codec, - metadata=metadata - ) - - streams = container.streams - - open_kwargs = get_open_write_kwargs(path, container_format, format) - with av.open(path, **open_kwargs) as output_container: - # Copy over the original metadata - for key, value in container.metadata.items(): - if metadata is None or key not in metadata: - output_container.metadata[key] = value - - # Add our new metadata - if metadata is not None: - for key, value in metadata.items(): - if isinstance(value, str): - output_container.metadata[key] = value - else: - output_container.metadata[key] = json.dumps(value) - - # Add streams to the new container - stream_map = {} - for stream in streams: - if isinstance(stream, (av.VideoStream, av.AudioStream, SubtitleStream)): - out_stream = output_container.add_stream_from_template(template=stream, opaque=True) - stream_map[stream] = out_stream - - # Write packets to the new container - for packet in container.demux(): - if packet.stream in stream_map and packet.dts is not None: - packet.stream = stream_map[packet.stream] - output_container.mux(packet) - -class VideoFromComponents(VideoInput): - """ - Class representing video input from tensors. 
- """ - - def __init__(self, components: VideoComponents): - self.__components = components - - def get_components(self) -> VideoComponents: - return VideoComponents( - images=self.__components.images, - audio=self.__components.audio, - frame_rate=self.__components.frame_rate - ) - - def save_to( - self, - path: str, - format: VideoContainer = VideoContainer.AUTO, - codec: VideoCodec = VideoCodec.AUTO, - metadata: Optional[dict] = None - ): - if format != VideoContainer.AUTO and format != VideoContainer.MP4: - raise ValueError("Only MP4 format is supported for now") - if codec != VideoCodec.AUTO and codec != VideoCodec.H264: - raise ValueError("Only H264 codec is supported for now") - with av.open(path, mode='w', options={'movflags': 'use_metadata_tags'}) as output: - # Add metadata before writing any streams - if metadata is not None: - for key, value in metadata.items(): - output.metadata[key] = json.dumps(value) - - frame_rate = Fraction(round(self.__components.frame_rate * 1000), 1000) - # Create a video stream - video_stream = output.add_stream('h264', rate=frame_rate) - video_stream.width = self.__components.images.shape[2] - video_stream.height = self.__components.images.shape[1] - video_stream.pix_fmt = 'yuv420p' - - # Create an audio stream - audio_sample_rate = 1 - audio_stream: Optional[av.AudioStream] = None - if self.__components.audio: - audio_sample_rate = int(self.__components.audio['sample_rate']) - audio_stream = output.add_stream('aac', rate=audio_sample_rate) - audio_stream.sample_rate = audio_sample_rate - audio_stream.format = 'fltp' - - # Encode video - for i, frame in enumerate(self.__components.images): - img = (frame * 255).clamp(0, 255).byte().cpu().numpy() # shape: (H, W, 3) - frame = av.VideoFrame.from_ndarray(img, format='rgb24') - frame = frame.reformat(format='yuv420p') # Convert to YUV420P as required by h264 - packet = video_stream.encode(frame) - output.mux(packet) - - # Flush video - packet = video_stream.encode(None) - output.mux(packet) - - if audio_stream and self.__components.audio: - # Encode audio - samples_per_frame = int(audio_sample_rate / frame_rate) - num_frames = self.__components.audio['waveform'].shape[2] // samples_per_frame - for i in range(num_frames): - start = i * samples_per_frame - end = start + samples_per_frame - # TODO(Feature) - Add support for stereo audio - chunk = ( - self.__components.audio["waveform"][0, 0, start:end] - .unsqueeze(0) - .contiguous() - .numpy() - ) - audio_frame = av.AudioFrame.from_ndarray(chunk, format='fltp', layout='mono') - audio_frame.sample_rate = audio_sample_rate - audio_frame.pts = i * samples_per_frame - for packet in audio_stream.encode(audio_frame): - output.mux(packet) - - # Flush audio - for packet in audio_stream.encode(None): - output.mux(packet) - +# This file only exists for backwards compatibility. 
+from comfy_api.latest._input_impl.video_types import * # noqa: F403 diff --git a/comfy_api/internal/__init__.py b/comfy_api/internal/__init__.py new file mode 100644 index 000000000..c00b1fdbb --- /dev/null +++ b/comfy_api/internal/__init__.py @@ -0,0 +1,7 @@ +# Internal infrastructure for ComfyAPI +from .api_registry import ( + ComfyAPIBase as ComfyAPIBase, + ComfyAPIWithVersion as ComfyAPIWithVersion, + register_versions as register_versions, + get_all_versions as get_all_versions, +) diff --git a/comfy_api/internal/api_registry.py b/comfy_api/internal/api_registry.py new file mode 100644 index 000000000..7e3375cf6 --- /dev/null +++ b/comfy_api/internal/api_registry.py @@ -0,0 +1,39 @@ +from typing import Type, List, NamedTuple +from comfy_api.internal.singleton import ProxiedSingleton +from packaging import version as packaging_version + + +class ComfyAPIBase(ProxiedSingleton): + def __init__(self): + pass + + +class ComfyAPIWithVersion(NamedTuple): + version: str + api_class: Type[ComfyAPIBase] + + +def parse_version(version_str: str) -> packaging_version.Version: + """ + Parses a version string into a packaging_version.Version object. + Raises ValueError if the version string is invalid. + """ + if version_str == "latest": + return packaging_version.parse("9999999.9999999.9999999") + return packaging_version.parse(version_str) + + +registered_versions: List[ComfyAPIWithVersion] = [] + + +def register_versions(versions: List[ComfyAPIWithVersion]): + versions.sort(key=lambda x: parse_version(x.version)) + global registered_versions + registered_versions = versions + + +def get_all_versions() -> List[ComfyAPIWithVersion]: + """ + Returns a list of all registered ComfyAPI versions. + """ + return registered_versions diff --git a/comfy_api/internal/async_to_sync.py b/comfy_api/internal/async_to_sync.py new file mode 100644 index 000000000..f5f805a62 --- /dev/null +++ b/comfy_api/internal/async_to_sync.py @@ -0,0 +1,987 @@ +import asyncio +import concurrent.futures +import contextvars +import functools +import inspect +import logging +import os +import textwrap +import threading +from enum import Enum +from typing import Optional, Type, get_origin, get_args + + +class TypeTracker: + """Tracks types discovered during stub generation for automatic import generation.""" + + def __init__(self): + self.discovered_types = {} # type_name -> (module, qualname) + self.builtin_types = { + "Any", + "Dict", + "List", + "Optional", + "Tuple", + "Union", + "Set", + "Sequence", + "cast", + "NamedTuple", + "str", + "int", + "float", + "bool", + "None", + "bytes", + "object", + "type", + "dict", + "list", + "tuple", + "set", + } + self.already_imported = ( + set() + ) # Track types already imported to avoid duplicates + + def track_type(self, annotation): + """Track a type annotation and record its module/import info.""" + if annotation is None or annotation is type(None): + return + + # Skip builtins and typing module types we already import + type_name = getattr(annotation, "__name__", None) + if type_name and ( + type_name in self.builtin_types or type_name in self.already_imported + ): + return + + # Get module and qualname + module = getattr(annotation, "__module__", None) + qualname = getattr(annotation, "__qualname__", type_name or "") + + # Skip types from typing module (they're already imported) + if module == "typing": + return + + # Skip UnionType and GenericAlias from types module as they're handled specially + if module == "types" and type_name in ("UnionType", "GenericAlias"): + return + + if 
module and module not in ["builtins", "__main__"]: + # Store the type info + if type_name: + self.discovered_types[type_name] = (module, qualname) + + def get_imports(self, main_module_name: str) -> list[str]: + """Generate import statements for all discovered types.""" + imports = [] + imports_by_module = {} + + for type_name, (module, qualname) in sorted(self.discovered_types.items()): + # Skip types from the main module (they're already imported) + if main_module_name and module == main_module_name: + continue + + if module not in imports_by_module: + imports_by_module[module] = [] + if type_name not in imports_by_module[module]: # Avoid duplicates + imports_by_module[module].append(type_name) + + # Generate import statements + for module, types in sorted(imports_by_module.items()): + if len(types) == 1: + imports.append(f"from {module} import {types[0]}") + else: + imports.append(f"from {module} import {', '.join(sorted(set(types)))}") + + return imports + + +class AsyncToSyncConverter: + """ + Provides utilities to convert async classes to sync classes with proper type hints. + """ + + _thread_pool: Optional[concurrent.futures.ThreadPoolExecutor] = None + _thread_pool_lock = threading.Lock() + _thread_pool_initialized = False + + @classmethod + def get_thread_pool(cls, max_workers=None) -> concurrent.futures.ThreadPoolExecutor: + """Get or create the shared thread pool with proper thread-safe initialization.""" + # Fast path - check if already initialized without acquiring lock + if cls._thread_pool_initialized: + assert cls._thread_pool is not None, "Thread pool should be initialized" + return cls._thread_pool + + # Slow path - acquire lock and create pool if needed + with cls._thread_pool_lock: + if not cls._thread_pool_initialized: + cls._thread_pool = concurrent.futures.ThreadPoolExecutor( + max_workers=max_workers, thread_name_prefix="async_to_sync_" + ) + cls._thread_pool_initialized = True + + # This should never be None at this point, but add assertion for type checker + assert cls._thread_pool is not None + return cls._thread_pool + + @classmethod + def run_async_in_thread(cls, coro_func, *args, **kwargs): + """ + Run an async function in a separate thread from the thread pool. + Blocks until the async function completes. + Properly propagates contextvars between threads and manages event loops. 
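For reference, the converter's core primitive can be exercised on its own. A minimal sketch, where the fetch_value coroutine is illustrative and only run_async_in_thread comes from this patch:

    import asyncio
    from comfy_api.internal.async_to_sync import AsyncToSyncConverter

    async def fetch_value(delay: float) -> str:
        await asyncio.sleep(delay)
        return "done"

    # Blocks the calling thread while the coroutine runs on its own event loop
    # inside the shared thread pool, with contextvars copied across.
    result = AsyncToSyncConverter.run_async_in_thread(fetch_value, 0.1)
    assert result == "done"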
+ """ + # Capture current context - this includes all context variables + context = contextvars.copy_context() + + # Store the result and any exception that occurs + result_container: dict = {"result": None, "exception": None} + + # Function that runs in the thread pool + def run_in_thread(): + # Create new event loop for this thread + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + try: + # Create the coroutine within the context + async def run_with_context(): + # The coroutine function might access context variables + return await coro_func(*args, **kwargs) + + # Run the coroutine with the captured context + # This ensures all context variables are available in the async function + result = context.run(loop.run_until_complete, run_with_context()) + result_container["result"] = result + except Exception as e: + # Store the exception to re-raise in the calling thread + result_container["exception"] = e + finally: + # Ensure event loop is properly closed to prevent warnings + try: + # Cancel any remaining tasks + pending = asyncio.all_tasks(loop) + for task in pending: + task.cancel() + + # Run the loop briefly to handle cancellations + if pending: + loop.run_until_complete( + asyncio.gather(*pending, return_exceptions=True) + ) + except Exception: + pass # Ignore errors during cleanup + + # Close the event loop + loop.close() + + # Clear the event loop from the thread + asyncio.set_event_loop(None) + + # Submit to thread pool and wait for result + thread_pool = cls.get_thread_pool() + future = thread_pool.submit(run_in_thread) + future.result() # Wait for completion + + # Re-raise any exception that occurred in the thread + if result_container["exception"] is not None: + raise result_container["exception"] + + return result_container["result"] + + @classmethod + def create_sync_class(cls, async_class: Type, thread_pool_size=10) -> Type: + """ + Creates a new class with synchronous versions of all async methods. 
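A minimal sketch of what create_sync_class produces, assuming an illustrative AsyncGreeter class that is not part of the patch:

    import asyncio
    from comfy_api.internal.async_to_sync import AsyncToSyncConverter

    class AsyncGreeter:
        # Illustrative async class used only for this sketch.
        async def greet(self, name: str) -> str:
            await asyncio.sleep(0)
            return f"hello {name}"

    GreeterSync = AsyncToSyncConverter.create_sync_class(AsyncGreeter)
    print(GreeterSync().greet("world"))  # prints "hello world" without any await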
+ + Args: + async_class: The async class to convert + thread_pool_size: Size of thread pool to use + + Returns: + A new class with sync versions of all async methods + """ + sync_class_name = "ComfyAPISyncStub" + cls.get_thread_pool(thread_pool_size) + + # Create a proper class with docstrings and proper base classes + sync_class_dict = { + "__doc__": async_class.__doc__, + "__module__": async_class.__module__, + "__qualname__": sync_class_name, + "__orig_class__": async_class, # Store original class for typing references + } + + # Create __init__ method + def __init__(self, *args, **kwargs): + self._async_instance = async_class(*args, **kwargs) + + # Handle annotated class attributes (like execution: Execution) + # Get all annotations from the class hierarchy + all_annotations = {} + for base_class in reversed(inspect.getmro(async_class)): + if hasattr(base_class, "__annotations__"): + all_annotations.update(base_class.__annotations__) + + # For each annotated attribute, check if it needs to be created or wrapped + for attr_name, attr_type in all_annotations.items(): + if hasattr(self._async_instance, attr_name): + # Attribute exists on the instance + attr = getattr(self._async_instance, attr_name) + # Check if this attribute needs a sync wrapper + if hasattr(attr, "__class__"): + from comfy_api.internal.singleton import ProxiedSingleton + + if isinstance(attr, ProxiedSingleton): + # Create a sync version of this attribute + try: + sync_attr_class = cls.create_sync_class(attr.__class__) + # Create instance of the sync wrapper with the async instance + sync_attr = object.__new__(sync_attr_class) # type: ignore + sync_attr._async_instance = attr + setattr(self, attr_name, sync_attr) + except Exception: + # If we can't create a sync version, keep the original + setattr(self, attr_name, attr) + else: + # Not async, just copy the reference + setattr(self, attr_name, attr) + else: + # Attribute doesn't exist, but is annotated - create it + # This handles cases like execution: Execution + if isinstance(attr_type, type): + # Check if the type is defined as an inner class + if hasattr(async_class, attr_type.__name__): + inner_class = getattr(async_class, attr_type.__name__) + from comfy_api.internal.singleton import ProxiedSingleton + + # Create an instance of the inner class + try: + # For ProxiedSingleton classes, get or create the singleton instance + if issubclass(inner_class, ProxiedSingleton): + async_instance = inner_class.get_instance() + else: + async_instance = inner_class() + + # Create sync wrapper + sync_attr_class = cls.create_sync_class(inner_class) + sync_attr = object.__new__(sync_attr_class) # type: ignore + sync_attr._async_instance = async_instance + setattr(self, attr_name, sync_attr) + # Also set on the async instance for consistency + setattr(self._async_instance, attr_name, async_instance) + except Exception as e: + logging.warning( + f"Failed to create instance for {attr_name}: {e}" + ) + + # Handle other instance attributes that might not be annotated + for name, attr in inspect.getmembers(self._async_instance): + if name.startswith("_") or hasattr(self, name): + continue + + # If attribute is an instance of a class, and that class is defined in the original class + # we need to check if it needs a sync wrapper + if isinstance(attr, object) and not isinstance( + attr, (str, int, float, bool, list, dict, tuple) + ): + from comfy_api.internal.singleton import ProxiedSingleton + + if isinstance(attr, ProxiedSingleton): + # Create a sync version of this nested class + try: + 
sync_attr_class = cls.create_sync_class(attr.__class__) + # Create instance of the sync wrapper with the async instance + sync_attr = object.__new__(sync_attr_class) # type: ignore + sync_attr._async_instance = attr + setattr(self, name, sync_attr) + except Exception: + # If we can't create a sync version, keep the original + setattr(self, name, attr) + + sync_class_dict["__init__"] = __init__ + + # Process methods from the async class + for name, method in inspect.getmembers( + async_class, predicate=inspect.isfunction + ): + if name.startswith("_"): + continue + + # Extract the actual return type from a coroutine + if inspect.iscoroutinefunction(method): + # Create sync version of async method with proper signature + @functools.wraps(method) + def sync_method(self, *args, _method_name=name, **kwargs): + async_method = getattr(self._async_instance, _method_name) + return AsyncToSyncConverter.run_async_in_thread( + async_method, *args, **kwargs + ) + + # Add to the class dict + sync_class_dict[name] = sync_method + else: + # For regular methods, create a proxy method + @functools.wraps(method) + def proxy_method(self, *args, _method_name=name, **kwargs): + method = getattr(self._async_instance, _method_name) + return method(*args, **kwargs) + + # Add to the class dict + sync_class_dict[name] = proxy_method + + # Handle property access + for name, prop in inspect.getmembers( + async_class, lambda x: isinstance(x, property) + ): + + def make_property(name, prop_obj): + def getter(self): + value = getattr(self._async_instance, name) + if inspect.iscoroutinefunction(value): + + def sync_fn(*args, **kwargs): + return AsyncToSyncConverter.run_async_in_thread( + value, *args, **kwargs + ) + + return sync_fn + return value + + def setter(self, value): + setattr(self._async_instance, name, value) + + return property(getter, setter if prop_obj.fset else None) + + sync_class_dict[name] = make_property(name, prop) + + # Create the class + sync_class = type(sync_class_name, (object,), sync_class_dict) + + return sync_class + + @classmethod + def _format_type_annotation( + cls, annotation, type_tracker: Optional[TypeTracker] = None + ) -> str: + """Convert a type annotation to its string representation for stub files.""" + if ( + annotation is inspect.Parameter.empty + or annotation is inspect.Signature.empty + ): + return "Any" + + # Handle None type + if annotation is type(None): + return "None" + + # Track the type if we have a tracker + if type_tracker: + type_tracker.track_type(annotation) + + # Try using typing.get_origin/get_args for Python 3.8+ + try: + origin = get_origin(annotation) + args = get_args(annotation) + + if origin is not None: + # Track the origin type + if type_tracker: + type_tracker.track_type(origin) + + # Get the origin name + origin_name = getattr(origin, "__name__", str(origin)) + if "." 
in origin_name: + origin_name = origin_name.split(".")[-1] + + # Special handling for types.UnionType (Python 3.10+ pipe operator) + # Convert to old-style Union for compatibility + if str(origin) == "" or origin_name == "UnionType": + origin_name = "Union" + + # Format arguments recursively + if args: + formatted_args = [] + for arg in args: + # Track each type in the union + if type_tracker: + type_tracker.track_type(arg) + formatted_args.append(cls._format_type_annotation(arg, type_tracker)) + return f"{origin_name}[{', '.join(formatted_args)}]" + else: + return origin_name + except (AttributeError, TypeError): + # Fallback for older Python versions or non-generic types + pass + + # Handle generic types the old way for compatibility + if hasattr(annotation, "__origin__") and hasattr(annotation, "__args__"): + origin = annotation.__origin__ + origin_name = ( + origin.__name__ + if hasattr(origin, "__name__") + else str(origin).split("'")[1] + ) + + # Format each type argument + args = [] + for arg in annotation.__args__: + args.append(cls._format_type_annotation(arg, type_tracker)) + + return f"{origin_name}[{', '.join(args)}]" + + # Handle regular types with __name__ + if hasattr(annotation, "__name__"): + return annotation.__name__ + + # Handle special module types (like types from typing module) + if hasattr(annotation, "__module__") and hasattr(annotation, "__qualname__"): + # For types like typing.Literal, typing.TypedDict, etc. + return annotation.__qualname__ + + # Last resort: string conversion with cleanup + type_str = str(annotation) + + # Clean up common patterns more robustly + if type_str.startswith(""): + type_str = type_str[8:-2] # Remove "" + + # Remove module prefixes for common modules + for prefix in ["typing.", "builtins.", "types."]: + if type_str.startswith(prefix): + type_str = type_str[len(prefix) :] + + # Handle special cases + if type_str in ("_empty", "inspect._empty"): + return "None" + + # Fix NoneType (this should rarely be needed now) + if type_str == "NoneType": + return "None" + + return type_str + + @classmethod + def _extract_coroutine_return_type(cls, annotation): + """Extract the actual return type from a Coroutine annotation.""" + if hasattr(annotation, "__args__") and len(annotation.__args__) > 2: + # Coroutine[Any, Any, ReturnType] -> extract ReturnType + return annotation.__args__[2] + return annotation + + @classmethod + def _format_parameter_default(cls, default_value) -> str: + """Format a parameter's default value for stub files.""" + if default_value is inspect.Parameter.empty: + return "" + elif default_value is None: + return " = None" + elif isinstance(default_value, bool): + return f" = {default_value}" + elif default_value == {}: + return " = {}" + elif default_value == []: + return " = []" + else: + return f" = {default_value}" + + @classmethod + def _format_method_parameters( + cls, + sig: inspect.Signature, + skip_self: bool = True, + type_hints: Optional[dict] = None, + type_tracker: Optional[TypeTracker] = None, + ) -> str: + """Format method parameters for stub files.""" + params = [] + if type_hints is None: + type_hints = {} + + for i, (param_name, param) in enumerate(sig.parameters.items()): + if i == 0 and param_name == "self" and skip_self: + params.append("self") + else: + # Get type annotation from type hints if available, otherwise from signature + annotation = type_hints.get(param_name, param.annotation) + type_str = cls._format_type_annotation(annotation, type_tracker) + + # Get default value + default_str = 
cls._format_parameter_default(param.default) + + # Combine parameter parts + if annotation is inspect.Parameter.empty: + params.append(f"{param_name}: Any{default_str}") + else: + params.append(f"{param_name}: {type_str}{default_str}") + + return ", ".join(params) + + @classmethod + def _generate_method_signature( + cls, + method_name: str, + method, + is_async: bool = False, + type_tracker: Optional[TypeTracker] = None, + ) -> str: + """Generate a complete method signature for stub files.""" + sig = inspect.signature(method) + + # Try to get evaluated type hints to resolve string annotations + try: + from typing import get_type_hints + type_hints = get_type_hints(method) + except Exception: + # Fallback to empty dict if we can't get type hints + type_hints = {} + + # For async methods, extract the actual return type + return_annotation = type_hints.get('return', sig.return_annotation) + if is_async and inspect.iscoroutinefunction(method): + return_annotation = cls._extract_coroutine_return_type(return_annotation) + + # Format parameters with type hints + params_str = cls._format_method_parameters(sig, type_hints=type_hints, type_tracker=type_tracker) + + # Format return type + return_type = cls._format_type_annotation(return_annotation, type_tracker) + if return_annotation is inspect.Signature.empty: + return_type = "None" + + return f"def {method_name}({params_str}) -> {return_type}: ..." + + @classmethod + def _generate_imports( + cls, async_class: Type, type_tracker: TypeTracker + ) -> list[str]: + """Generate import statements for the stub file.""" + imports = [] + + # Add standard typing imports + imports.append( + "from typing import Any, Dict, List, Optional, Tuple, Union, Set, Sequence, cast, NamedTuple" + ) + + # Add imports from the original module + if async_class.__module__ != "builtins": + module = inspect.getmodule(async_class) + additional_types = [] + + if module: + # Check if module has __all__ defined + module_all = getattr(module, "__all__", None) + + for name, obj in sorted(inspect.getmembers(module)): + if isinstance(obj, type): + # Skip if __all__ is defined and this name isn't in it + # unless it's already been tracked as used in type annotations + if module_all is not None and name not in module_all: + # Check if this type was actually used in annotations + if name not in type_tracker.discovered_types: + continue + + # Check for NamedTuple + if issubclass(obj, tuple) and hasattr(obj, "_fields"): + additional_types.append(name) + # Mark as already imported + type_tracker.already_imported.add(name) + # Check for Enum + elif issubclass(obj, Enum) and name != "Enum": + additional_types.append(name) + # Mark as already imported + type_tracker.already_imported.add(name) + + if additional_types: + type_imports = ", ".join([async_class.__name__] + additional_types) + imports.append(f"from {async_class.__module__} import {type_imports}") + else: + imports.append( + f"from {async_class.__module__} import {async_class.__name__}" + ) + + # Add imports for all discovered types + # Pass the main module name to avoid duplicate imports + imports.extend( + type_tracker.get_imports(main_module_name=async_class.__module__) + ) + + # Add base module import if needed + if hasattr(inspect.getmodule(async_class), "__name__"): + module_name = inspect.getmodule(async_class).__name__ + if "." 
in module_name: + base_module = module_name.split(".")[0] + # Only add if not already importing from it + if not any(imp.startswith(f"from {base_module}") for imp in imports): + imports.append(f"import {base_module}") + + return imports + + @classmethod + def _get_class_attributes(cls, async_class: Type) -> list[tuple[str, Type]]: + """Extract class attributes that are classes themselves.""" + class_attributes = [] + + # Look for class attributes that are classes + for name, attr in sorted(inspect.getmembers(async_class)): + if isinstance(attr, type) and not name.startswith("_"): + class_attributes.append((name, attr)) + elif ( + hasattr(async_class, "__annotations__") + and name in async_class.__annotations__ + ): + annotation = async_class.__annotations__[name] + if isinstance(annotation, type): + class_attributes.append((name, annotation)) + + return class_attributes + + @classmethod + def _generate_inner_class_stub( + cls, + name: str, + attr: Type, + indent: str = " ", + type_tracker: Optional[TypeTracker] = None, + ) -> list[str]: + """Generate stub for an inner class.""" + stub_lines = [] + stub_lines.append(f"{indent}class {name}Sync:") + + # Add docstring if available + if hasattr(attr, "__doc__") and attr.__doc__: + stub_lines.extend( + cls._format_docstring_for_stub(attr.__doc__, f"{indent} ") + ) + + # Add __init__ if it exists + if hasattr(attr, "__init__"): + try: + init_method = getattr(attr, "__init__") + init_sig = inspect.signature(init_method) + + # Try to get type hints + try: + from typing import get_type_hints + init_hints = get_type_hints(init_method) + except Exception: + init_hints = {} + + # Format parameters + params_str = cls._format_method_parameters( + init_sig, type_hints=init_hints, type_tracker=type_tracker + ) + # Add __init__ docstring if available (before the method) + if hasattr(init_method, "__doc__") and init_method.__doc__: + stub_lines.extend( + cls._format_docstring_for_stub( + init_method.__doc__, f"{indent} " + ) + ) + stub_lines.append( + f"{indent} def __init__({params_str}) -> None: ..." + ) + except (ValueError, TypeError): + stub_lines.append( + f"{indent} def __init__(self, *args, **kwargs) -> None: ..." + ) + + # Add methods to the inner class + has_methods = False + for method_name, method in sorted( + inspect.getmembers(attr, predicate=inspect.isfunction) + ): + if method_name.startswith("_"): + continue + + has_methods = True + try: + # Add method docstring if available (before the method signature) + if method.__doc__: + stub_lines.extend( + cls._format_docstring_for_stub(method.__doc__, f"{indent} ") + ) + + method_sig = cls._generate_method_signature( + method_name, method, is_async=True, type_tracker=type_tracker + ) + stub_lines.append(f"{indent} {method_sig}") + except (ValueError, TypeError): + stub_lines.append( + f"{indent} def {method_name}(self, *args, **kwargs): ..." 
+ ) + + if not has_methods: + stub_lines.append(f"{indent} pass") + + return stub_lines + + @classmethod + def _format_docstring_for_stub( + cls, docstring: str, indent: str = " " + ) -> list[str]: + """Format a docstring for inclusion in a stub file with proper indentation.""" + if not docstring: + return [] + + # First, dedent the docstring to remove any existing indentation + dedented = textwrap.dedent(docstring).strip() + + # Split into lines + lines = dedented.split("\n") + + # Build the properly indented docstring + result = [] + result.append(f'{indent}"""') + + for line in lines: + if line.strip(): # Non-empty line + result.append(f"{indent}{line}") + else: # Empty line + result.append("") + + result.append(f'{indent}"""') + return result + + @classmethod + def _post_process_stub_content(cls, stub_content: list[str]) -> list[str]: + """Post-process stub content to fix any remaining issues.""" + processed = [] + + for line in stub_content: + # Skip processing imports + if line.startswith(("from ", "import ")): + processed.append(line) + continue + + # Fix method signatures missing return types + if ( + line.strip().startswith("def ") + and line.strip().endswith(": ...") + and ") -> " not in line + ): + # Add -> None for methods without return annotation + line = line.replace(": ...", " -> None: ...") + + processed.append(line) + + return processed + + @classmethod + def generate_stub_file(cls, async_class: Type, sync_class: Type) -> None: + """ + Generate a .pyi stub file for the sync class to help IDEs with type checking. + """ + try: + # Only generate stub if we can determine module path + if async_class.__module__ == "__main__": + return + + module = inspect.getmodule(async_class) + if not module: + return + + module_path = module.__file__ + if not module_path: + return + + # Create stub file path in a 'generated' subdirectory + module_dir = os.path.dirname(module_path) + stub_dir = os.path.join(module_dir, "generated") + + # Ensure the generated directory exists + os.makedirs(stub_dir, exist_ok=True) + + module_name = os.path.basename(module_path) + if module_name.endswith(".py"): + module_name = module_name[:-3] + + sync_stub_path = os.path.join(stub_dir, f"{sync_class.__name__}.pyi") + + # Create a type tracker for this stub generation + type_tracker = TypeTracker() + + stub_content = [] + + # We'll generate imports after processing all methods to capture all types + # Leave a placeholder for imports + imports_placeholder_index = len(stub_content) + stub_content.append("") # Will be replaced with imports later + + # Class definition + stub_content.append(f"class {sync_class.__name__}:") + + # Docstring + if async_class.__doc__: + stub_content.extend( + cls._format_docstring_for_stub(async_class.__doc__, " ") + ) + + # Generate __init__ + try: + init_method = async_class.__init__ + init_signature = inspect.signature(init_method) + + # Try to get type hints for __init__ + try: + from typing import get_type_hints + init_hints = get_type_hints(init_method) + except Exception: + init_hints = {} + + # Format parameters + params_str = cls._format_method_parameters( + init_signature, type_hints=init_hints, type_tracker=type_tracker + ) + # Add __init__ docstring if available (before the method) + if hasattr(init_method, "__doc__") and init_method.__doc__: + stub_content.extend( + cls._format_docstring_for_stub(init_method.__doc__, " ") + ) + stub_content.append(f" def __init__({params_str}) -> None: ...") + except (ValueError, TypeError): + stub_content.append( + " def __init__(self, 
*args, **kwargs) -> None: ..." + ) + + stub_content.append("") # Add newline after __init__ + + # Get class attributes + class_attributes = cls._get_class_attributes(async_class) + + # Generate inner classes + for name, attr in class_attributes: + inner_class_stub = cls._generate_inner_class_stub( + name, attr, type_tracker=type_tracker + ) + stub_content.extend(inner_class_stub) + stub_content.append("") # Add newline after the inner class + + # Add methods to the main class + processed_methods = set() # Keep track of methods we've processed + for name, method in sorted( + inspect.getmembers(async_class, predicate=inspect.isfunction) + ): + if name.startswith("_") or name in processed_methods: + continue + + processed_methods.add(name) + + try: + method_sig = cls._generate_method_signature( + name, method, is_async=True, type_tracker=type_tracker + ) + + # Add docstring if available (before the method signature for proper formatting) + if method.__doc__: + stub_content.extend( + cls._format_docstring_for_stub(method.__doc__, " ") + ) + + stub_content.append(f" {method_sig}") + + stub_content.append("") # Add newline after each method + + except (ValueError, TypeError): + # If we can't get the signature, just add a simple stub + stub_content.append(f" def {name}(self, *args, **kwargs): ...") + stub_content.append("") # Add newline + + # Add properties + for name, prop in sorted( + inspect.getmembers(async_class, lambda x: isinstance(x, property)) + ): + stub_content.append(" @property") + stub_content.append(f" def {name}(self) -> Any: ...") + if prop.fset: + stub_content.append(f" @{name}.setter") + stub_content.append( + f" def {name}(self, value: Any) -> None: ..." + ) + stub_content.append("") # Add newline after each property + + # Add placeholders for the nested class instances + # Check the actual attribute names from class annotations and attributes + attribute_mappings = {} + + # First check annotations for typed attributes (including from parent classes) + # Collect all annotations from the class hierarchy + all_annotations = {} + for base_class in reversed(inspect.getmro(async_class)): + if hasattr(base_class, "__annotations__"): + all_annotations.update(base_class.__annotations__) + + for attr_name, attr_type in sorted(all_annotations.items()): + for class_name, class_type in class_attributes: + # If the class type matches the annotated type + if ( + attr_type == class_type + or (hasattr(attr_type, "__name__") and attr_type.__name__ == class_name) + or (isinstance(attr_type, str) and attr_type == class_name) + ): + attribute_mappings[class_name] = attr_name + + # Remove the extra checking - annotations should be sufficient + + # Add the attribute declarations with proper names + for class_name, class_type in class_attributes: + # Check if there's a mapping from annotation + attr_name = attribute_mappings.get(class_name, class_name) + # Use the annotation name if it exists, even if the attribute doesn't exist yet + # This is because the attribute might be created at runtime + stub_content.append(f" {attr_name}: {class_name}Sync") + + stub_content.append("") # Add a final newline + + # Now generate imports with all discovered types + imports = cls._generate_imports(async_class, type_tracker) + + # Deduplicate imports while preserving order + seen = set() + unique_imports = [] + for imp in imports: + if imp not in seen: + seen.add(imp) + unique_imports.append(imp) + else: + logging.warning(f"Duplicate import detected: {imp}") + + # Replace the placeholder with actual imports + 
stub_content[imports_placeholder_index : imports_placeholder_index + 1] = ( + unique_imports + ) + + # Post-process stub content + stub_content = cls._post_process_stub_content(stub_content) + + # Write stub file + with open(sync_stub_path, "w") as f: + f.write("\n".join(stub_content)) + + logging.info(f"Generated stub file: {sync_stub_path}") + + except Exception as e: + # If stub generation fails, log the error but don't break the main functionality + logging.error( + f"Error generating stub file for {sync_class.__name__}: {str(e)}" + ) + import traceback + + logging.error(traceback.format_exc()) + + +def create_sync_class(async_class: Type, thread_pool_size=10) -> Type: + """ + Creates a sync version of an async class + + Args: + async_class: The async class to convert + thread_pool_size: Size of thread pool to use + + Returns: + A new class with sync versions of all async methods + """ + return AsyncToSyncConverter.create_sync_class(async_class, thread_pool_size) diff --git a/comfy_api/internal/singleton.py b/comfy_api/internal/singleton.py new file mode 100644 index 000000000..75f16f98e --- /dev/null +++ b/comfy_api/internal/singleton.py @@ -0,0 +1,33 @@ +from typing import Type, TypeVar + +class SingletonMetaclass(type): + T = TypeVar("T", bound="SingletonMetaclass") + _instances = {} + + def __call__(cls, *args, **kwargs): + if cls not in cls._instances: + cls._instances[cls] = super(SingletonMetaclass, cls).__call__( + *args, **kwargs + ) + return cls._instances[cls] + + def inject_instance(cls: Type[T], instance: T) -> None: + assert cls not in SingletonMetaclass._instances, ( + "Cannot inject instance after first instantiation" + ) + SingletonMetaclass._instances[cls] = instance + + def get_instance(cls: Type[T], *args, **kwargs) -> T: + """ + Gets the singleton instance of the class, creating it if it doesn't exist. + """ + if cls not in SingletonMetaclass._instances: + SingletonMetaclass._instances[cls] = super( + SingletonMetaclass, cls + ).__call__(*args, **kwargs) + return cls._instances[cls] + + +class ProxiedSingleton(object, metaclass=SingletonMetaclass): + def __init__(self): + super().__init__() diff --git a/comfy_api/latest/__init__.py b/comfy_api/latest/__init__.py new file mode 100644 index 000000000..e1f3a3655 --- /dev/null +++ b/comfy_api/latest/__init__.py @@ -0,0 +1,106 @@ +from __future__ import annotations + +from typing import Type, TYPE_CHECKING +from comfy_api.internal import ComfyAPIBase +from comfy_api.internal.singleton import ProxiedSingleton +from comfy_api.internal.async_to_sync import create_sync_class +from comfy_api.latest._input import ImageInput, AudioInput, MaskInput, LatentInput, VideoInput +from comfy_api.latest._input_impl import VideoFromFile, VideoFromComponents +from comfy_api.latest._util import VideoCodec, VideoContainer, VideoComponents +from comfy_execution.utils import get_executing_context +from comfy_execution.progress import get_progress_state, PreviewImageTuple +from PIL import Image +from comfy.cli_args import args +import numpy as np + + +class ComfyAPI_latest(ComfyAPIBase): + VERSION = "latest" + STABLE = False + + class Execution(ProxiedSingleton): + async def set_progress( + self, + value: float, + max_value: float, + node_id: str | None = None, + preview_image: Image.Image | ImageInput | None = None, + ignore_size_limit: bool = False, + ) -> None: + """ + Update the progress bar displayed in the ComfyUI interface. 
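A minimal sketch of driving this from synchronous code through the generated wrapper, assuming the progress registry is active for the current prompt; the node id is a placeholder:

    from comfy_api.latest import ComfyAPISync

    api = ComfyAPISync()
    # The sync stub blocks until the underlying async set_progress call completes.
    api.execution.set_progress(value=3, max_value=10, node_id="42")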
+ + This function allows custom nodes and API calls to report their progress + back to the user interface, providing visual feedback during long operations. + + Migration from previous API: comfy.utils.PROGRESS_BAR_HOOK + """ + executing_context = get_executing_context() + if node_id is None and executing_context is not None: + node_id = executing_context.node_id + if node_id is None: + raise ValueError("node_id must be provided if not in executing context") + + # Convert preview_image to PreviewImageTuple if needed + to_display: PreviewImageTuple | Image.Image | ImageInput | None = preview_image + if to_display is not None: + # First convert to PIL Image if needed + if isinstance(to_display, ImageInput): + # Convert ImageInput (torch.Tensor) to PIL Image + # Handle tensor shape [B, H, W, C] -> get first image if batch + tensor = to_display + if len(tensor.shape) == 4: + tensor = tensor[0] + + # Convert to numpy array and scale to 0-255 + image_np = (tensor.cpu().numpy() * 255).astype(np.uint8) + to_display = Image.fromarray(image_np) + + if isinstance(to_display, Image.Image): + # Detect image format from PIL Image + image_format = to_display.format if to_display.format else "JPEG" + # Use None for preview_size if ignore_size_limit is True + preview_size = None if ignore_size_limit else args.preview_size + to_display = (image_format, to_display, preview_size) + + get_progress_state().update_progress( + node_id=node_id, + value=value, + max_value=max_value, + image=to_display, + ) + + execution: Execution + +class Input: + Image = ImageInput + Audio = AudioInput + Mask = MaskInput + Latent = LatentInput + Video = VideoInput + +class InputImpl: + VideoFromFile = VideoFromFile + VideoFromComponents = VideoFromComponents + +class Types: + VideoCodec = VideoCodec + VideoContainer = VideoContainer + VideoComponents = VideoComponents + +ComfyAPI = ComfyAPI_latest + +# Create a synchronous version of the API +if TYPE_CHECKING: + import comfy_api.latest.generated.ComfyAPISyncStub # type: ignore + + ComfyAPISync: Type[comfy_api.latest.generated.ComfyAPISyncStub.ComfyAPISyncStub] +ComfyAPISync = create_sync_class(ComfyAPI_latest) + +__all__ = [ + "ComfyAPI", + "ComfyAPISync", + "Input", + "InputImpl", + "Types", +] diff --git a/comfy_api/latest/_input/__init__.py b/comfy_api/latest/_input/__init__.py new file mode 100644 index 000000000..14f0e72f4 --- /dev/null +++ b/comfy_api/latest/_input/__init__.py @@ -0,0 +1,10 @@ +from .basic_types import ImageInput, AudioInput, MaskInput, LatentInput +from .video_types import VideoInput + +__all__ = [ + "ImageInput", + "AudioInput", + "VideoInput", + "MaskInput", + "LatentInput", +] diff --git a/comfy_api/latest/_input/basic_types.py b/comfy_api/latest/_input/basic_types.py new file mode 100644 index 000000000..245c6cbb1 --- /dev/null +++ b/comfy_api/latest/_input/basic_types.py @@ -0,0 +1,42 @@ +import torch +from typing import TypedDict, List, Optional + +ImageInput = torch.Tensor +""" +An image in format [B, H, W, C] where B is the batch size, C is the number of channels, +""" + +MaskInput = torch.Tensor +""" +A mask in format [B, H, W] where B is the batch size +""" + +class AudioInput(TypedDict): + """ + TypedDict representing audio input. + """ + + waveform: torch.Tensor + """ + Tensor in the format [B, C, T] where B is the batch size, C is the number of channels, + """ + + sample_rate: int + +class LatentInput(TypedDict): + """ + TypedDict representing latent input. 
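A minimal sketch of constructing these input types with the documented tensor shapes; the sizes are placeholders:

    import torch
    from comfy_api.latest import Input

    audio = Input.Audio(waveform=torch.zeros(1, 2, 48000), sample_rate=48000)  # [B, C, T]
    latent = Input.Latent(samples=torch.zeros(1, 4, 64, 64), noise_mask=None, batch_index=None)  # [B, C, H, W]
    image: Input.Image = torch.zeros(1, 512, 512, 3)  # [B, H, W, C]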
+ """ + + samples: torch.Tensor + """ + Tensor in the format [B, C, H, W] where B is the batch size, C is the number of channels, + H is the height, and W is the width. + """ + + noise_mask: Optional[MaskInput] + """ + Optional noise mask tensor in the same format as samples. + """ + + batch_index: Optional[List[int]] diff --git a/comfy_api/latest/_input/video_types.py b/comfy_api/latest/_input/video_types.py new file mode 100644 index 000000000..5d95dc507 --- /dev/null +++ b/comfy_api/latest/_input/video_types.py @@ -0,0 +1,85 @@ +from __future__ import annotations +from abc import ABC, abstractmethod +from typing import Optional, Union +import io +import av +from comfy_api.util import VideoContainer, VideoCodec, VideoComponents + +class VideoInput(ABC): + """ + Abstract base class for video input types. + """ + + @abstractmethod + def get_components(self) -> VideoComponents: + """ + Abstract method to get the video components (images, audio, and frame rate). + + Returns: + VideoComponents containing images, audio, and frame rate + """ + pass + + @abstractmethod + def save_to( + self, + path: str, + format: VideoContainer = VideoContainer.AUTO, + codec: VideoCodec = VideoCodec.AUTO, + metadata: Optional[dict] = None + ): + """ + Abstract method to save the video input to a file. + """ + pass + + def get_stream_source(self) -> Union[str, io.BytesIO]: + """ + Get a streamable source for the video. This allows processing without + loading the entire video into memory. + + Returns: + Either a file path (str) or a BytesIO object that can be opened with av. + + Default implementation creates a BytesIO buffer, but subclasses should + override this for better performance when possible. + """ + buffer = io.BytesIO() + self.save_to(buffer) + buffer.seek(0) + return buffer + + # Provide a default implementation, but subclasses can provide optimized versions + # if possible. + def get_dimensions(self) -> tuple[int, int]: + """ + Returns the dimensions of the video input. + + Returns: + Tuple of (width, height) + """ + components = self.get_components() + return components.images.shape[2], components.images.shape[1] + + def get_duration(self) -> float: + """ + Returns the duration of the video in seconds. + + Returns: + Duration in seconds + """ + components = self.get_components() + frame_count = components.images.shape[0] + return float(frame_count / components.frame_rate) + + def get_container_format(self) -> str: + """ + Returns the container format of the video (e.g., 'mp4', 'mov', 'avi'). 
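Because the base class ships these default implementations, any concrete VideoInput can be inspected uniformly. A minimal sketch; the describe helper is illustrative:

    from comfy_api.latest import Input

    def describe(video: Input.Video) -> str:
        width, height = video.get_dimensions()
        return f"{width}x{height}, {video.get_duration():.2f}s, {video.get_container_format()}"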
+ + Returns: + Container format as string + """ + # Default implementation - subclasses should override for better performance + source = self.get_stream_source() + with av.open(source, mode="r") as container: + return container.format.name diff --git a/comfy_api/latest/_input_impl/__init__.py b/comfy_api/latest/_input_impl/__init__.py new file mode 100644 index 000000000..02901b8b9 --- /dev/null +++ b/comfy_api/latest/_input_impl/__init__.py @@ -0,0 +1,7 @@ +from .video_types import VideoFromFile, VideoFromComponents + +__all__ = [ + # Implementations + "VideoFromFile", + "VideoFromComponents", +] diff --git a/comfy_api/latest/_input_impl/video_types.py b/comfy_api/latest/_input_impl/video_types.py new file mode 100644 index 000000000..28de9651d --- /dev/null +++ b/comfy_api/latest/_input_impl/video_types.py @@ -0,0 +1,324 @@ +from __future__ import annotations +from av.container import InputContainer +from av.subtitles.stream import SubtitleStream +from fractions import Fraction +from typing import Optional +from comfy_api.latest._input import AudioInput, VideoInput +import av +import io +import json +import numpy as np +import torch +from comfy_api.latest._util import VideoContainer, VideoCodec, VideoComponents + + +def container_to_output_format(container_format: str | None) -> str | None: + """ + A container's `format` may be a comma-separated list of formats. + E.g., iso container's `format` may be `mov,mp4,m4a,3gp,3g2,mj2`. + However, writing to a file/stream with `av.open` requires a single format, + or `None` to auto-detect. + """ + if not container_format: + return None # Auto-detect + + if "," not in container_format: + return container_format + + formats = container_format.split(",") + return formats[0] + + +def get_open_write_kwargs( + dest: str | io.BytesIO, container_format: str, to_format: str | None +) -> dict: + """Get kwargs for writing a `VideoFromFile` to a file/stream with `av.open`""" + open_kwargs = { + "mode": "w", + # If isobmff, preserve custom metadata tags (workflow, prompt, extra_pnginfo) + "options": {"movflags": "use_metadata_tags"}, + } + + is_write_to_buffer = isinstance(dest, io.BytesIO) + if is_write_to_buffer: + # Set output format explicitly, since it cannot be inferred from file extension + if to_format == VideoContainer.AUTO: + to_format = container_format.lower() + elif isinstance(to_format, str): + to_format = to_format.lower() + open_kwargs["format"] = container_to_output_format(to_format) + + return open_kwargs + + +class VideoFromFile(VideoInput): + """ + Class representing video input from a file. + """ + + def __init__(self, file: str | io.BytesIO): + """ + Initialize the VideoFromFile object based off of either a path on disk or a BytesIO object + containing the file contents. + """ + self.__file = file + + def get_stream_source(self) -> str | io.BytesIO: + """ + Return the underlying file source for efficient streaming. + This avoids unnecessary memory copies when the source is already a file path. + """ + if isinstance(self.__file, io.BytesIO): + self.__file.seek(0) + return self.__file + + def get_dimensions(self) -> tuple[int, int]: + """ + Returns the dimensions of the video input. 
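A minimal sketch of the behaviour container_to_output_format is written for, using the iso container example from its docstring:

    from comfy_api.latest._input_impl.video_types import container_to_output_format

    assert container_to_output_format("mov,mp4,m4a,3gp,3g2,mj2") == "mov"  # first entry of a multi-format name
    assert container_to_output_format("matroska") == "matroska"            # single format passes through
    assert container_to_output_format(None) is None                        # falsy -> let av auto-detect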
+ + Returns: + Tuple of (width, height) + """ + if isinstance(self.__file, io.BytesIO): + self.__file.seek(0) # Reset the BytesIO object to the beginning + with av.open(self.__file, mode='r') as container: + for stream in container.streams: + if stream.type == 'video': + assert isinstance(stream, av.VideoStream) + return stream.width, stream.height + raise ValueError(f"No video stream found in file '{self.__file}'") + + def get_duration(self) -> float: + """ + Returns the duration of the video in seconds. + + Returns: + Duration in seconds + """ + if isinstance(self.__file, io.BytesIO): + self.__file.seek(0) + with av.open(self.__file, mode="r") as container: + if container.duration is not None: + return float(container.duration / av.time_base) + + # Fallback: calculate from frame count and frame rate + video_stream = next( + (s for s in container.streams if s.type == "video"), None + ) + if video_stream and video_stream.frames and video_stream.average_rate: + return float(video_stream.frames / video_stream.average_rate) + + # Last resort: decode frames to count them + if video_stream and video_stream.average_rate: + frame_count = 0 + container.seek(0) + for packet in container.demux(video_stream): + for _ in packet.decode(): + frame_count += 1 + if frame_count > 0: + return float(frame_count / video_stream.average_rate) + + raise ValueError(f"Could not determine duration for file '{self.__file}'") + + def get_container_format(self) -> str: + """ + Returns the container format of the video (e.g., 'mp4', 'mov', 'avi'). + + Returns: + Container format as string + """ + if isinstance(self.__file, io.BytesIO): + self.__file.seek(0) + with av.open(self.__file, mode='r') as container: + return container.format.name + + def get_components_internal(self, container: InputContainer) -> VideoComponents: + # Get video frames + frames = [] + for frame in container.decode(video=0): + img = frame.to_ndarray(format='rgb24') # shape: (H, W, 3) + img = torch.from_numpy(img) / 255.0 # shape: (H, W, 3) + frames.append(img) + + images = torch.stack(frames) if len(frames) > 0 else torch.zeros(0, 3, 0, 0) + + # Get frame rate + video_stream = next(s for s in container.streams if s.type == 'video') + frame_rate = Fraction(video_stream.average_rate) if video_stream and video_stream.average_rate else Fraction(1) + + # Get audio if available + audio = None + try: + container.seek(0) # Reset the container to the beginning + for stream in container.streams: + if stream.type != 'audio': + continue + assert isinstance(stream, av.AudioStream) + audio_frames = [] + for packet in container.demux(stream): + for frame in packet.decode(): + assert isinstance(frame, av.AudioFrame) + audio_frames.append(frame.to_ndarray()) # shape: (channels, samples) + if len(audio_frames) > 0: + audio_data = np.concatenate(audio_frames, axis=1) # shape: (channels, total_samples) + audio_tensor = torch.from_numpy(audio_data).unsqueeze(0) # shape: (1, channels, total_samples) + audio = AudioInput({ + "waveform": audio_tensor, + "sample_rate": int(stream.sample_rate) if stream.sample_rate else 1, + }) + except StopIteration: + pass # No audio stream + + metadata = container.metadata + return VideoComponents(images=images, audio=audio, frame_rate=frame_rate, metadata=metadata) + + def get_components(self) -> VideoComponents: + if isinstance(self.__file, io.BytesIO): + self.__file.seek(0) # Reset the BytesIO object to the beginning + with av.open(self.__file, mode='r') as container: + return self.get_components_internal(container) + raise 
ValueError(f"No video stream found in file '{self.__file}'") + + def save_to( + self, + path: str | io.BytesIO, + format: VideoContainer = VideoContainer.AUTO, + codec: VideoCodec = VideoCodec.AUTO, + metadata: Optional[dict] = None + ): + if isinstance(self.__file, io.BytesIO): + self.__file.seek(0) # Reset the BytesIO object to the beginning + with av.open(self.__file, mode='r') as container: + container_format = container.format.name + video_encoding = container.streams.video[0].codec.name if len(container.streams.video) > 0 else None + reuse_streams = True + if format != VideoContainer.AUTO and format not in container_format.split(","): + reuse_streams = False + if codec != VideoCodec.AUTO and codec != video_encoding and video_encoding is not None: + reuse_streams = False + + if not reuse_streams: + components = self.get_components_internal(container) + video = VideoFromComponents(components) + return video.save_to( + path, + format=format, + codec=codec, + metadata=metadata + ) + + streams = container.streams + + open_kwargs = get_open_write_kwargs(path, container_format, format) + with av.open(path, **open_kwargs) as output_container: + # Copy over the original metadata + for key, value in container.metadata.items(): + if metadata is None or key not in metadata: + output_container.metadata[key] = value + + # Add our new metadata + if metadata is not None: + for key, value in metadata.items(): + if isinstance(value, str): + output_container.metadata[key] = value + else: + output_container.metadata[key] = json.dumps(value) + + # Add streams to the new container + stream_map = {} + for stream in streams: + if isinstance(stream, (av.VideoStream, av.AudioStream, SubtitleStream)): + out_stream = output_container.add_stream_from_template(template=stream, opaque=True) + stream_map[stream] = out_stream + + # Write packets to the new container + for packet in container.demux(): + if packet.stream in stream_map and packet.dts is not None: + packet.stream = stream_map[packet.stream] + output_container.mux(packet) + +class VideoFromComponents(VideoInput): + """ + Class representing video input from tensors. 
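A minimal sketch of re-muxing a file-backed video while attaching metadata via save_to; the file names and metadata values are placeholders:

    from comfy_api.latest import InputImpl, Types

    video = InputImpl.VideoFromFile("input.mp4")
    video.save_to(
        "output.mp4",
        format=Types.VideoContainer.MP4,
        codec=Types.VideoCodec.H264,
        metadata={"workflow": {"nodes": []}},  # non-string values are stored as JSON
    )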
+ """ + + def __init__(self, components: VideoComponents): + self.__components = components + + def get_components(self) -> VideoComponents: + return VideoComponents( + images=self.__components.images, + audio=self.__components.audio, + frame_rate=self.__components.frame_rate + ) + + def save_to( + self, + path: str, + format: VideoContainer = VideoContainer.AUTO, + codec: VideoCodec = VideoCodec.AUTO, + metadata: Optional[dict] = None + ): + if format != VideoContainer.AUTO and format != VideoContainer.MP4: + raise ValueError("Only MP4 format is supported for now") + if codec != VideoCodec.AUTO and codec != VideoCodec.H264: + raise ValueError("Only H264 codec is supported for now") + with av.open(path, mode='w', options={'movflags': 'use_metadata_tags'}) as output: + # Add metadata before writing any streams + if metadata is not None: + for key, value in metadata.items(): + output.metadata[key] = json.dumps(value) + + frame_rate = Fraction(round(self.__components.frame_rate * 1000), 1000) + # Create a video stream + video_stream = output.add_stream('h264', rate=frame_rate) + video_stream.width = self.__components.images.shape[2] + video_stream.height = self.__components.images.shape[1] + video_stream.pix_fmt = 'yuv420p' + + # Create an audio stream + audio_sample_rate = 1 + audio_stream: Optional[av.AudioStream] = None + if self.__components.audio: + audio_sample_rate = int(self.__components.audio['sample_rate']) + audio_stream = output.add_stream('aac', rate=audio_sample_rate) + audio_stream.sample_rate = audio_sample_rate + audio_stream.format = 'fltp' + + # Encode video + for i, frame in enumerate(self.__components.images): + img = (frame * 255).clamp(0, 255).byte().cpu().numpy() # shape: (H, W, 3) + frame = av.VideoFrame.from_ndarray(img, format='rgb24') + frame = frame.reformat(format='yuv420p') # Convert to YUV420P as required by h264 + packet = video_stream.encode(frame) + output.mux(packet) + + # Flush video + packet = video_stream.encode(None) + output.mux(packet) + + if audio_stream and self.__components.audio: + # Encode audio + samples_per_frame = int(audio_sample_rate / frame_rate) + num_frames = self.__components.audio['waveform'].shape[2] // samples_per_frame + for i in range(num_frames): + start = i * samples_per_frame + end = start + samples_per_frame + # TODO(Feature) - Add support for stereo audio + chunk = ( + self.__components.audio["waveform"][0, 0, start:end] + .unsqueeze(0) + .contiguous() + .numpy() + ) + audio_frame = av.AudioFrame.from_ndarray(chunk, format='fltp', layout='mono') + audio_frame.sample_rate = audio_sample_rate + audio_frame.pts = i * samples_per_frame + for packet in audio_stream.encode(audio_frame): + output.mux(packet) + + # Flush audio + for packet in audio_stream.encode(None): + output.mux(packet) + + diff --git a/comfy_api/latest/_util/__init__.py b/comfy_api/latest/_util/__init__.py new file mode 100644 index 000000000..9019c46db --- /dev/null +++ b/comfy_api/latest/_util/__init__.py @@ -0,0 +1,8 @@ +from .video_types import VideoContainer, VideoCodec, VideoComponents + +__all__ = [ + # Utility Types + "VideoContainer", + "VideoCodec", + "VideoComponents", +] diff --git a/comfy_api/latest/_util/video_types.py b/comfy_api/latest/_util/video_types.py new file mode 100644 index 000000000..c3e3d8e3a --- /dev/null +++ b/comfy_api/latest/_util/video_types.py @@ -0,0 +1,52 @@ +from __future__ import annotations +from dataclasses import dataclass +from enum import Enum +from fractions import Fraction +from typing import Optional +from 
comfy_api.latest._input import ImageInput, AudioInput + +class VideoCodec(str, Enum): + AUTO = "auto" + H264 = "h264" + + @classmethod + def as_input(cls) -> list[str]: + """ + Returns a list of codec names that can be used as node input. + """ + return [member.value for member in cls] + +class VideoContainer(str, Enum): + AUTO = "auto" + MP4 = "mp4" + + @classmethod + def as_input(cls) -> list[str]: + """ + Returns a list of container names that can be used as node input. + """ + return [member.value for member in cls] + + @classmethod + def get_extension(cls, value) -> str: + """ + Returns the file extension for the container. + """ + if isinstance(value, str): + value = cls(value) + if value == VideoContainer.MP4 or value == VideoContainer.AUTO: + return "mp4" + return "" + +@dataclass +class VideoComponents: + """ + Dataclass representing the components of a video. + """ + + images: ImageInput + frame_rate: Fraction + audio: Optional[AudioInput] = None + metadata: Optional[dict] = None + + diff --git a/comfy_api/latest/generated/ComfyAPISyncStub.pyi b/comfy_api/latest/generated/ComfyAPISyncStub.pyi new file mode 100644 index 000000000..525c074dd --- /dev/null +++ b/comfy_api/latest/generated/ComfyAPISyncStub.pyi @@ -0,0 +1,20 @@ +from typing import Any, Dict, List, Optional, Tuple, Union, Set, Sequence, cast, NamedTuple +from comfy_api.latest import ComfyAPI_latest +from PIL.Image import Image +from torch import Tensor +class ComfyAPISyncStub: + def __init__(self) -> None: ... + + class ExecutionSync: + def __init__(self) -> None: ... + """ + Update the progress bar displayed in the ComfyUI interface. + + This function allows custom nodes and API calls to report their progress + back to the user interface, providing visual feedback during long operations. + + Migration from previous API: comfy.utils.PROGRESS_BAR_HOOK + """ + def set_progress(self, value: float, max_value: float, node_id: Union[str, None] = None, preview_image: Union[Image, Tensor, None] = None, ignore_size_limit: bool = False) -> None: ... + + execution: ExecutionSync diff --git a/comfy_api/util.py b/comfy_api/util.py new file mode 100644 index 000000000..1aa9606d2 --- /dev/null +++ b/comfy_api/util.py @@ -0,0 +1,8 @@ +# This file only exists for backwards compatibility. +from comfy_api.latest._util import VideoCodec, VideoContainer, VideoComponents + +__all__ = [ + "VideoCodec", + "VideoContainer", + "VideoComponents", +] diff --git a/comfy_api/util/__init__.py b/comfy_api/util/__init__.py index 9019c46db..4c8a89d1e 100644 --- a/comfy_api/util/__init__.py +++ b/comfy_api/util/__init__.py @@ -1,7 +1,7 @@ -from .video_types import VideoContainer, VideoCodec, VideoComponents +# This file only exists for backwards compatibility. +from comfy_api.latest._util import VideoContainer, VideoCodec, VideoComponents __all__ = [ - # Utility Types "VideoContainer", "VideoCodec", "VideoComponents", diff --git a/comfy_api/util/video_types.py b/comfy_api/util/video_types.py index d09663db9..68c780d64 100644 --- a/comfy_api/util/video_types.py +++ b/comfy_api/util/video_types.py @@ -1,51 +1,12 @@ -from __future__ import annotations -from dataclasses import dataclass -from enum import Enum -from fractions import Fraction -from typing import Optional -from comfy_api.input import ImageInput, AudioInput - -class VideoCodec(str, Enum): - AUTO = "auto" - H264 = "h264" - - @classmethod - def as_input(cls) -> list[str]: - """ - Returns a list of codec names that can be used as node input. 
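A minimal sketch of how the enums feed node input definitions and file naming:

    from comfy_api.latest import Types

    print(Types.VideoContainer.as_input())             # ['auto', 'mp4']
    print(Types.VideoCodec.as_input())                 # ['auto', 'h264']
    print(Types.VideoContainer.get_extension("auto"))  # 'mp4'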
- """ - return [member.value for member in cls] - -class VideoContainer(str, Enum): - AUTO = "auto" - MP4 = "mp4" - - @classmethod - def as_input(cls) -> list[str]: - """ - Returns a list of container names that can be used as node input. - """ - return [member.value for member in cls] - - @classmethod - def get_extension(cls, value) -> str: - """ - Returns the file extension for the container. - """ - if isinstance(value, str): - value = cls(value) - if value == VideoContainer.MP4 or value == VideoContainer.AUTO: - return "mp4" - return "" - -@dataclass -class VideoComponents: - """ - Dataclass representing the components of a video. - """ - - images: ImageInput - frame_rate: Fraction - audio: Optional[AudioInput] = None - metadata: Optional[dict] = None +# This file only exists for backwards compatibility. +from comfy_api.latest._util.video_types import ( + VideoContainer, + VideoCodec, + VideoComponents, +) +__all__ = [ + "VideoContainer", + "VideoCodec", + "VideoComponents", +] diff --git a/comfy_api/v0_0_1/__init__.py b/comfy_api/v0_0_1/__init__.py new file mode 100644 index 000000000..93608771d --- /dev/null +++ b/comfy_api/v0_0_1/__init__.py @@ -0,0 +1,42 @@ +from comfy_api.v0_0_2 import ( + ComfyAPIAdapter_v0_0_2, + Input as Input_v0_0_2, + InputImpl as InputImpl_v0_0_2, + Types as Types_v0_0_2, +) +from typing import Type, TYPE_CHECKING +from comfy_api.internal.async_to_sync import create_sync_class + + +# This version only exists to serve as a template for future version adapters. +# There is no reason anyone should ever use it. +class ComfyAPIAdapter_v0_0_1(ComfyAPIAdapter_v0_0_2): + VERSION = "0.0.1" + STABLE = True + +class Input(Input_v0_0_2): + pass + +class InputImpl(InputImpl_v0_0_2): + pass + +class Types(Types_v0_0_2): + pass + +ComfyAPI = ComfyAPIAdapter_v0_0_1 + +# Create a synchronous version of the API +if TYPE_CHECKING: + from comfy_api.v0_0_1.generated.ComfyAPISyncStub import ComfyAPISyncStub # type: ignore + + ComfyAPISync: Type[ComfyAPISyncStub] + +ComfyAPISync = create_sync_class(ComfyAPIAdapter_v0_0_1) + +__all__ = [ + "ComfyAPI", + "ComfyAPISync", + "Input", + "InputImpl", + "Types", +] diff --git a/comfy_api/v0_0_1/generated/ComfyAPISyncStub.pyi b/comfy_api/v0_0_1/generated/ComfyAPISyncStub.pyi new file mode 100644 index 000000000..270030324 --- /dev/null +++ b/comfy_api/v0_0_1/generated/ComfyAPISyncStub.pyi @@ -0,0 +1,20 @@ +from typing import Any, Dict, List, Optional, Tuple, Union, Set, Sequence, cast, NamedTuple +from comfy_api.v0_0_1 import ComfyAPIAdapter_v0_0_1 +from PIL.Image import Image +from torch import Tensor +class ComfyAPISyncStub: + def __init__(self) -> None: ... + + class ExecutionSync: + def __init__(self) -> None: ... + """ + Update the progress bar displayed in the ComfyUI interface. + + This function allows custom nodes and API calls to report their progress + back to the user interface, providing visual feedback during long operations. + + Migration from previous API: comfy.utils.PROGRESS_BAR_HOOK + """ + def set_progress(self, value: float, max_value: float, node_id: Union[str, None] = None, preview_image: Union[Image, Tensor, None] = None, ignore_size_limit: bool = False) -> None: ... 
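A minimal sketch of what a future adapter following this template could look like; the class name and version number are hypothetical:

    from comfy_api.v0_0_2 import ComfyAPIAdapter_v0_0_2

    class ComfyAPIAdapter_v0_0_3(ComfyAPIAdapter_v0_0_2):
        # Hypothetical future adapter: pin VERSION and override any methods whose
        # behaviour must stay stable for callers targeting this version.
        VERSION = "0.0.3"
        STABLE = False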
+ + execution: ExecutionSync diff --git a/comfy_api/v0_0_2/__init__.py b/comfy_api/v0_0_2/__init__.py new file mode 100644 index 000000000..ea83833fb --- /dev/null +++ b/comfy_api/v0_0_2/__init__.py @@ -0,0 +1,43 @@ +from comfy_api.latest import ( + ComfyAPI_latest, + Input as Input_latest, + InputImpl as InputImpl_latest, + Types as Types_latest, +) +from typing import Type, TYPE_CHECKING +from comfy_api.internal.async_to_sync import create_sync_class + + +class ComfyAPIAdapter_v0_0_2(ComfyAPI_latest): + VERSION = "0.0.2" + STABLE = False + + +class Input(Input_latest): + pass + + +class InputImpl(InputImpl_latest): + pass + + +class Types(Types_latest): + pass + + +ComfyAPI = ComfyAPIAdapter_v0_0_2 + +# Create a synchronous version of the API +if TYPE_CHECKING: + from comfy_api.v0_0_2.generated.ComfyAPISyncStub import ComfyAPISyncStub # type: ignore + + ComfyAPISync: Type[ComfyAPISyncStub] +ComfyAPISync = create_sync_class(ComfyAPIAdapter_v0_0_2) + +__all__ = [ + "ComfyAPI", + "ComfyAPISync", + "Input", + "InputImpl", + "Types", +] diff --git a/comfy_api/v0_0_2/generated/ComfyAPISyncStub.pyi b/comfy_api/v0_0_2/generated/ComfyAPISyncStub.pyi new file mode 100644 index 000000000..7fcec685e --- /dev/null +++ b/comfy_api/v0_0_2/generated/ComfyAPISyncStub.pyi @@ -0,0 +1,20 @@ +from typing import Any, Dict, List, Optional, Tuple, Union, Set, Sequence, cast, NamedTuple +from comfy_api.v0_0_2 import ComfyAPIAdapter_v0_0_2 +from PIL.Image import Image +from torch import Tensor +class ComfyAPISyncStub: + def __init__(self) -> None: ... + + class ExecutionSync: + def __init__(self) -> None: ... + """ + Update the progress bar displayed in the ComfyUI interface. + + This function allows custom nodes and API calls to report their progress + back to the user interface, providing visual feedback during long operations. + + Migration from previous API: comfy.utils.PROGRESS_BAR_HOOK + """ + def set_progress(self, value: float, max_value: float, node_id: Union[str, None] = None, preview_image: Union[Image, Tensor, None] = None, ignore_size_limit: bool = False) -> None: ... 
+ + execution: ExecutionSync diff --git a/comfy_api/version_list.py b/comfy_api/version_list.py new file mode 100644 index 000000000..7cb1871d5 --- /dev/null +++ b/comfy_api/version_list.py @@ -0,0 +1,12 @@ +from comfy_api.latest import ComfyAPI_latest +from comfy_api.v0_0_2 import ComfyAPIAdapter_v0_0_2 +from comfy_api.v0_0_1 import ComfyAPIAdapter_v0_0_1 +from comfy_api.internal import ComfyAPIBase +from typing import List, Type + +supported_versions: List[Type[ComfyAPIBase]] = [ + ComfyAPI_latest, + ComfyAPIAdapter_v0_0_2, + ComfyAPIAdapter_v0_0_1, +] + diff --git a/comfy_api_nodes/apis/request_logger.py b/comfy_api_nodes/apis/request_logger.py index 93517ede9..42901e141 100644 --- a/comfy_api_nodes/apis/request_logger.py +++ b/comfy_api_nodes/apis/request_logger.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import os import datetime import json diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index 5935ab2bb..af33279d5 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -2,6 +2,8 @@ API Nodes for Gemini Multimodal LLM Usage via Remote API See: https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference """ +from __future__ import annotations + import os from enum import Enum diff --git a/comfy_execution/progress.py b/comfy_execution/progress.py index 73dba3f75..e8f5ede1e 100644 --- a/comfy_execution/progress.py +++ b/comfy_execution/progress.py @@ -1,4 +1,6 @@ -from typing import TypedDict, Dict, Optional +from __future__ import annotations + +from typing import TypedDict, Dict, Optional, Tuple from typing_extensions import override from PIL import Image from enum import Enum @@ -10,6 +12,7 @@ if TYPE_CHECKING: from protocol import BinaryEventTypes from comfy_api import feature_flags +PreviewImageTuple = Tuple[str, Image.Image, Optional[int]] class NodeState(Enum): Pending = "pending" @@ -52,7 +55,7 @@ class ProgressHandler(ABC): max_value: float, state: NodeProgressState, prompt_id: str, - image: Optional[Image.Image] = None, + image: PreviewImageTuple | None = None, ): """Called when a node's progress is updated""" pass @@ -103,7 +106,7 @@ class CLIProgressHandler(ProgressHandler): max_value: float, state: NodeProgressState, prompt_id: str, - image: Optional[Image.Image] = None, + image: PreviewImageTuple | None = None, ): # Handle case where start_handler wasn't called if node_id not in self.progress_bars: @@ -196,7 +199,7 @@ class WebUIProgressHandler(ProgressHandler): max_value: float, state: NodeProgressState, prompt_id: str, - image: Optional[Image.Image] = None, + image: PreviewImageTuple | None = None, ): # Send progress state of all nodes if self.registry: @@ -231,7 +234,6 @@ class WebUIProgressHandler(ProgressHandler): if self.registry: self._send_progress_state(prompt_id, self.registry.nodes) - class ProgressRegistry: """ Registry that maintains node progress state and notifies registered handlers. 
@@ -285,7 +287,7 @@ class ProgressRegistry: handler.start_handler(node_id, entry, self.prompt_id) def update_progress( - self, node_id: str, value: float, max_value: float, image: Optional[Image.Image] + self, node_id: str, value: float, max_value: float, image: PreviewImageTuple | None = None ) -> None: """Update progress for a node""" entry = self.ensure_entry(node_id) @@ -317,7 +319,7 @@ class ProgressRegistry: handler.reset() # Global registry instance -global_progress_registry: ProgressRegistry = None +global_progress_registry: ProgressRegistry | None = None def reset_progress_state(prompt_id: str, dynprompt: "DynamicPrompt") -> None: global global_progress_registry diff --git a/comfy_extras/nodes_video.py b/comfy_extras/nodes_video.py index 61f7171b2..969f888b9 100644 --- a/comfy_extras/nodes_video.py +++ b/comfy_extras/nodes_video.py @@ -8,9 +8,7 @@ import json from typing import Optional, Literal from fractions import Fraction from comfy.comfy_types import IO, FileLocator, ComfyNodeABC -from comfy_api.input import ImageInput, AudioInput, VideoInput -from comfy_api.util import VideoContainer, VideoCodec, VideoComponents -from comfy_api.input_impl import VideoFromFile, VideoFromComponents +from comfy_api.latest import Input, InputImpl, Types from comfy.cli_args import args class SaveWEBM: @@ -91,8 +89,8 @@ class SaveVideo(ComfyNodeABC): "required": { "video": (IO.VIDEO, {"tooltip": "The video to save."}), "filename_prefix": ("STRING", {"default": "video/ComfyUI", "tooltip": "The prefix for the file to save. This may include formatting information such as %date:yyyy-MM-dd% or %Empty Latent Image.width% to include values from nodes."}), - "format": (VideoContainer.as_input(), {"default": "auto", "tooltip": "The format to save the video as."}), - "codec": (VideoCodec.as_input(), {"default": "auto", "tooltip": "The codec to use for the video."}), + "format": (Types.VideoContainer.as_input(), {"default": "auto", "tooltip": "The format to save the video as."}), + "codec": (Types.VideoCodec.as_input(), {"default": "auto", "tooltip": "The codec to use for the video."}), }, "hidden": { "prompt": "PROMPT", @@ -108,7 +106,7 @@ class SaveVideo(ComfyNodeABC): CATEGORY = "image/video" DESCRIPTION = "Saves the input images to your ComfyUI output directory." - def save_video(self, video: VideoInput, filename_prefix, format, codec, prompt=None, extra_pnginfo=None): + def save_video(self, video: Input.Video, filename_prefix, format, codec, prompt=None, extra_pnginfo=None): filename_prefix += self.prefix_append width, height = video.get_dimensions() full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path( @@ -127,7 +125,7 @@ class SaveVideo(ComfyNodeABC): metadata["prompt"] = prompt if len(metadata) > 0: saved_metadata = metadata - file = f"{filename}_{counter:05}_.{VideoContainer.get_extension(format)}" + file = f"{filename}_{counter:05}_.{Types.VideoContainer.get_extension(format)}" video.save_to( os.path.join(full_output_folder, file), format=format, @@ -163,9 +161,9 @@ class CreateVideo(ComfyNodeABC): CATEGORY = "image/video" DESCRIPTION = "Create a video from images." 
- def create_video(self, images: ImageInput, fps: float, audio: Optional[AudioInput] = None): - return (VideoFromComponents( - VideoComponents( + def create_video(self, images: Input.Image, fps: float, audio: Optional[Input.Audio] = None): + return (InputImpl.VideoFromComponents( + Types.VideoComponents( images=images, audio=audio, frame_rate=Fraction(fps), @@ -187,7 +185,7 @@ class GetVideoComponents(ComfyNodeABC): CATEGORY = "image/video" DESCRIPTION = "Extracts all components from a video: frames, audio, and framerate." - def get_components(self, video: VideoInput): + def get_components(self, video: Input.Video): components = video.get_components() return (components.images, components.audio, float(components.frame_rate)) @@ -208,7 +206,7 @@ class LoadVideo(ComfyNodeABC): FUNCTION = "load_video" def load_video(self, file): video_path = folder_paths.get_annotated_filepath(file) - return (VideoFromFile(video_path),) + return (InputImpl.VideoFromFile(video_path),) @classmethod def IS_CHANGED(cls, file): @@ -239,3 +237,4 @@ NODE_DISPLAY_NAME_MAPPINGS = { "GetVideoComponents": "Get Video Components", "LoadVideo": "Load Video", } + diff --git a/main.py b/main.py index e8ca8152a..9b2a33011 100644 --- a/main.py +++ b/main.py @@ -313,10 +313,10 @@ def start_comfyui(asyncio_loop=None): prompt_server = server.PromptServer(asyncio_loop) hook_breaker_ac10a0.save_functions() - nodes.init_extra_nodes( + asyncio_loop.run_until_complete(nodes.init_extra_nodes( init_custom_nodes=(not args.disable_all_custom_nodes) or len(args.whitelist_custom_nodes) > 0, init_api_nodes=not args.disable_api_nodes - ) + )) hook_breaker_ac10a0.restore_functions() cuda_malloc_warning() diff --git a/nodes.py b/nodes.py index 231d4d4de..54e530388 100644 --- a/nodes.py +++ b/nodes.py @@ -1,6 +1,7 @@ from __future__ import annotations import torch + import os import sys import json @@ -26,6 +27,8 @@ import comfy.sd import comfy.utils import comfy.controlnet from comfy.comfy_types import IO, ComfyNodeABC, InputTypeDict, FileLocator +from comfy_api.internal import register_versions, ComfyAPIWithVersion +from comfy_api.version_list import supported_versions import comfy.clip_vision @@ -2101,7 +2104,7 @@ def get_module_name(module_path: str) -> str: return base_path -def load_custom_node(module_path: str, ignore=set(), module_parent="custom_nodes") -> bool: +async def load_custom_node(module_path: str, ignore=set(), module_parent="custom_nodes") -> bool: module_name = get_module_name(module_path) if os.path.isfile(module_path): sp = os.path.splitext(module_path) @@ -2165,7 +2168,7 @@ def load_custom_node(module_path: str, ignore=set(), module_parent="custom_nodes logging.warning(f"Cannot import {module_path} module for custom nodes: {e}") return False -def init_external_custom_nodes(): +async def init_external_custom_nodes(): """ Initializes the external custom nodes. 
@@ -2191,7 +2194,7 @@ def init_external_custom_nodes(): logging.info(f"Skipping {possible_module} due to disable_all_custom_nodes and whitelist_custom_nodes") continue time_before = time.perf_counter() - success = load_custom_node(module_path, base_node_names, module_parent="custom_nodes") + success = await load_custom_node(module_path, base_node_names, module_parent="custom_nodes") node_import_times.append((time.perf_counter() - time_before, module_path, success)) if len(node_import_times) > 0: @@ -2204,7 +2207,7 @@ def init_external_custom_nodes(): logging.info("{:6.1f} seconds{}: {}".format(n[0], import_message, n[1])) logging.info("") -def init_builtin_extra_nodes(): +async def init_builtin_extra_nodes(): """ Initializes the built-in extra nodes in ComfyUI. @@ -2288,13 +2291,13 @@ def init_builtin_extra_nodes(): import_failed = [] for node_file in extras_files: - if not load_custom_node(os.path.join(extras_dir, node_file), module_parent="comfy_extras"): + if not await load_custom_node(os.path.join(extras_dir, node_file), module_parent="comfy_extras"): import_failed.append(node_file) return import_failed -def init_builtin_api_nodes(): +async def init_builtin_api_nodes(): api_nodes_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "comfy_api_nodes") api_nodes_files = [ "nodes_ideogram.py", @@ -2315,26 +2318,35 @@ def init_builtin_api_nodes(): "nodes_gemini.py", ] - if not load_custom_node(os.path.join(api_nodes_dir, "canary.py"), module_parent="comfy_api_nodes"): + if not await load_custom_node(os.path.join(api_nodes_dir, "canary.py"), module_parent="comfy_api_nodes"): return api_nodes_files import_failed = [] for node_file in api_nodes_files: - if not load_custom_node(os.path.join(api_nodes_dir, node_file), module_parent="comfy_api_nodes"): + if not await load_custom_node(os.path.join(api_nodes_dir, node_file), module_parent="comfy_api_nodes"): import_failed.append(node_file) return import_failed +async def init_public_apis(): + register_versions([ + ComfyAPIWithVersion( + version=getattr(v, "VERSION"), + api_class=v + ) for v in supported_versions + ]) -def init_extra_nodes(init_custom_nodes=True, init_api_nodes=True): - import_failed = init_builtin_extra_nodes() +async def init_extra_nodes(init_custom_nodes=True, init_api_nodes=True): + await init_public_apis() + + import_failed = await init_builtin_extra_nodes() import_failed_api = [] if init_api_nodes: - import_failed_api = init_builtin_api_nodes() + import_failed_api = await init_builtin_api_nodes() if init_custom_nodes: - init_external_custom_nodes() + await init_external_custom_nodes() else: logging.info("Skipping loading of custom nodes") diff --git a/pyproject.toml b/pyproject.toml index 723c93069..244fdd232 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,4 +21,4 @@ lint.select = [ # See all rules here: https://docs.astral.sh/ruff/rules/#pyflakes-f "F", ] -exclude = ["*.ipynb"] +exclude = ["*.ipynb", "**/generated/*.pyi"] diff --git a/tests/inference/testing_nodes/testing-pack/__init__.py b/tests/inference/testing_nodes/testing-pack/__init__.py index 20f9533c7..3d5ac8a94 100644 --- a/tests/inference/testing_nodes/testing-pack/__init__.py +++ b/tests/inference/testing_nodes/testing-pack/__init__.py @@ -4,6 +4,7 @@ from .util import UTILITY_NODE_CLASS_MAPPINGS, UTILITY_NODE_DISPLAY_NAME_MAPPING from .conditions import CONDITION_NODE_CLASS_MAPPINGS, CONDITION_NODE_DISPLAY_NAME_MAPPINGS from .stubs import TEST_STUB_NODE_CLASS_MAPPINGS, TEST_STUB_NODE_DISPLAY_NAME_MAPPINGS from .async_test_nodes import 
ASYNC_TEST_NODE_CLASS_MAPPINGS, ASYNC_TEST_NODE_DISPLAY_NAME_MAPPINGS +from .api_test_nodes import API_TEST_NODE_CLASS_MAPPINGS, API_TEST_NODE_DISPLAY_NAME_MAPPINGS # NODE_CLASS_MAPPINGS = GENERAL_NODE_CLASS_MAPPINGS.update(COMPONENT_NODE_CLASS_MAPPINGS) # NODE_DISPLAY_NAME_MAPPINGS = GENERAL_NODE_DISPLAY_NAME_MAPPINGS.update(COMPONENT_NODE_DISPLAY_NAME_MAPPINGS) @@ -15,6 +16,7 @@ NODE_CLASS_MAPPINGS.update(UTILITY_NODE_CLASS_MAPPINGS) NODE_CLASS_MAPPINGS.update(CONDITION_NODE_CLASS_MAPPINGS) NODE_CLASS_MAPPINGS.update(TEST_STUB_NODE_CLASS_MAPPINGS) NODE_CLASS_MAPPINGS.update(ASYNC_TEST_NODE_CLASS_MAPPINGS) +NODE_CLASS_MAPPINGS.update(API_TEST_NODE_CLASS_MAPPINGS) NODE_DISPLAY_NAME_MAPPINGS = {} NODE_DISPLAY_NAME_MAPPINGS.update(TEST_NODE_DISPLAY_NAME_MAPPINGS) @@ -23,4 +25,4 @@ NODE_DISPLAY_NAME_MAPPINGS.update(UTILITY_NODE_DISPLAY_NAME_MAPPINGS) NODE_DISPLAY_NAME_MAPPINGS.update(CONDITION_NODE_DISPLAY_NAME_MAPPINGS) NODE_DISPLAY_NAME_MAPPINGS.update(TEST_STUB_NODE_DISPLAY_NAME_MAPPINGS) NODE_DISPLAY_NAME_MAPPINGS.update(ASYNC_TEST_NODE_DISPLAY_NAME_MAPPINGS) - +NODE_DISPLAY_NAME_MAPPINGS.update(API_TEST_NODE_DISPLAY_NAME_MAPPINGS) diff --git a/tests/inference/testing_nodes/testing-pack/api_test_nodes.py b/tests/inference/testing_nodes/testing-pack/api_test_nodes.py new file mode 100644 index 000000000..b2eaae05e --- /dev/null +++ b/tests/inference/testing_nodes/testing-pack/api_test_nodes.py @@ -0,0 +1,78 @@ +import asyncio +import time +from comfy.comfy_types import IO, ComfyNodeABC, InputTypeDict +from comfy_api.v0_0_2 import ComfyAPI, ComfyAPISync + +api = ComfyAPI() +api_sync = ComfyAPISync() + + +class TestAsyncProgressUpdate(ComfyNodeABC): + """Test node with async VALIDATE_INPUTS.""" + + @classmethod + def INPUT_TYPES(cls) -> InputTypeDict: + return { + "required": { + "value": (IO.ANY, {}), + "sleep_seconds": (IO.FLOAT, {"default": 1.0}), + }, + } + + RETURN_TYPES = (IO.ANY,) + FUNCTION = "execute" + CATEGORY = "_for_testing/async" + + async def execute(self, value, sleep_seconds): + start = time.time() + expiration = start + sleep_seconds + now = start + while now < expiration: + now = time.time() + await api.execution.set_progress( + value=(now - start) / sleep_seconds, + max_value=1.0, + ) + await asyncio.sleep(0.01) + return (value,) + + +class TestSyncProgressUpdate(ComfyNodeABC): + """Test node with async VALIDATE_INPUTS.""" + + @classmethod + def INPUT_TYPES(cls) -> InputTypeDict: + return { + "required": { + "value": (IO.ANY, {}), + "sleep_seconds": (IO.FLOAT, {"default": 1.0}), + }, + } + + RETURN_TYPES = (IO.ANY,) + FUNCTION = "execute" + CATEGORY = "_for_testing/async" + + def execute(self, value, sleep_seconds): + start = time.time() + expiration = start + sleep_seconds + now = start + while now < expiration: + now = time.time() + api_sync.execution.set_progress( + value=(now - start) / sleep_seconds, + max_value=1.0, + ) + time.sleep(0.01) + return (value,) + + +API_TEST_NODE_CLASS_MAPPINGS = { + "TestAsyncProgressUpdate": TestAsyncProgressUpdate, + "TestSyncProgressUpdate": TestSyncProgressUpdate, +} + +API_TEST_NODE_DISPLAY_NAME_MAPPINGS = { + "TestAsyncProgressUpdate": "Async Progress Update Test Node", + "TestSyncProgressUpdate": "Sync Progress Update Test Node", +} From d2aaef029cfb60611f2c9aad1b5dfb7070f9c162 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Wed, 30 Jul 2025 10:50:49 +0800 Subject: [PATCH 077/243] Update template to 0.1.44 (#9104) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt 
b/requirements.txt index 14a085a2c..8f2f6a56c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.41 +comfyui-workflow-templates==0.1.44 comfyui-embedded-docs==0.2.4 torch torchsde From da9dab7edd36fa75e010d2c8498e1ba23b8b97dd Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 30 Jul 2025 02:55:26 -0700 Subject: [PATCH 078/243] Small wan camera memory optimization. (#9111) --- comfy/ldm/wan/model.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index a93a13c86..86d0795e9 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -769,8 +769,7 @@ class CameraWanModel(WanModel): # embeddings x = self.patch_embedding(x.float()).to(x.dtype) if self.control_adapter is not None and camera_conditions is not None: - x_camera = self.control_adapter(camera_conditions).to(x.dtype) - x = x + x_camera + x = x + self.control_adapter(camera_conditions).to(x.dtype) grid_sizes = x.shape[2:] x = x.flatten(2).transpose(1, 2) From 61b08d4ba65fec37070376bf50da3ec3c534e859 Mon Sep 17 00:00:00 2001 From: chaObserv <154517000+chaObserv@users.noreply.github.com> Date: Thu, 31 Jul 2025 07:25:56 +0800 Subject: [PATCH 079/243] Replace manual x * sigmoid(x) with torch silu in VAE nonlinearity (#9057) --- comfy/ldm/cosmos/cosmos_tokenizer/utils.py | 3 ++- comfy/ldm/modules/diffusionmodules/model.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/comfy/ldm/cosmos/cosmos_tokenizer/utils.py b/comfy/ldm/cosmos/cosmos_tokenizer/utils.py index 3af8d0d05..ca993006f 100644 --- a/comfy/ldm/cosmos/cosmos_tokenizer/utils.py +++ b/comfy/ldm/cosmos/cosmos_tokenizer/utils.py @@ -58,7 +58,8 @@ def is_odd(n: int) -> bool: def nonlinearity(x): - return x * torch.sigmoid(x) + # x * sigmoid(x) + return torch.nn.functional.silu(x) def Normalize(in_channels, num_groups=32): diff --git a/comfy/ldm/modules/diffusionmodules/model.py b/comfy/ldm/modules/diffusionmodules/model.py index 8162742cf..5c0373b74 100644 --- a/comfy/ldm/modules/diffusionmodules/model.py +++ b/comfy/ldm/modules/diffusionmodules/model.py @@ -36,7 +36,7 @@ def get_timestep_embedding(timesteps, embedding_dim): def nonlinearity(x): # swish - return x*torch.sigmoid(x) + return torch.nn.functional.silu(x) def Normalize(in_channels, num_groups=32): From 97eb256a355b434bbc96ec27bbce33dd10273857 Mon Sep 17 00:00:00 2001 From: guill Date: Wed, 30 Jul 2025 19:55:28 -0700 Subject: [PATCH 080/243] Add support for partial execution in backend (#9123) When a prompt is submitted, it can optionally include `partial_execution_targets` as a list of ids. If it does, rather than adding all outputs to the execution list, we add only those in the list. --- execution.py | 7 +- server.py | 7 +- tests/inference/test_async_nodes.py | 15 +- tests/inference/test_execution.py | 202 ++++++++++++++++-- .../testing-pack/specific_tests.py | 21 ++ 5 files changed, 233 insertions(+), 19 deletions(-) diff --git a/execution.py b/execution.py index 8a9663a7d..cde14c52f 100644 --- a/execution.py +++ b/execution.py @@ -7,7 +7,7 @@ import threading import time import traceback from enum import Enum -from typing import List, Literal, NamedTuple, Optional +from typing import List, Literal, NamedTuple, Optional, Union import asyncio import torch @@ -891,7 +891,7 @@ def full_type_name(klass): return klass.__qualname__ return module + '.' 
+ klass.__qualname__ -async def validate_prompt(prompt_id, prompt): +async def validate_prompt(prompt_id, prompt, partial_execution_list: Union[list[str], None]): outputs = set() for x in prompt: if 'class_type' not in prompt[x]: @@ -915,7 +915,8 @@ async def validate_prompt(prompt_id, prompt): return (False, error, [], {}) if hasattr(class_, 'OUTPUT_NODE') and class_.OUTPUT_NODE is True: - outputs.add(x) + if partial_execution_list is None or x in partial_execution_list: + outputs.add(x) if len(outputs) == 0: error = { diff --git a/server.py b/server.py index f4de0079b..3e06d2fbb 100644 --- a/server.py +++ b/server.py @@ -681,7 +681,12 @@ class PromptServer(): if "prompt" in json_data: prompt = json_data["prompt"] prompt_id = str(json_data.get("prompt_id", uuid.uuid4())) - valid = await execution.validate_prompt(prompt_id, prompt) + + partial_execution_targets = None + if "partial_execution_targets" in json_data: + partial_execution_targets = json_data["partial_execution_targets"] + + valid = await execution.validate_prompt(prompt_id, prompt, partial_execution_targets) extra_data = {} if "extra_data" in json_data: extra_data = json_data["extra_data"] diff --git a/tests/inference/test_async_nodes.py b/tests/inference/test_async_nodes.py index b243bbca9..f029953dd 100644 --- a/tests/inference/test_async_nodes.py +++ b/tests/inference/test_async_nodes.py @@ -7,7 +7,7 @@ import subprocess from pytest import fixture from comfy_execution.graph_utils import GraphBuilder -from tests.inference.test_execution import ComfyClient +from tests.inference.test_execution import ComfyClient, run_warmup @pytest.mark.execution @@ -24,6 +24,7 @@ class TestAsyncNodes: '--listen', args_pytest["listen"], '--port', str(args_pytest["port"]), '--extra-model-paths-config', 'tests/inference/extra_model_paths.yaml', + '--cpu', ] use_lru, lru_size = request.param if use_lru: @@ -82,6 +83,9 @@ class TestAsyncNodes: def test_multiple_async_parallel_execution(self, client: ComfyClient, builder: GraphBuilder): """Test that multiple async nodes execute in parallel.""" + # Warmup execution to ensure server is fully initialized + run_warmup(client) + g = builder image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) @@ -148,6 +152,9 @@ class TestAsyncNodes: def test_async_lazy_evaluation(self, client: ComfyClient, builder: GraphBuilder): """Test async nodes with lazy evaluation.""" + # Warmup execution to ensure server is fully initialized + run_warmup(client, prefix="warmup_lazy") + g = builder input1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) input2 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1) @@ -305,6 +312,9 @@ class TestAsyncNodes: def test_async_caching_behavior(self, client: ComfyClient, builder: GraphBuilder): """Test that async nodes are properly cached.""" + # Warmup execution to ensure server is fully initialized + run_warmup(client, prefix="warmup_cache") + g = builder image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) sleep_node = g.node("TestSleep", value=image.out(0), seconds=0.2) @@ -324,6 +334,9 @@ class TestAsyncNodes: def test_async_with_dynamic_prompts(self, client: ComfyClient, builder: GraphBuilder): """Test async nodes within dynamically generated prompts.""" + # Warmup execution to ensure server is fully initialized + run_warmup(client, prefix="warmup_dynamic") + g = builder image1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) image2 = 
g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1) diff --git a/tests/inference/test_execution.py b/tests/inference/test_execution.py index 9d3d685cc..e7b29302e 100644 --- a/tests/inference/test_execution.py +++ b/tests/inference/test_execution.py @@ -15,10 +15,18 @@ import urllib.parse import urllib.error from comfy_execution.graph_utils import GraphBuilder, Node +def run_warmup(client, prefix="warmup"): + """Run a simple workflow to warm up the server.""" + warmup_g = GraphBuilder(prefix=prefix) + warmup_image = warmup_g.node("StubImage", content="BLACK", height=32, width=32, batch_size=1) + warmup_g.node("PreviewImage", images=warmup_image.out(0)) + client.run(warmup_g) + class RunResult: def __init__(self, prompt_id: str): self.outputs: Dict[str,Dict] = {} self.runs: Dict[str,bool] = {} + self.cached: Dict[str,bool] = {} self.prompt_id: str = prompt_id def get_output(self, node: Node): @@ -27,6 +35,13 @@ class RunResult: def did_run(self, node: Node): return self.runs.get(node.id, False) + def was_cached(self, node: Node): + return self.cached.get(node.id, False) + + def was_executed(self, node: Node): + """Returns True if node was either run or cached""" + return self.did_run(node) or self.was_cached(node) + def get_images(self, node: Node): output = self.get_output(node) if output is None: @@ -51,8 +66,10 @@ class ComfyClient: ws.connect("ws://{}/ws?clientId={}".format(self.server_address, self.client_id)) self.ws = ws - def queue_prompt(self, prompt): + def queue_prompt(self, prompt, partial_execution_targets=None): p = {"prompt": prompt, "client_id": self.client_id} + if partial_execution_targets is not None: + p["partial_execution_targets"] = partial_execution_targets data = json.dumps(p).encode('utf-8') req = urllib.request.Request("http://{}/prompt".format(self.server_address), data=data) return json.loads(urllib.request.urlopen(req).read()) @@ -70,13 +87,13 @@ class ComfyClient: def set_test_name(self, name): self.test_name = name - def run(self, graph): + def run(self, graph, partial_execution_targets=None): prompt = graph.finalize() for node in graph.nodes.values(): if node.class_type == 'SaveImage': node.inputs['filename_prefix'] = self.test_name - prompt_id = self.queue_prompt(prompt)['prompt_id'] + prompt_id = self.queue_prompt(prompt, partial_execution_targets)['prompt_id'] result = RunResult(prompt_id) while True: out = self.ws.recv() @@ -92,7 +109,10 @@ class ComfyClient: elif message['type'] == 'execution_error': raise Exception(message['data']) elif message['type'] == 'execution_cached': - pass # Probably want to store this off for testing + if message['data']['prompt_id'] == prompt_id: + cached_nodes = message['data'].get('nodes', []) + for node_id in cached_nodes: + result.cached[node_id] = True history = self.get_history(prompt_id)[prompt_id] for node_id in history['outputs']: @@ -130,6 +150,7 @@ class TestExecution: '--listen', args_pytest["listen"], '--port', str(args_pytest["port"]), '--extra-model-paths-config', 'tests/inference/extra_model_paths.yaml', + '--cpu', ] use_lru, lru_size = request.param if use_lru: @@ -498,12 +519,15 @@ class TestExecution: assert not result.did_run(test_node), "The execution should have been cached" def test_parallel_sleep_nodes(self, client: ComfyClient, builder: GraphBuilder): + # Warmup execution to ensure server is fully initialized + run_warmup(client) + g = builder image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) # Create sleep nodes for each duration - sleep_node1 = 
g.node("TestSleep", value=image.out(0), seconds=2.8) - sleep_node2 = g.node("TestSleep", value=image.out(0), seconds=2.9) + sleep_node1 = g.node("TestSleep", value=image.out(0), seconds=2.9) + sleep_node2 = g.node("TestSleep", value=image.out(0), seconds=3.1) sleep_node3 = g.node("TestSleep", value=image.out(0), seconds=3.0) # Add outputs to verify the execution @@ -515,10 +539,9 @@ class TestExecution: result = client.run(g) elapsed_time = time.time() - start_time - # The test should take around 0.4 seconds (the longest sleep duration) - # plus some overhead, but definitely less than the sum of all sleeps (0.9s) - # We'll allow for up to 0.8s total to account for overhead - assert elapsed_time < 4.0, f"Parallel execution took {elapsed_time}s, expected less than 0.8s" + # The test should take around 3.0 seconds (the longest sleep duration) + # plus some overhead, but definitely less than the sum of all sleeps (9.0s) + assert elapsed_time < 8.9, f"Parallel execution took {elapsed_time}s, expected less than 8.9s" # Verify that all nodes executed assert result.did_run(sleep_node1), "Sleep node 1 should have run" @@ -526,6 +549,9 @@ class TestExecution: assert result.did_run(sleep_node3), "Sleep node 3 should have run" def test_parallel_sleep_expansion(self, client: ComfyClient, builder: GraphBuilder): + # Warmup execution to ensure server is fully initialized + run_warmup(client) + g = builder # Create input images with different values image1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) @@ -537,9 +563,9 @@ class TestExecution: image1=image1.out(0), image2=image2.out(0), image3=image3.out(0), - sleep1=0.4, - sleep2=0.5, - sleep3=0.6) + sleep1=4.8, + sleep2=4.9, + sleep3=5.0) output = g.node("SaveImage", images=parallel_sleep.out(0)) start_time = time.time() @@ -548,7 +574,7 @@ class TestExecution: # Similar to the previous test, expect parallel execution of the sleep nodes # which should complete in less than the sum of all sleeps - assert elapsed_time < 0.8, f"Expansion execution took {elapsed_time}s, expected less than 0.8s" + assert elapsed_time < 10.0, f"Expansion execution took {elapsed_time}s, expected less than 5.5s" # Verify the parallel sleep node executed assert result.did_run(parallel_sleep), "ParallelSleep node should have run" @@ -585,3 +611,151 @@ class TestExecution: assert len(images) == 2, "Should have 2 images" assert numpy.array(images[0]).min() == 0 and numpy.array(images[0]).max() == 0, "First image should be black" assert numpy.array(images[1]).min() == 0 and numpy.array(images[1]).max() == 0, "Second image should also be black" + + # Output nodes included in the partial execution list are executed + def test_partial_execution_included_outputs(self, client: ComfyClient, builder: GraphBuilder): + g = builder + input1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + input2 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1) + + # Create two separate output nodes + output1 = g.node("SaveImage", images=input1.out(0)) + output2 = g.node("SaveImage", images=input2.out(0)) + + # Run with partial execution targeting only output1 + result = client.run(g, partial_execution_targets=[output1.id]) + + assert result.was_executed(input1), "Input1 should have been executed (run or cached)" + assert result.was_executed(output1), "Output1 should have been executed (run or cached)" + assert not result.did_run(input2), "Input2 should not have run" + assert not result.did_run(output2), "Output2 should not 
have run" + + # Verify only output1 produced results + assert len(result.get_images(output1)) == 1, "Output1 should have produced an image" + assert len(result.get_images(output2)) == 0, "Output2 should not have produced an image" + + # Output nodes NOT included in the partial execution list are NOT executed + def test_partial_execution_excluded_outputs(self, client: ComfyClient, builder: GraphBuilder): + g = builder + input1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + input2 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1) + input3 = g.node("StubImage", content="NOISE", height=512, width=512, batch_size=1) + + # Create three output nodes + output1 = g.node("SaveImage", images=input1.out(0)) + output2 = g.node("SaveImage", images=input2.out(0)) + output3 = g.node("SaveImage", images=input3.out(0)) + + # Run with partial execution targeting only output1 and output3 + result = client.run(g, partial_execution_targets=[output1.id, output3.id]) + + assert result.was_executed(input1), "Input1 should have been executed" + assert result.was_executed(input3), "Input3 should have been executed" + assert result.was_executed(output1), "Output1 should have been executed" + assert result.was_executed(output3), "Output3 should have been executed" + assert not result.did_run(input2), "Input2 should not have run" + assert not result.did_run(output2), "Output2 should not have run" + + # Output nodes NOT in list ARE executed if necessary for nodes that are in the list + def test_partial_execution_dependencies(self, client: ComfyClient, builder: GraphBuilder): + g = builder + input1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + + # Create a processing chain with an OUTPUT_NODE that has socket outputs + output_with_socket = g.node("TestOutputNodeWithSocketOutput", image=input1.out(0), value=2.0) + + # Create another node that depends on the output_with_socket + dependent_node = g.node("TestLazyMixImages", + image1=output_with_socket.out(0), + image2=input1.out(0), + mask=g.node("StubMask", value=0.5, height=512, width=512, batch_size=1).out(0)) + + # Create the final output + final_output = g.node("SaveImage", images=dependent_node.out(0)) + + # Run with partial execution targeting only the final output + result = client.run(g, partial_execution_targets=[final_output.id]) + + # All nodes should have been executed because they're dependencies + assert result.was_executed(input1), "Input1 should have been executed" + assert result.was_executed(output_with_socket), "Output with socket should have been executed (dependency)" + assert result.was_executed(dependent_node), "Dependent node should have been executed" + assert result.was_executed(final_output), "Final output should have been executed" + + # Lazy execution works with partial execution + def test_partial_execution_with_lazy_nodes(self, client: ComfyClient, builder: GraphBuilder): + g = builder + input1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + input2 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1) + input3 = g.node("StubImage", content="NOISE", height=512, width=512, batch_size=1) + + # Create masks that will trigger different lazy execution paths + mask1 = g.node("StubMask", value=0.0, height=512, width=512, batch_size=1) # Will only need image1 + mask2 = g.node("StubMask", value=0.5, height=512, width=512, batch_size=1) # Will need both images + + # Create two lazy mix nodes + lazy_mix1 = 
g.node("TestLazyMixImages", image1=input1.out(0), image2=input2.out(0), mask=mask1.out(0)) + lazy_mix2 = g.node("TestLazyMixImages", image1=input2.out(0), image2=input3.out(0), mask=mask2.out(0)) + + output1 = g.node("SaveImage", images=lazy_mix1.out(0)) + output2 = g.node("SaveImage", images=lazy_mix2.out(0)) + + # Run with partial execution targeting only output1 + result = client.run(g, partial_execution_targets=[output1.id]) + + # For output1 path - only input1 should run due to lazy evaluation (mask=0.0) + assert result.was_executed(input1), "Input1 should have been executed" + assert not result.did_run(input2), "Input2 should not have run (lazy evaluation)" + assert result.was_executed(mask1), "Mask1 should have been executed" + assert result.was_executed(lazy_mix1), "Lazy mix1 should have been executed" + assert result.was_executed(output1), "Output1 should have been executed" + + # Nothing from output2 path should run + assert not result.did_run(input3), "Input3 should not have run" + assert not result.did_run(mask2), "Mask2 should not have run" + assert not result.did_run(lazy_mix2), "Lazy mix2 should not have run" + assert not result.did_run(output2), "Output2 should not have run" + + # Multiple OUTPUT_NODEs with dependencies + def test_partial_execution_multiple_output_nodes(self, client: ComfyClient, builder: GraphBuilder): + g = builder + input1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + input2 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1) + + # Create a chain of OUTPUT_NODEs + output_node1 = g.node("TestOutputNodeWithSocketOutput", image=input1.out(0), value=1.5) + output_node2 = g.node("TestOutputNodeWithSocketOutput", image=output_node1.out(0), value=2.0) + + # Create regular output nodes + save1 = g.node("SaveImage", images=output_node1.out(0)) + save2 = g.node("SaveImage", images=output_node2.out(0)) + save3 = g.node("SaveImage", images=input2.out(0)) + + # Run targeting only save2 + result = client.run(g, partial_execution_targets=[save2.id]) + + # Should run: input1, output_node1, output_node2, save2 + assert result.was_executed(input1), "Input1 should have been executed" + assert result.was_executed(output_node1), "Output node 1 should have been executed (dependency)" + assert result.was_executed(output_node2), "Output node 2 should have been executed (dependency)" + assert result.was_executed(save2), "Save2 should have been executed" + + # Should NOT run: input2, save1, save3 + assert not result.did_run(input2), "Input2 should not have run" + assert not result.did_run(save1), "Save1 should not have run" + assert not result.did_run(save3), "Save3 should not have run" + + # Empty partial execution list (should execute nothing) + def test_partial_execution_empty_list(self, client: ComfyClient, builder: GraphBuilder): + g = builder + input1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + _output1 = g.node("SaveImage", images=input1.out(0)) + + # Run with empty partial execution list + try: + _result = client.run(g, partial_execution_targets=[]) + # Should get an error because no outputs are selected + assert False, "Should have raised an error for empty partial execution list" + except urllib.error.HTTPError: + pass # Expected behavior + diff --git a/tests/inference/testing_nodes/testing-pack/specific_tests.py b/tests/inference/testing_nodes/testing-pack/specific_tests.py index 657d49f2f..4f8f01ae4 100644 --- a/tests/inference/testing_nodes/testing-pack/specific_tests.py +++ 
b/tests/inference/testing_nodes/testing-pack/specific_tests.py @@ -463,6 +463,25 @@ class TestParallelSleep(ComfyNodeABC): "expand": g.finalize(), } +class TestOutputNodeWithSocketOutput: + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "image": ("IMAGE",), + "value": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0}), + }, + } + RETURN_TYPES = ("IMAGE",) + FUNCTION = "process" + CATEGORY = "_for_testing" + OUTPUT_NODE = True + + def process(self, image, value): + # Apply value scaling and return both as output and socket + result = image * value + return (result,) + TEST_NODE_CLASS_MAPPINGS = { "TestLazyMixImages": TestLazyMixImages, "TestVariadicAverage": TestVariadicAverage, @@ -478,6 +497,7 @@ TEST_NODE_CLASS_MAPPINGS = { "TestSamplingInExpansion": TestSamplingInExpansion, "TestSleep": TestSleep, "TestParallelSleep": TestParallelSleep, + "TestOutputNodeWithSocketOutput": TestOutputNodeWithSocketOutput, } TEST_NODE_DISPLAY_NAME_MAPPINGS = { @@ -495,4 +515,5 @@ TEST_NODE_DISPLAY_NAME_MAPPINGS = { "TestSamplingInExpansion": "Sampling In Expansion", "TestSleep": "Test Sleep", "TestParallelSleep": "Test Parallel Sleep", + "TestOutputNodeWithSocketOutput": "Test Output Node With Socket Output", } From 97b8a2c26a335fe70ac6cfb44bf225454f51d700 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 31 Jul 2025 02:46:23 -0700 Subject: [PATCH 081/243] More accurate explanation of release process. (#9126) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index befc4c006..2abd8e600 100644 --- a/README.md +++ b/README.md @@ -111,7 +111,7 @@ Workflow examples can be found on the [Examples page](https://comfyanonymous.git ## Release Process -ComfyUI follows a weekly release cycle every Friday, with three interconnected repositories: +ComfyUI follows a weekly release cycle targeting Friday but this regularly changes because of model releases or large changes to the codebase. There are three interconnected repositories: 1. 
**[ComfyUI Core](https://github.com/comfyanonymous/ComfyUI)** - Releases a new stable version (e.g., v0.7.0) From 4887743a2aef67e05909aeea61f6cdc93e269de3 Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Thu, 31 Jul 2025 15:02:12 -0700 Subject: [PATCH 082/243] V3 Node Schema Definition - initial (#8656) --- comfy_api/internal/__init__.py | 143 +++ comfy_api/latest/__init__.py | 18 + comfy_api/latest/_io.py | 1618 ++++++++++++++++++++++++++++++++ comfy_api/latest/_resources.py | 72 ++ comfy_api/latest/_ui.py | 457 +++++++++ comfy_api/v0_0_2/__init__.py | 2 + comfy_execution/graph.py | 23 +- comfy_execution/graph_utils.py | 16 + execution.py | 140 ++- nodes.py | 37 +- server.py | 3 + 11 files changed, 2475 insertions(+), 54 deletions(-) create mode 100644 comfy_api/latest/_io.py create mode 100644 comfy_api/latest/_resources.py create mode 100644 comfy_api/latest/_ui.py diff --git a/comfy_api/internal/__init__.py b/comfy_api/internal/__init__.py index c00b1fdbb..4ca02e320 100644 --- a/comfy_api/internal/__init__.py +++ b/comfy_api/internal/__init__.py @@ -5,3 +5,146 @@ from .api_registry import ( register_versions as register_versions, get_all_versions as get_all_versions, ) + +import asyncio +from dataclasses import asdict +from typing import Callable, Optional + + +def first_real_override(cls: type, name: str, *, base: type=None) -> Optional[Callable]: + """Return the *callable* override of `name` visible on `cls`, or None if every + implementation up to (and including) `base` is the placeholder defined on `base`. + + If base is not provided, it will assume cls has a GET_BASE_CLASS + """ + if base is None: + if not hasattr(cls, "GET_BASE_CLASS"): + raise ValueError("base is required if cls does not have a GET_BASE_CLASS; is this a valid ComfyNode subclass?") + base = cls.GET_BASE_CLASS() + base_attr = getattr(base, name, None) + if base_attr is None: + return None + base_func = base_attr.__func__ + for c in cls.mro(): # NodeB, NodeA, ComfyNode, object … + if c is base: # reached the placeholder – we're done + break + if name in c.__dict__: # first class that *defines* the attr + func = getattr(c, name).__func__ + if func is not base_func: # real override + return getattr(cls, name) # bound to *cls* + return None + + +class _ComfyNodeInternal: + """Class that all V3-based APIs inherit from for ComfyNode. + + This is intended to only be referenced within execution.py, as it has to handle all V3 APIs going forward.""" + @classmethod + def GET_NODE_INFO_V1(cls): + ... + + +class _NodeOutputInternal: + """Class that all V3-based APIs inherit from for NodeOutput. + + This is intended to only be referenced within execution.py, as it has to handle all V3 APIs going forward.""" + ... + + +def as_pruned_dict(dataclass_obj): + '''Return dict of dataclass object with pruned None values.''' + return prune_dict(asdict(dataclass_obj)) + +def prune_dict(d: dict): + return {k: v for k,v in d.items() if v is not None} + + +def is_class(obj): + ''' + Returns True if is a class type. + Returns False if is a class instance. + ''' + return isinstance(obj, type) + + +def copy_class(cls: type) -> type: + ''' + Copy a class and its attributes. 
+ ''' + if cls is None: + return None + cls_dict = { + k: v for k, v in cls.__dict__.items() + if k not in ('__dict__', '__weakref__', '__module__', '__doc__') + } + # new class + new_cls = type( + cls.__name__, + (cls,), + cls_dict + ) + # metadata preservation + new_cls.__module__ = cls.__module__ + new_cls.__doc__ = cls.__doc__ + return new_cls + + +class classproperty(object): + def __init__(self, f): + self.f = f + def __get__(self, obj, owner): + return self.f(owner) + + +# NOTE: this was ai generated and validated by hand +def shallow_clone_class(cls, new_name=None): + ''' + Shallow clone a class while preserving super() functionality. + ''' + new_name = new_name or f"{cls.__name__}Clone" + # Include the original class in the bases to maintain proper inheritance + new_bases = (cls,) + cls.__bases__ + return type(new_name, new_bases, dict(cls.__dict__)) + +# NOTE: this was ai generated and validated by hand +def lock_class(cls): + ''' + Lock a class so that its top-levelattributes cannot be modified. + ''' + # Locked instance __setattr__ + def locked_instance_setattr(self, name, value): + raise AttributeError( + f"Cannot set attribute '{name}' on immutable instance of {type(self).__name__}" + ) + # Locked metaclass + class LockedMeta(type(cls)): + def __setattr__(cls_, name, value): + raise AttributeError( + f"Cannot modify class attribute '{name}' on locked class '{cls_.__name__}'" + ) + # Rebuild class with locked behavior + locked_dict = dict(cls.__dict__) + locked_dict['__setattr__'] = locked_instance_setattr + + return LockedMeta(cls.__name__, cls.__bases__, locked_dict) + + +def make_locked_method_func(type_obj, func, class_clone): + """ + Returns a function that, when called with **inputs, will execute: + getattr(type_obj, func).__func__(lock_class(class_clone), **inputs) + + Supports both synchronous and asynchronous methods. + """ + locked_class = lock_class(class_clone) + method = getattr(type_obj, func).__func__ + + # Check if the original method is async + if asyncio.iscoroutinefunction(method): + async def wrapped_async_func(**inputs): + return await method(locked_class, **inputs) + return wrapped_async_func + else: + def wrapped_func(**inputs): + return method(locked_class, **inputs) + return wrapped_func diff --git a/comfy_api/latest/__init__.py b/comfy_api/latest/__init__.py index e1f3a3655..2cee65aa9 100644 --- a/comfy_api/latest/__init__.py +++ b/comfy_api/latest/__init__.py @@ -1,5 +1,6 @@ from __future__ import annotations +from abc import ABC, abstractmethod from typing import Type, TYPE_CHECKING from comfy_api.internal import ComfyAPIBase from comfy_api.internal.singleton import ProxiedSingleton @@ -7,6 +8,9 @@ from comfy_api.internal.async_to_sync import create_sync_class from comfy_api.latest._input import ImageInput, AudioInput, MaskInput, LatentInput, VideoInput from comfy_api.latest._input_impl import VideoFromFile, VideoFromComponents from comfy_api.latest._util import VideoCodec, VideoContainer, VideoComponents +from comfy_api.latest._io import _IO as io #noqa: F401 +from comfy_api.latest._ui import _UI as ui #noqa: F401 +# from comfy_api.latest._resources import _RESOURCES as resources #noqa: F401 from comfy_execution.utils import get_executing_context from comfy_execution.progress import get_progress_state, PreviewImageTuple from PIL import Image @@ -72,6 +76,19 @@ class ComfyAPI_latest(ComfyAPIBase): execution: Execution +class ComfyExtension(ABC): + async def on_load(self) -> None: + """ + Called when an extension is loaded. 
+ This should be used to initialize any global resources neeeded by the extension. + """ + + @abstractmethod + async def get_node_list(self) -> list[type[io.ComfyNode]]: + """ + Returns a list of nodes that this extension provides. + """ + class Input: Image = ImageInput Audio = AudioInput @@ -103,4 +120,5 @@ __all__ = [ "Input", "InputImpl", "Types", + "ComfyExtension", ] diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py new file mode 100644 index 000000000..ec1efb51d --- /dev/null +++ b/comfy_api/latest/_io.py @@ -0,0 +1,1618 @@ +from __future__ import annotations + +import copy +import inspect +from abc import ABC, abstractmethod +from collections import Counter +from dataclasses import asdict, dataclass +from enum import Enum +from typing import Any, Callable, Literal, TypedDict, TypeVar, TYPE_CHECKING +from typing_extensions import NotRequired, final + +# used for type hinting +import torch + +if TYPE_CHECKING: + from spandrel import ImageModelDescriptor + from comfy.clip_vision import ClipVisionModel + from comfy.clip_vision import Output as ClipVisionOutput_ + from comfy.controlnet import ControlNet + from comfy.hooks import HookGroup, HookKeyframeGroup + from comfy.model_patcher import ModelPatcher + from comfy.samplers import CFGGuider, Sampler + from comfy.sd import CLIP, VAE + from comfy.sd import StyleModel as StyleModel_ + from comfy_api.input import VideoInput +from comfy_api.internal import (_ComfyNodeInternal, _NodeOutputInternal, classproperty, copy_class, first_real_override, is_class, + prune_dict, shallow_clone_class) +from comfy_api.latest._resources import Resources, ResourcesLocal +from comfy_execution.graph_utils import ExecutionBlocker + +# from comfy_extras.nodes_images import SVG as SVG_ # NOTE: needs to be moved before can be imported due to circular reference + +class FolderType(str, Enum): + input = "input" + output = "output" + temp = "temp" + + +class UploadType(str, Enum): + image = "image_upload" + audio = "audio_upload" + video = "video_upload" + model = "file_upload" + + +class RemoteOptions: + def __init__(self, route: str, refresh_button: bool, control_after_refresh: Literal["first", "last"]="first", + timeout: int=None, max_retries: int=None, refresh: int=None): + self.route = route + """The route to the remote source.""" + self.refresh_button = refresh_button + """Specifies whether to show a refresh button in the UI below the widget.""" + self.control_after_refresh = control_after_refresh + """Specifies the control after the refresh button is clicked. If "first", the first item will be automatically selected, and so on.""" + self.timeout = timeout + """The maximum amount of time to wait for a response from the remote source in milliseconds.""" + self.max_retries = max_retries + """The maximum number of retries before aborting the request.""" + self.refresh = refresh + """The TTL of the remote input's value in milliseconds. 
Specifies the interval at which the remote input's value is refreshed.""" + + def as_dict(self): + return prune_dict({ + "route": self.route, + "refresh_button": self.refresh_button, + "control_after_refresh": self.control_after_refresh, + "timeout": self.timeout, + "max_retries": self.max_retries, + "refresh": self.refresh, + }) + + +class NumberDisplay(str, Enum): + number = "number" + slider = "slider" + + +class _StringIOType(str): + def __ne__(self, value: object) -> bool: + if self == "*" or value == "*": + return False + if not isinstance(value, str): + return True + a = frozenset(self.split(",")) + b = frozenset(value.split(",")) + return not (b.issubset(a) or a.issubset(b)) + +class _ComfyType(ABC): + Type = Any + io_type: str = None + +# NOTE: this is a workaround to make the decorator return the correct type +T = TypeVar("T", bound=type) +def comfytype(io_type: str, **kwargs): + ''' + Decorator to mark nested classes as ComfyType; io_type will be bound to the class. + + A ComfyType may have the following attributes: + - Type = + - class Input(Input): ... + - class Output(Output): ... + ''' + def decorator(cls: T) -> T: + if isinstance(cls, _ComfyType) or issubclass(cls, _ComfyType): + # clone Input and Output classes to avoid modifying the original class + new_cls = cls + if hasattr(new_cls, "Input"): + new_cls.Input = copy_class(new_cls.Input) + if hasattr(new_cls, "Output"): + new_cls.Output = copy_class(new_cls.Output) + else: + # copy class attributes except for special ones that shouldn't be in type() + cls_dict = { + k: v for k, v in cls.__dict__.items() + if k not in ('__dict__', '__weakref__', '__module__', '__doc__') + } + # new class + new_cls: ComfyTypeIO = type( + cls.__name__, + (cls, ComfyTypeIO), + cls_dict + ) + # metadata preservation + new_cls.__module__ = cls.__module__ + new_cls.__doc__ = cls.__doc__ + # assign ComfyType attributes, if needed + # NOTE: use __ne__ trick for io_type (see node_typing.IO.__ne__ for details) + new_cls.io_type = _StringIOType(io_type) + if hasattr(new_cls, "Input") and new_cls.Input is not None: + new_cls.Input.Parent = new_cls + if hasattr(new_cls, "Output") and new_cls.Output is not None: + new_cls.Output.Parent = new_cls + return new_cls + return decorator + +def Custom(io_type: str) -> type[ComfyTypeIO]: + '''Create a ComfyType for a custom io_type.''' + @comfytype(io_type=io_type) + class CustomComfyType(ComfyTypeIO): + ... + return CustomComfyType + +class _IO_V3: + ''' + Base class for V3 Inputs and Outputs. + ''' + Parent: _ComfyType = None + + def __init__(self): + pass + + @property + def io_type(self): + return self.Parent.io_type + + @property + def Type(self): + return self.Parent.Type + +class Input(_IO_V3): + ''' + Base class for a V3 Input. + ''' + def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, extra_dict=None): + super().__init__() + self.id = id + self.display_name = display_name + self.optional = optional + self.tooltip = tooltip + self.lazy = lazy + self.extra_dict = extra_dict if extra_dict is not None else {} + + def as_dict(self): + return prune_dict({ + "display_name": self.display_name, + "optional": self.optional, + "tooltip": self.tooltip, + "lazy": self.lazy, + }) | prune_dict(self.extra_dict) + + def get_io_type(self): + return _StringIOType(self.io_type) + +class WidgetInput(Input): + ''' + Base class for a V3 Input with widget. 
+ ''' + def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, + default: Any=None, + socketless: bool=None, widget_type: str=None, force_input: bool=None, extra_dict=None): + super().__init__(id, display_name, optional, tooltip, lazy, extra_dict) + self.default = default + self.socketless = socketless + self.widget_type = widget_type + self.force_input = force_input + + def as_dict(self): + return super().as_dict() | prune_dict({ + "default": self.default, + "socketless": self.socketless, + "widgetType": self.widget_type, + "forceInput": self.force_input, + }) + + def get_io_type(self): + return self.widget_type if self.widget_type is not None else super().get_io_type() + + +class Output(_IO_V3): + def __init__(self, id: str=None, display_name: str=None, tooltip: str=None, + is_output_list=False): + self.id = id + self.display_name = display_name + self.tooltip = tooltip + self.is_output_list = is_output_list + + def as_dict(self): + return prune_dict({ + "display_name": self.display_name, + "tooltip": self.tooltip, + "is_output_list": self.is_output_list, + }) + + def get_io_type(self): + return self.io_type + + +class ComfyTypeI(_ComfyType): + '''ComfyType subclass that only has a default Input class - intended for types that only have Inputs.''' + class Input(Input): + ... + +class ComfyTypeIO(ComfyTypeI): + '''ComfyType subclass that has default Input and Output classes; useful for types with both Inputs and Outputs.''' + class Output(Output): + ... + + +@comfytype(io_type="BOOLEAN") +class Boolean(ComfyTypeIO): + Type = bool + + class Input(WidgetInput): + '''Boolean input.''' + def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, + default: bool=None, label_on: str=None, label_off: str=None, + socketless: bool=None, force_input: bool=None): + super().__init__(id, display_name, optional, tooltip, lazy, default, socketless, None, force_input) + self.label_on = label_on + self.label_off = label_off + self.default: bool + + def as_dict(self): + return super().as_dict() | prune_dict({ + "label_on": self.label_on, + "label_off": self.label_off, + }) + +@comfytype(io_type="INT") +class Int(ComfyTypeIO): + Type = int + + class Input(WidgetInput): + '''Integer input.''' + def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, + default: int=None, min: int=None, max: int=None, step: int=None, control_after_generate: bool=None, + display_mode: NumberDisplay=None, socketless: bool=None, force_input: bool=None): + super().__init__(id, display_name, optional, tooltip, lazy, default, socketless, None, force_input) + self.min = min + self.max = max + self.step = step + self.control_after_generate = control_after_generate + self.display_mode = display_mode + self.default: int + + def as_dict(self): + return super().as_dict() | prune_dict({ + "min": self.min, + "max": self.max, + "step": self.step, + "control_after_generate": self.control_after_generate, + "display": self.display_mode.value if self.display_mode else None, + }) + +@comfytype(io_type="FLOAT") +class Float(ComfyTypeIO): + Type = float + + class Input(WidgetInput): + '''Float input.''' + def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, + default: float=None, min: float=None, max: float=None, step: float=None, round: float=None, + display_mode: NumberDisplay=None, socketless: bool=None, force_input: bool=None): + super().__init__(id, 
display_name, optional, tooltip, lazy, default, socketless, None, force_input) + self.min = min + self.max = max + self.step = step + self.round = round + self.display_mode = display_mode + self.default: float + + def as_dict(self): + return super().as_dict() | prune_dict({ + "min": self.min, + "max": self.max, + "step": self.step, + "round": self.round, + "display": self.display_mode, + }) + +@comfytype(io_type="STRING") +class String(ComfyTypeIO): + Type = str + + class Input(WidgetInput): + '''String input.''' + def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, + multiline=False, placeholder: str=None, default: str=None, dynamic_prompts: bool=None, + socketless: bool=None, force_input: bool=None): + super().__init__(id, display_name, optional, tooltip, lazy, default, socketless, None, force_input) + self.multiline = multiline + self.placeholder = placeholder + self.dynamic_prompts = dynamic_prompts + self.default: str + + def as_dict(self): + return super().as_dict() | prune_dict({ + "multiline": self.multiline, + "placeholder": self.placeholder, + "dynamicPrompts": self.dynamic_prompts, + }) + +@comfytype(io_type="COMBO") +class Combo(ComfyTypeI): + Type = str + class Input(WidgetInput): + """Combo input (dropdown).""" + Type = str + def __init__(self, id: str, options: list[str]=None, display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, + default: str=None, control_after_generate: bool=None, + upload: UploadType=None, image_folder: FolderType=None, + remote: RemoteOptions=None, + socketless: bool=None): + super().__init__(id, display_name, optional, tooltip, lazy, default, socketless) + self.multiselect = False + self.options = options + self.control_after_generate = control_after_generate + self.upload = upload + self.image_folder = image_folder + self.remote = remote + self.default: str + + def as_dict(self): + return super().as_dict() | prune_dict({ + "multiselect": self.multiselect, + "options": self.options, + "control_after_generate": self.control_after_generate, + **({self.upload.value: True} if self.upload is not None else {}), + "image_folder": self.image_folder.value if self.image_folder else None, + "remote": self.remote.as_dict() if self.remote else None, + }) + + +@comfytype(io_type="COMBO") +class MultiCombo(ComfyTypeI): + '''Multiselect Combo input (dropdown for selecting potentially more than one value).''' + # TODO: something is wrong with the serialization, frontend does not recognize it as multiselect + Type = list[str] + class Input(Combo.Input): + def __init__(self, id: str, options: list[str], display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, + default: list[str]=None, placeholder: str=None, chip: bool=None, control_after_generate: bool=None, + socketless: bool=None): + super().__init__(id, options, display_name, optional, tooltip, lazy, default, control_after_generate, socketless=socketless) + self.multiselect = True + self.placeholder = placeholder + self.chip = chip + self.default: list[str] + + def as_dict(self): + to_return = super().as_dict() | prune_dict({ + "multi_select": self.multiselect, + "placeholder": self.placeholder, + "chip": self.chip, + }) + return to_return + +@comfytype(io_type="IMAGE") +class Image(ComfyTypeIO): + Type = torch.Tensor + + +@comfytype(io_type="WAN_CAMERA_EMBEDDING") +class WanCameraEmbedding(ComfyTypeIO): + Type = torch.Tensor + + +@comfytype(io_type="WEBCAM") +class Webcam(ComfyTypeIO): + Type = str + + class 
Input(WidgetInput): + """Webcam input.""" + Type = str + def __init__( + self, id: str, display_name: str=None, optional=False, + tooltip: str=None, lazy: bool=None, default: str=None, socketless: bool=None + ): + super().__init__(id, display_name, optional, tooltip, lazy, default, socketless) + + +@comfytype(io_type="MASK") +class Mask(ComfyTypeIO): + Type = torch.Tensor + +@comfytype(io_type="LATENT") +class Latent(ComfyTypeIO): + '''Latents are stored as a dictionary.''' + class LatentDict(TypedDict): + samples: torch.Tensor + '''Latent tensors.''' + noise_mask: NotRequired[torch.Tensor] + batch_index: NotRequired[list[int]] + type: NotRequired[str] + '''Only needed if dealing with these types: audio, hunyuan3dv2''' + Type = LatentDict + +@comfytype(io_type="CONDITIONING") +class Conditioning(ComfyTypeIO): + class PooledDict(TypedDict): + pooled_output: torch.Tensor + '''Pooled output from CLIP.''' + control: NotRequired[ControlNet] + '''ControlNet to apply to conditioning.''' + control_apply_to_uncond: NotRequired[bool] + '''Whether to apply ControlNet to matching negative conditioning at sample time, if applicable.''' + cross_attn_controlnet: NotRequired[torch.Tensor] + '''CrossAttn from CLIP to use for controlnet only.''' + pooled_output_controlnet: NotRequired[torch.Tensor] + '''Pooled output from CLIP to use for controlnet only.''' + gligen: NotRequired[tuple[str, Gligen, list[tuple[torch.Tensor, int, ...]]]] + '''GLIGEN to apply to conditioning.''' + area: NotRequired[tuple[int, ...] | tuple[str, float, ...]] + '''Set area of conditioning. First half of values apply to dimensions, the second half apply to coordinates. + By default, the dimensions are based on total pixel amount, but the first value can be set to "percentage" to use a percentage of the image size instead. + + (1024, 1024, 0, 0) would apply conditioning to the top-left 1024x1024 pixels. + + ("percentage", 0.5, 0.5, 0, 0) would apply conditioning to the top-left 50% of the image.''' # TODO: verify its actually top-left + strength: NotRequired[float] + '''Strength of conditioning. Default strength is 1.0.''' + mask: NotRequired[torch.Tensor] + '''Mask to apply conditioning to.''' + mask_strength: NotRequired[float] + '''Strength of conditioning mask. 
Default strength is 1.0.''' + set_area_to_bounds: NotRequired[bool] + '''Whether conditioning mask should determine bounds of area - if set to false, latents are sampled at full resolution and result is applied in mask.''' + concat_latent_image: NotRequired[torch.Tensor] + '''Used for inpainting and specific models.''' + concat_mask: NotRequired[torch.Tensor] + '''Used for inpainting and specific models.''' + concat_image: NotRequired[torch.Tensor] + '''Used by SD_4XUpscale_Conditioning.''' + noise_augmentation: NotRequired[float] + '''Used by SD_4XUpscale_Conditioning.''' + hooks: NotRequired[HookGroup] + '''Applies hooks to conditioning.''' + default: NotRequired[bool] + '''Whether to this conditioning is 'default'; default conditioning gets applied to any areas of the image that have no masks/areas applied, assuming at least one area/mask is present during sampling.''' + start_percent: NotRequired[float] + '''Determines relative step to begin applying conditioning, expressed as a float between 0.0 and 1.0.''' + end_percent: NotRequired[float] + '''Determines relative step to end applying conditioning, expressed as a float between 0.0 and 1.0.''' + clip_start_percent: NotRequired[float] + '''Internal variable for conditioning scheduling - start of application, expressed as a float between 0.0 and 1.0.''' + clip_end_percent: NotRequired[float] + '''Internal variable for conditioning scheduling - end of application, expressed as a float between 0.0 and 1.0.''' + attention_mask: NotRequired[torch.Tensor] + '''Masks text conditioning; used by StyleModel among others.''' + attention_mask_img_shape: NotRequired[tuple[int, ...]] + '''Masks text conditioning; used by StyleModel among others.''' + unclip_conditioning: NotRequired[list[dict]] + '''Used by unCLIP.''' + conditioning_lyrics: NotRequired[torch.Tensor] + '''Used by AceT5Model.''' + seconds_start: NotRequired[float] + '''Used by StableAudio.''' + seconds_total: NotRequired[float] + '''Used by StableAudio.''' + lyrics_strength: NotRequired[float] + '''Used by AceStepAudio.''' + width: NotRequired[int] + '''Used by certain models (e.g. CLIPTextEncodeSDXL/Refiner, PixArtAlpha).''' + height: NotRequired[int] + '''Used by certain models (e.g. 
CLIPTextEncodeSDXL/Refiner, PixArtAlpha).''' + aesthetic_score: NotRequired[float] + '''Used by CLIPTextEncodeSDXL/Refiner.''' + crop_w: NotRequired[int] + '''Used by CLIPTextEncodeSDXL.''' + crop_h: NotRequired[int] + '''Used by CLIPTextEncodeSDXL.''' + target_width: NotRequired[int] + '''Used by CLIPTextEncodeSDXL.''' + target_height: NotRequired[int] + '''Used by CLIPTextEncodeSDXL.''' + reference_latents: NotRequired[list[torch.Tensor]] + '''Used by ReferenceLatent.''' + guidance: NotRequired[float] + '''Used by Flux-like models with guidance embed.''' + guiding_frame_index: NotRequired[int] + '''Used by Hunyuan ImageToVideo.''' + ref_latent: NotRequired[torch.Tensor] + '''Used by Hunyuan ImageToVideo.''' + keyframe_idxs: NotRequired[list[int]] + '''Used by LTXV.''' + frame_rate: NotRequired[float] + '''Used by LTXV.''' + stable_cascade_prior: NotRequired[torch.Tensor] + '''Used by StableCascade.''' + elevation: NotRequired[list[float]] + '''Used by SV3D.''' + azimuth: NotRequired[list[float]] + '''Used by SV3D.''' + motion_bucket_id: NotRequired[int] + '''Used by SVD-like models.''' + fps: NotRequired[int] + '''Used by SVD-like models.''' + augmentation_level: NotRequired[float] + '''Used by SVD-like models.''' + clip_vision_output: NotRequired[ClipVisionOutput_] + '''Used by WAN-like models.''' + vace_frames: NotRequired[torch.Tensor] + '''Used by WAN VACE.''' + vace_mask: NotRequired[torch.Tensor] + '''Used by WAN VACE.''' + vace_strength: NotRequired[float] + '''Used by WAN VACE.''' + camera_conditions: NotRequired[Any] # TODO: assign proper type once defined + '''Used by WAN Camera.''' + time_dim_concat: NotRequired[torch.Tensor] + '''Used by WAN Phantom Subject.''' + + CondList = list[tuple[torch.Tensor, PooledDict]] + Type = CondList + +@comfytype(io_type="SAMPLER") +class Sampler(ComfyTypeIO): + if TYPE_CHECKING: + Type = Sampler + +@comfytype(io_type="SIGMAS") +class Sigmas(ComfyTypeIO): + Type = torch.Tensor + +@comfytype(io_type="NOISE") +class Noise(ComfyTypeIO): + Type = torch.Tensor + +@comfytype(io_type="GUIDER") +class Guider(ComfyTypeIO): + if TYPE_CHECKING: + Type = CFGGuider + +@comfytype(io_type="CLIP") +class Clip(ComfyTypeIO): + if TYPE_CHECKING: + Type = CLIP + +@comfytype(io_type="CONTROL_NET") +class ControlNet(ComfyTypeIO): + if TYPE_CHECKING: + Type = ControlNet + +@comfytype(io_type="VAE") +class Vae(ComfyTypeIO): + if TYPE_CHECKING: + Type = VAE + +@comfytype(io_type="MODEL") +class Model(ComfyTypeIO): + if TYPE_CHECKING: + Type = ModelPatcher + +@comfytype(io_type="CLIP_VISION") +class ClipVision(ComfyTypeIO): + if TYPE_CHECKING: + Type = ClipVisionModel + +@comfytype(io_type="CLIP_VISION_OUTPUT") +class ClipVisionOutput(ComfyTypeIO): + if TYPE_CHECKING: + Type = ClipVisionOutput_ + +@comfytype(io_type="STYLE_MODEL") +class StyleModel(ComfyTypeIO): + if TYPE_CHECKING: + Type = StyleModel_ + +@comfytype(io_type="GLIGEN") +class Gligen(ComfyTypeIO): + '''ModelPatcher that wraps around a 'Gligen' model.''' + if TYPE_CHECKING: + Type = ModelPatcher + +@comfytype(io_type="UPSCALE_MODEL") +class UpscaleModel(ComfyTypeIO): + if TYPE_CHECKING: + Type = ImageModelDescriptor + +@comfytype(io_type="AUDIO") +class Audio(ComfyTypeIO): + class AudioDict(TypedDict): + waveform: torch.Tensor + sampler_rate: int + Type = AudioDict + +@comfytype(io_type="VIDEO") +class Video(ComfyTypeIO): + if TYPE_CHECKING: + Type = VideoInput + +@comfytype(io_type="SVG") +class SVG(ComfyTypeIO): + Type = Any # TODO: SVG class is defined in comfy_extras/nodes_images.py, causing circular 
reference; should be moved to somewhere else before referenced directly in v3 + +@comfytype(io_type="LORA_MODEL") +class LoraModel(ComfyTypeIO): + Type = dict[str, torch.Tensor] + +@comfytype(io_type="LOSS_MAP") +class LossMap(ComfyTypeIO): + class LossMapDict(TypedDict): + loss: list[torch.Tensor] + Type = LossMapDict + +@comfytype(io_type="VOXEL") +class Voxel(ComfyTypeIO): + Type = Any # TODO: VOXEL class is defined in comfy_extras/nodes_hunyuan3d.py; should be moved to somewhere else before referenced directly in v3 + +@comfytype(io_type="MESH") +class Mesh(ComfyTypeIO): + Type = Any # TODO: MESH class is defined in comfy_extras/nodes_hunyuan3d.py; should be moved to somewhere else before referenced directly in v3 + +@comfytype(io_type="HOOKS") +class Hooks(ComfyTypeIO): + if TYPE_CHECKING: + Type = HookGroup + +@comfytype(io_type="HOOK_KEYFRAMES") +class HookKeyframes(ComfyTypeIO): + if TYPE_CHECKING: + Type = HookKeyframeGroup + +@comfytype(io_type="TIMESTEPS_RANGE") +class TimestepsRange(ComfyTypeIO): + '''Range defined by start and endpoint, between 0.0 and 1.0.''' + Type = tuple[int, int] + +@comfytype(io_type="LATENT_OPERATION") +class LatentOperation(ComfyTypeIO): + Type = Callable[[torch.Tensor], torch.Tensor] + +@comfytype(io_type="FLOW_CONTROL") +class FlowControl(ComfyTypeIO): + # NOTE: only used in testing_nodes right now + Type = tuple[str, Any] + +@comfytype(io_type="ACCUMULATION") +class Accumulation(ComfyTypeIO): + # NOTE: only used in testing_nodes right now + class AccumulationDict(TypedDict): + accum: list[Any] + Type = AccumulationDict + + +@comfytype(io_type="LOAD3D_CAMERA") +class Load3DCamera(ComfyTypeIO): + class CameraInfo(TypedDict): + position: dict[str, float | int] + target: dict[str, float | int] + zoom: int + cameraType: str + + Type = CameraInfo + + +@comfytype(io_type="LOAD_3D") +class Load3D(ComfyTypeIO): + """3D models are stored as a dictionary.""" + class Model3DDict(TypedDict): + image: str + mask: str + normal: str + camera_info: Load3DCamera.CameraInfo + recording: NotRequired[str] + + Type = Model3DDict + + +@comfytype(io_type="LOAD_3D_ANIMATION") +class Load3DAnimation(Load3D): + ... + + +@comfytype(io_type="PHOTOMAKER") +class Photomaker(ComfyTypeIO): + Type = Any + + +@comfytype(io_type="POINT") +class Point(ComfyTypeIO): + Type = Any # NOTE: I couldn't find any references in core code to POINT io_type. Does this exist? + +@comfytype(io_type="FACE_ANALYSIS") +class FaceAnalysis(ComfyTypeIO): + Type = Any # NOTE: I couldn't find any references in core code to POINT io_type. Does this exist? + +@comfytype(io_type="BBOX") +class BBOX(ComfyTypeIO): + Type = Any # NOTE: I couldn't find any references in core code to POINT io_type. Does this exist? + +@comfytype(io_type="SEGS") +class SEGS(ComfyTypeIO): + Type = Any # NOTE: I couldn't find any references in core code to POINT io_type. Does this exist? + +@comfytype(io_type="*") +class AnyType(ComfyTypeIO): + Type = Any + +@comfytype(io_type="COMFY_MULTITYPED_V3") +class MultiType: + Type = Any + class Input(Input): + ''' + Input that permits more than one input type; if `id` is an instance of `ComfyType.Input`, then that input will be used to create a widget (if applicable) with overridden values. 
+ ''' + def __init__(self, id: str | Input, types: list[type[_ComfyType] | _ComfyType], display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, extra_dict=None): + # if id is an Input, then use that Input with overridden values + self.input_override = None + if isinstance(id, Input): + self.input_override = copy.copy(id) + optional = id.optional if id.optional is True else optional + tooltip = id.tooltip if id.tooltip is not None else tooltip + display_name = id.display_name if id.display_name is not None else display_name + lazy = id.lazy if id.lazy is not None else lazy + id = id.id + # if is a widget input, make sure widget_type is set appropriately + if isinstance(self.input_override, WidgetInput): + self.input_override.widget_type = self.input_override.get_io_type() + super().__init__(id, display_name, optional, tooltip, lazy, extra_dict) + self._io_types = types + + @property + def io_types(self) -> list[type[Input]]: + ''' + Returns list of Input class types permitted. + ''' + io_types = [] + for x in self._io_types: + if not is_class(x): + io_types.append(type(x)) + else: + io_types.append(x) + return io_types + + def get_io_type(self): + # ensure types are unique and order is preserved + str_types = [x.io_type for x in self.io_types] + if self.input_override is not None: + str_types.insert(0, self.input_override.get_io_type()) + return ",".join(list(dict.fromkeys(str_types))) + + def as_dict(self): + if self.input_override is not None: + return self.input_override.as_dict() | super().as_dict() + else: + return super().as_dict() + +class DynamicInput(Input, ABC): + ''' + Abstract class for dynamic input registration. + ''' + @abstractmethod + def get_dynamic(self) -> list[Input]: + ... + +class DynamicOutput(Output, ABC): + ''' + Abstract class for dynamic output registration. + ''' + def __init__(self, id: str=None, display_name: str=None, tooltip: str=None, + is_output_list=False): + super().__init__(id, display_name, tooltip, is_output_list) + + @abstractmethod + def get_dynamic(self) -> list[Output]: + ... 
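+
+# Illustrative sketch (hypothetical ids, options and defaults): the input and
+# output classes above are what a node instantiates when declaring its schema,
+# e.g.
+#
+#     inputs = [
+#         Image.Input("pixels", tooltip="Image to rescale."),
+#         Float.Input("scale", default=1.0, min=0.1, max=8.0, step=0.1,
+#                     display_mode=NumberDisplay.slider),
+#         Combo.Input("method", options=["nearest", "bilinear", "bicubic"],
+#                     default="bilinear"),
+#         MultiType.Input("strength", types=[Float, Int], optional=True),
+#     ]
+#     outputs = [Image.Output(display_name="scaled")]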
+ + +@comfytype(io_type="COMFY_AUTOGROW_V3") +class AutogrowDynamic(ComfyTypeI): + Type = list[Any] + class Input(DynamicInput): + def __init__(self, id: str, template_input: Input, min: int=1, max: int=None, + display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, extra_dict=None): + super().__init__(id, display_name, optional, tooltip, lazy, extra_dict) + self.template_input = template_input + if min is not None: + assert(min >= 1) + if max is not None: + assert(max >= 1) + self.min = min + self.max = max + + def get_dynamic(self) -> list[Input]: + curr_count = 1 + new_inputs = [] + for i in range(self.min): + new_input = copy.copy(self.template_input) + new_input.id = f"{new_input.id}{curr_count}_${self.id}_ag$" + if new_input.display_name is not None: + new_input.display_name = f"{new_input.display_name}{curr_count}" + new_input.optional = self.optional or new_input.optional + if isinstance(self.template_input, WidgetInput): + new_input.force_input = True + new_inputs.append(new_input) + curr_count += 1 + # pretend to expand up to max + for i in range(curr_count-1, self.max): + new_input = copy.copy(self.template_input) + new_input.id = f"{new_input.id}{curr_count}_${self.id}_ag$" + if new_input.display_name is not None: + new_input.display_name = f"{new_input.display_name}{curr_count}" + new_input.optional = True + if isinstance(self.template_input, WidgetInput): + new_input.force_input = True + new_inputs.append(new_input) + curr_count += 1 + return new_inputs + +@comfytype(io_type="COMFY_COMBODYNAMIC_V3") +class ComboDynamic(ComfyTypeI): + class Input(DynamicInput): + def __init__(self, id: str): + pass + +@comfytype(io_type="COMFY_MATCHTYPE_V3") +class MatchType(ComfyTypeIO): + class Template: + def __init__(self, template_id: str, allowed_types: _ComfyType | list[_ComfyType]): + self.template_id = template_id + self.allowed_types = [allowed_types] if isinstance(allowed_types, _ComfyType) else allowed_types + + def as_dict(self): + return { + "template_id": self.template_id, + "allowed_types": "".join(t.io_type for t in self.allowed_types), + } + + class Input(DynamicInput): + def __init__(self, id: str, template: MatchType.Template, + display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, extra_dict=None): + super().__init__(id, display_name, optional, tooltip, lazy, extra_dict) + self.template = template + + def get_dynamic(self) -> list[Input]: + return [self] + + def as_dict(self): + return super().as_dict() | prune_dict({ + "template": self.template.as_dict(), + }) + + class Output(DynamicOutput): + def __init__(self, id: str, template: MatchType.Template, display_name: str=None, tooltip: str=None, + is_output_list=False): + super().__init__(id, display_name, tooltip, is_output_list) + self.template = template + + def get_dynamic(self) -> list[Output]: + return [self] + + def as_dict(self): + return super().as_dict() | prune_dict({ + "template": self.template.as_dict(), + }) + + +class HiddenHolder: + def __init__(self, unique_id: str, prompt: Any, + extra_pnginfo: Any, dynprompt: Any, + auth_token_comfy_org: str, api_key_comfy_org: str, **kwargs): + self.unique_id = unique_id + """UNIQUE_ID is the unique identifier of the node, and matches the id property of the node on the client side. It is commonly used in client-server communications (see messages).""" + self.prompt = prompt + """PROMPT is the complete prompt sent by the client to the server. 
See the prompt object for a full description.""" + self.extra_pnginfo = extra_pnginfo + """EXTRA_PNGINFO is a dictionary that will be copied into the metadata of any .png files saved. Custom nodes can store additional information in this dictionary for saving (or as a way to communicate with a downstream node).""" + self.dynprompt = dynprompt + """DYNPROMPT is an instance of comfy_execution.graph.DynamicPrompt. It differs from PROMPT in that it may mutate during the course of execution in response to Node Expansion.""" + self.auth_token_comfy_org = auth_token_comfy_org + """AUTH_TOKEN_COMFY_ORG is a token acquired from signing into a ComfyOrg account on frontend.""" + self.api_key_comfy_org = api_key_comfy_org + """API_KEY_COMFY_ORG is an API Key generated by ComfyOrg that allows skipping signing into a ComfyOrg account on frontend.""" + + def __getattr__(self, key: str): + '''If hidden variable not found, return None.''' + return None + + @classmethod + def from_dict(cls, d: dict | None): + if d is None: + d = {} + return cls( + unique_id=d.get(Hidden.unique_id, None), + prompt=d.get(Hidden.prompt, None), + extra_pnginfo=d.get(Hidden.extra_pnginfo, None), + dynprompt=d.get(Hidden.dynprompt, None), + auth_token_comfy_org=d.get(Hidden.auth_token_comfy_org, None), + api_key_comfy_org=d.get(Hidden.api_key_comfy_org, None), + ) + +class Hidden(str, Enum): + ''' + Enumerator for requesting hidden variables in nodes. + ''' + unique_id = "UNIQUE_ID" + """UNIQUE_ID is the unique identifier of the node, and matches the id property of the node on the client side. It is commonly used in client-server communications (see messages).""" + prompt = "PROMPT" + """PROMPT is the complete prompt sent by the client to the server. See the prompt object for a full description.""" + extra_pnginfo = "EXTRA_PNGINFO" + """EXTRA_PNGINFO is a dictionary that will be copied into the metadata of any .png files saved. Custom nodes can store additional information in this dictionary for saving (or as a way to communicate with a downstream node).""" + dynprompt = "DYNPROMPT" + """DYNPROMPT is an instance of comfy_execution.graph.DynamicPrompt. It differs from PROMPT in that it may mutate during the course of execution in response to Node Expansion.""" + auth_token_comfy_org = "AUTH_TOKEN_COMFY_ORG" + """AUTH_TOKEN_COMFY_ORG is a token acquired from signing into a ComfyOrg account on frontend.""" + api_key_comfy_org = "API_KEY_COMFY_ORG" + """API_KEY_COMFY_ORG is an API Key generated by ComfyOrg that allows skipping signing into a ComfyOrg account on frontend.""" + + +@dataclass +class NodeInfoV1: + input: dict=None + input_order: dict[str, list[str]]=None + output: list[str]=None + output_is_list: list[bool]=None + output_name: list[str]=None + output_tooltips: list[str]=None + name: str=None + display_name: str=None + description: str=None + python_module: Any=None + category: str=None + output_node: bool=None + deprecated: bool=None + experimental: bool=None + api_node: bool=None + +@dataclass +class NodeInfoV3: + input: dict=None + output: dict=None + hidden: list[str]=None + name: str=None + display_name: str=None + description: str=None + category: str=None + output_node: bool=None + deprecated: bool=None + experimental: bool=None + api_node: bool=None + + +@dataclass +class Schema: + """Definition of V3 node properties.""" + + node_id: str + """ID of node - should be globally unique. 
If this is a custom node, add a prefix or postfix to avoid name clashes.""" + display_name: str = None + """Display name of node.""" + category: str = "sd" + """The category of the node, as per the "Add Node" menu.""" + inputs: list[Input]=None + outputs: list[Output]=None + hidden: list[Hidden]=None + description: str="" + """Node description, shown as a tooltip when hovering over the node.""" + is_input_list: bool = False + """A flag indicating if this node implements the additional code necessary to deal with OUTPUT_IS_LIST nodes. + + All inputs of ``type`` will become ``list[type]``, regardless of how many items are passed in. This also affects ``check_lazy_status``. + + From the docs: + + A node can also override the default input behaviour and receive the whole list in a single call. This is done by setting a class attribute `INPUT_IS_LIST` to ``True``. + + Comfy Docs: https://docs.comfy.org/custom-nodes/backend/lists#list-processing + """ + is_output_node: bool=False + """Flags this node as an output node, causing any inputs it requires to be executed. + + If a node is not connected to any output nodes, that node will not be executed. Usage:: + + From the docs: + + By default, a node is not considered an output. Set ``OUTPUT_NODE = True`` to specify that it is. + + Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#output-node + """ + is_deprecated: bool=False + """Flags a node as deprecated, indicating to users that they should find alternatives to this node.""" + is_experimental: bool=False + """Flags a node as experimental, informing users that it may change or not work as expected.""" + is_api_node: bool=False + """Flags a node as an API node. See: https://docs.comfy.org/tutorials/api-nodes/overview.""" + not_idempotent: bool=False + """Flags a node as not idempotent; when True, the node will run and not reuse the cached outputs when identical inputs are provided on a different node in the graph.""" + enable_expand: bool=False + """Flags a node as expandable, allowing NodeOutput to include 'expand' property.""" + + def validate(self): + '''Validate the schema: + - verify ids on inputs and outputs are unique - both internally and in relation to each other + ''' + input_ids = [i.id for i in self.inputs] if self.inputs is not None else [] + output_ids = [o.id for o in self.outputs] if self.outputs is not None else [] + input_set = set(input_ids) + output_set = set(output_ids) + issues = [] + # verify ids are unique per list + if len(input_set) != len(input_ids): + issues.append(f"Input ids must be unique, but {[item for item, count in Counter(input_ids).items() if count > 1]} are not.") + if len(output_set) != len(output_ids): + issues.append(f"Output ids must be unique, but {[item for item, count in Counter(output_ids).items() if count > 1]} are not.") + # verify ids are unique between lists + intersection = input_set & output_set + if len(intersection) > 0: + issues.append(f"Ids must be unique between inputs and outputs, but {intersection} are not.") + if len(issues) > 0: + raise ValueError("\n".join(issues)) + + def finalize(self): + """Add hidden based on selected schema options, and give outputs without ids default ids.""" + # if is an api_node, will need key-related hidden + if self.is_api_node: + if self.hidden is None: + self.hidden = [] + if Hidden.auth_token_comfy_org not in self.hidden: + self.hidden.append(Hidden.auth_token_comfy_org) + if Hidden.api_key_comfy_org not in self.hidden: + self.hidden.append(Hidden.api_key_comfy_org) + # if is an 
output_node, will need prompt and extra_pnginfo + if self.is_output_node: + if self.hidden is None: + self.hidden = [] + if Hidden.prompt not in self.hidden: + self.hidden.append(Hidden.prompt) + if Hidden.extra_pnginfo not in self.hidden: + self.hidden.append(Hidden.extra_pnginfo) + # give outputs without ids default ids + if self.outputs is not None: + for i, output in enumerate(self.outputs): + if output.id is None: + output.id = f"_{i}_{output.io_type}_" + + def get_v1_info(self, cls) -> NodeInfoV1: + # get V1 inputs + input = { + "required": {} + } + if self.inputs: + for i in self.inputs: + if isinstance(i, DynamicInput): + dynamic_inputs = i.get_dynamic() + for d in dynamic_inputs: + add_to_dict_v1(d, input) + else: + add_to_dict_v1(i, input) + if self.hidden: + for hidden in self.hidden: + input.setdefault("hidden", {})[hidden.name] = (hidden.value,) + # create separate lists from output fields + output = [] + output_is_list = [] + output_name = [] + output_tooltips = [] + if self.outputs: + for o in self.outputs: + output.append(o.io_type) + output_is_list.append(o.is_output_list) + output_name.append(o.display_name if o.display_name else o.io_type) + output_tooltips.append(o.tooltip if o.tooltip else None) + + info = NodeInfoV1( + input=input, + input_order={key: list(value.keys()) for (key, value) in input.items()}, + output=output, + output_is_list=output_is_list, + output_name=output_name, + output_tooltips=output_tooltips, + name=self.node_id, + display_name=self.display_name, + category=self.category, + description=self.description, + output_node=self.is_output_node, + deprecated=self.is_deprecated, + experimental=self.is_experimental, + api_node=self.is_api_node, + python_module=getattr(cls, "RELATIVE_PYTHON_MODULE", "nodes") + ) + return info + + + def get_v3_info(self, cls) -> NodeInfoV3: + input_dict = {} + output_dict = {} + hidden_list = [] + # TODO: make sure dynamic types will be handled correctly + if self.inputs: + for input in self.inputs: + add_to_dict_v3(input, input_dict) + if self.outputs: + for output in self.outputs: + add_to_dict_v3(output, output_dict) + if self.hidden: + for hidden in self.hidden: + hidden_list.append(hidden.value) + + info = NodeInfoV3( + input=input_dict, + output=output_dict, + hidden=hidden_list, + name=self.node_id, + display_name=self.display_name, + description=self.description, + category=self.category, + output_node=self.is_output_node, + deprecated=self.is_deprecated, + experimental=self.is_experimental, + api_node=self.is_api_node, + python_module=getattr(cls, "RELATIVE_PYTHON_MODULE", "nodes") + ) + return info + + +def add_to_dict_v1(i: Input, input: dict): + key = "optional" if i.optional else "required" + as_dict = i.as_dict() + # for v1, we don't want to include the optional key + as_dict.pop("optional", None) + input.setdefault(key, {})[i.id] = (i.get_io_type(), as_dict) + +def add_to_dict_v3(io: Input | Output, d: dict): + d[io.id] = (io.get_io_type(), io.as_dict()) + + + +class _ComfyNodeBaseInternal(_ComfyNodeInternal): + """Common base class for storing internal methods and properties; DO NOT USE for defining nodes.""" + + RELATIVE_PYTHON_MODULE = None + SCHEMA = None + + # filled in during execution + resources: Resources = None + hidden: HiddenHolder = None + + @classmethod + @abstractmethod + def define_schema(cls) -> Schema: + """Override this function with one that returns a Schema instance.""" + raise NotImplementedError + + @classmethod + @abstractmethod + def execute(cls, **kwargs) -> NodeOutput: + """Override 
this function with one that performs node's actions.""" + raise NotImplementedError + + @classmethod + def validate_inputs(cls, **kwargs) -> bool: + """Optionally, define this function to validate inputs; equivalent to V1's VALIDATE_INPUTS.""" + raise NotImplementedError + + @classmethod + def fingerprint_inputs(cls, **kwargs) -> Any: + """Optionally, define this function to fingerprint inputs; equivalent to V1's IS_CHANGED.""" + raise NotImplementedError + + @classmethod + def check_lazy_status(cls, **kwargs) -> list[str]: + """Optionally, define this function to return a list of input names that should be evaluated. + + This basic mixin impl. requires all inputs. + + :kwargs: All node inputs will be included here. If the input is ``None``, it should be assumed that it has not yet been evaluated. \ + When using ``INPUT_IS_LIST = True``, unevaluated will instead be ``(None,)``. + + Params should match the nodes execution ``FUNCTION`` (self, and all inputs by name). + Will be executed repeatedly until it returns an empty list, or all requested items were already evaluated (and sent as params). + + Comfy Docs: https://docs.comfy.org/custom-nodes/backend/lazy_evaluation#defining-check-lazy-status + """ + return [name for name in kwargs if kwargs[name] is None] + + def __init__(self): + self.local_resources: ResourcesLocal = None + self.__class__.VALIDATE_CLASS() + + @classmethod + def GET_BASE_CLASS(cls): + return _ComfyNodeBaseInternal + + @final + @classmethod + def VALIDATE_CLASS(cls): + if first_real_override(cls, "define_schema") is None: + raise Exception(f"No define_schema function was defined for node class {cls.__name__}.") + if first_real_override(cls, "execute") is None: + raise Exception(f"No execute function was defined for node class {cls.__name__}.") + + @classproperty + def FUNCTION(cls): # noqa + if inspect.iscoroutinefunction(cls.execute): + return "EXECUTE_NORMALIZED_ASYNC" + return "EXECUTE_NORMALIZED" + + @final + @classmethod + def EXECUTE_NORMALIZED(cls, *args, **kwargs) -> NodeOutput: + to_return = cls.execute(*args, **kwargs) + if to_return is None: + to_return = NodeOutput() + elif isinstance(to_return, NodeOutput): + pass + elif isinstance(to_return, tuple): + to_return = NodeOutput(*to_return) + elif isinstance(to_return, dict): + to_return = NodeOutput.from_dict(to_return) + elif isinstance(to_return, ExecutionBlocker): + to_return = NodeOutput(block_execution=to_return.message) + else: + raise Exception(f"Invalid return type from node: {type(to_return)}") + if to_return.expand is not None and not cls.SCHEMA.enable_expand: + raise Exception(f"Node {cls.__name__} is not expandable, but expand included in NodeOutput; developer should set enable_expand=True on node's Schema to allow this.") + return to_return + + @final + @classmethod + async def EXECUTE_NORMALIZED_ASYNC(cls, *args, **kwargs) -> NodeOutput: + to_return = await cls.execute(*args, **kwargs) + if to_return is None: + to_return = NodeOutput() + elif isinstance(to_return, NodeOutput): + pass + elif isinstance(to_return, tuple): + to_return = NodeOutput(*to_return) + elif isinstance(to_return, dict): + to_return = NodeOutput.from_dict(to_return) + elif isinstance(to_return, ExecutionBlocker): + to_return = NodeOutput(block_execution=to_return.message) + else: + raise Exception(f"Invalid return type from node: {type(to_return)}") + if to_return.expand is not None and not cls.SCHEMA.enable_expand: + raise Exception(f"Node {cls.__name__} is not expandable, but expand included in NodeOutput; developer should 
set enable_expand=True on node's Schema to allow this.") + return to_return + + @final + @classmethod + def PREPARE_CLASS_CLONE(cls, hidden_inputs: dict) -> type[ComfyNode]: + """Creates clone of real node class to prevent monkey-patching.""" + c_type: type[ComfyNode] = cls if is_class(cls) else type(cls) + type_clone: type[ComfyNode] = shallow_clone_class(c_type) + # set hidden + type_clone.hidden = HiddenHolder.from_dict(hidden_inputs) + return type_clone + + @final + @classmethod + def GET_NODE_INFO_V3(cls) -> dict[str, Any]: + schema = cls.GET_SCHEMA() + info = schema.get_v3_info(cls) + return asdict(info) + ############################################# + # V1 Backwards Compatibility code + #-------------------------------------------- + @final + @classmethod + def GET_NODE_INFO_V1(cls) -> dict[str, Any]: + schema = cls.GET_SCHEMA() + info = schema.get_v1_info(cls) + return asdict(info) + + _DESCRIPTION = None + @final + @classproperty + def DESCRIPTION(cls): # noqa + if cls._DESCRIPTION is None: + cls.GET_SCHEMA() + return cls._DESCRIPTION + + _CATEGORY = None + @final + @classproperty + def CATEGORY(cls): # noqa + if cls._CATEGORY is None: + cls.GET_SCHEMA() + return cls._CATEGORY + + _EXPERIMENTAL = None + @final + @classproperty + def EXPERIMENTAL(cls): # noqa + if cls._EXPERIMENTAL is None: + cls.GET_SCHEMA() + return cls._EXPERIMENTAL + + _DEPRECATED = None + @final + @classproperty + def DEPRECATED(cls): # noqa + if cls._DEPRECATED is None: + cls.GET_SCHEMA() + return cls._DEPRECATED + + _API_NODE = None + @final + @classproperty + def API_NODE(cls): # noqa + if cls._API_NODE is None: + cls.GET_SCHEMA() + return cls._API_NODE + + _OUTPUT_NODE = None + @final + @classproperty + def OUTPUT_NODE(cls): # noqa + if cls._OUTPUT_NODE is None: + cls.GET_SCHEMA() + return cls._OUTPUT_NODE + + _INPUT_IS_LIST = None + @final + @classproperty + def INPUT_IS_LIST(cls): # noqa + if cls._INPUT_IS_LIST is None: + cls.GET_SCHEMA() + return cls._INPUT_IS_LIST + _OUTPUT_IS_LIST = None + + @final + @classproperty + def OUTPUT_IS_LIST(cls): # noqa + if cls._OUTPUT_IS_LIST is None: + cls.GET_SCHEMA() + return cls._OUTPUT_IS_LIST + + _RETURN_TYPES = None + @final + @classproperty + def RETURN_TYPES(cls): # noqa + if cls._RETURN_TYPES is None: + cls.GET_SCHEMA() + return cls._RETURN_TYPES + + _RETURN_NAMES = None + @final + @classproperty + def RETURN_NAMES(cls): # noqa + if cls._RETURN_NAMES is None: + cls.GET_SCHEMA() + return cls._RETURN_NAMES + + _OUTPUT_TOOLTIPS = None + @final + @classproperty + def OUTPUT_TOOLTIPS(cls): # noqa + if cls._OUTPUT_TOOLTIPS is None: + cls.GET_SCHEMA() + return cls._OUTPUT_TOOLTIPS + + _NOT_IDEMPOTENT = None + @final + @classproperty + def NOT_IDEMPOTENT(cls): # noqa + if cls._NOT_IDEMPOTENT is None: + cls.GET_SCHEMA() + return cls._NOT_IDEMPOTENT + + @final + @classmethod + def INPUT_TYPES(cls, include_hidden=True, return_schema=False) -> dict[str, dict] | tuple[dict[str, dict], Schema]: + schema = cls.FINALIZE_SCHEMA() + info = schema.get_v1_info(cls) + input = info.input + if not include_hidden: + input.pop("hidden", None) + if return_schema: + return input, schema + return input + + @final + @classmethod + def FINALIZE_SCHEMA(cls): + """Call define_schema and finalize it.""" + schema = cls.define_schema() + schema.finalize() + return schema + + @final + @classmethod + def GET_SCHEMA(cls) -> Schema: + """Validate node class, finalize schema, validate schema, and set expected class properties.""" + cls.VALIDATE_CLASS() + schema = cls.FINALIZE_SCHEMA() + 
schema.validate() + if cls._DESCRIPTION is None: + cls._DESCRIPTION = schema.description + if cls._CATEGORY is None: + cls._CATEGORY = schema.category + if cls._EXPERIMENTAL is None: + cls._EXPERIMENTAL = schema.is_experimental + if cls._DEPRECATED is None: + cls._DEPRECATED = schema.is_deprecated + if cls._API_NODE is None: + cls._API_NODE = schema.is_api_node + if cls._OUTPUT_NODE is None: + cls._OUTPUT_NODE = schema.is_output_node + if cls._INPUT_IS_LIST is None: + cls._INPUT_IS_LIST = schema.is_input_list + if cls._NOT_IDEMPOTENT is None: + cls._NOT_IDEMPOTENT = schema.not_idempotent + + if cls._RETURN_TYPES is None: + output = [] + output_name = [] + output_is_list = [] + output_tooltips = [] + if schema.outputs: + for o in schema.outputs: + output.append(o.io_type) + output_name.append(o.display_name if o.display_name else o.io_type) + output_is_list.append(o.is_output_list) + output_tooltips.append(o.tooltip if o.tooltip else None) + + cls._RETURN_TYPES = output + cls._RETURN_NAMES = output_name + cls._OUTPUT_IS_LIST = output_is_list + cls._OUTPUT_TOOLTIPS = output_tooltips + cls.SCHEMA = schema + return schema + #-------------------------------------------- + ############################################# + + +class ComfyNode(_ComfyNodeBaseInternal): + """Common base class for all V3 nodes.""" + + @classmethod + @abstractmethod + def define_schema(cls) -> Schema: + """Override this function with one that returns a Schema instance.""" + raise NotImplementedError + + @classmethod + @abstractmethod + def execute(cls, **kwargs) -> NodeOutput: + """Override this function with one that performs node's actions.""" + raise NotImplementedError + + @classmethod + def validate_inputs(cls, **kwargs) -> bool: + """Optionally, define this function to validate inputs; equivalent to V1's VALIDATE_INPUTS.""" + raise NotImplementedError + + @classmethod + def fingerprint_inputs(cls, **kwargs) -> Any: + """Optionally, define this function to fingerprint inputs; equivalent to V1's IS_CHANGED.""" + raise NotImplementedError + + @classmethod + def check_lazy_status(cls, **kwargs) -> list[str]: + """Optionally, define this function to return a list of input names that should be evaluated. + + This basic mixin impl. requires all inputs. + + :kwargs: All node inputs will be included here. If the input is ``None``, it should be assumed that it has not yet been evaluated. \ + When using ``INPUT_IS_LIST = True``, unevaluated will instead be ``(None,)``. + + Params should match the nodes execution ``FUNCTION`` (self, and all inputs by name). + Will be executed repeatedly until it returns an empty list, or all requested items were already evaluated (and sent as params). + + Comfy Docs: https://docs.comfy.org/custom-nodes/backend/lazy_evaluation#defining-check-lazy-status + """ + return [name for name in kwargs if kwargs[name] is None] + + @final + @classmethod + def GET_BASE_CLASS(cls): + """DO NOT override this class. Will break things in execution.py.""" + return ComfyNode + + +class NodeOutput(_NodeOutputInternal): + ''' + Standardized output of a node; can pass in any number of args and/or a UIOutput into 'ui' kwarg. 
+ ''' + def __init__(self, *args: Any, ui: _UIOutput | dict=None, expand: dict=None, block_execution: str=None): + self.args = args + self.ui = ui + self.expand = expand + self.block_execution = block_execution + + @property + def result(self): + return self.args if len(self.args) > 0 else None + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "NodeOutput": + args = () + ui = None + expand = None + if "result" in data: + result = data["result"] + if isinstance(result, ExecutionBlocker): + return cls(block_execution=result.message) + args = result + if "ui" in data: + ui = data["ui"] + if "expand" in data: + expand = data["expand"] + return cls(args=args, ui=ui, expand=expand) + + def __getitem__(self, index) -> Any: + return self.args[index] + +class _UIOutput(ABC): + def __init__(self): + pass + + @abstractmethod + def as_dict(self) -> dict: + ... + + +class _IO: + FolderType = FolderType + UploadType = UploadType + RemoteOptions = RemoteOptions + NumberDisplay = NumberDisplay + + comfytype = staticmethod(comfytype) + Custom = staticmethod(Custom) + Input = Input + WidgetInput = WidgetInput + Output = Output + ComfyTypeI = ComfyTypeI + ComfyTypeIO = ComfyTypeIO + #--------------------------------- + # Supported Types + Boolean = Boolean + Int = Int + Float = Float + String = String + Combo = Combo + MultiCombo = MultiCombo + Image = Image + WanCameraEmbedding = WanCameraEmbedding + Webcam = Webcam + Mask = Mask + Latent = Latent + Conditioning = Conditioning + Sampler = Sampler + Sigmas = Sigmas + Noise = Noise + Guider = Guider + Clip = Clip + ControlNet = ControlNet + Vae = Vae + Model = Model + ClipVision = ClipVision + ClipVisionOutput = ClipVisionOutput + StyleModel = StyleModel + Gligen = Gligen + UpscaleModel = UpscaleModel + Audio = Audio + Video = Video + SVG = SVG + LoraModel = LoraModel + LossMap = LossMap + Voxel = Voxel + Mesh = Mesh + Hooks = Hooks + HookKeyframes = HookKeyframes + TimestepsRange = TimestepsRange + LatentOperation = LatentOperation + FlowControl = FlowControl + Accumulation = Accumulation + Load3DCamera = Load3DCamera + Load3D = Load3D + Load3DAnimation = Load3DAnimation + Photomaker = Photomaker + Point = Point + FaceAnalysis = FaceAnalysis + BBOX = BBOX + SEGS = SEGS + AnyType = AnyType + MultiType = MultiType + #--------------------------------- + HiddenHolder = HiddenHolder + Hidden = Hidden + NodeInfoV1 = NodeInfoV1 + NodeInfoV3 = NodeInfoV3 + Schema = Schema + ComfyNode = ComfyNode + NodeOutput = NodeOutput + add_to_dict_v1 = staticmethod(add_to_dict_v1) + add_to_dict_v3 = staticmethod(add_to_dict_v3) diff --git a/comfy_api/latest/_resources.py b/comfy_api/latest/_resources.py new file mode 100644 index 000000000..a6bdda972 --- /dev/null +++ b/comfy_api/latest/_resources.py @@ -0,0 +1,72 @@ +from __future__ import annotations +import comfy.utils +import folder_paths +import logging +from abc import ABC, abstractmethod +from typing import Any +import torch + +class ResourceKey(ABC): + Type = Any + def __init__(self): + ... 
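+
+# Illustrative sketch (hypothetical folder and file names): a ResourceKey
+# identifies something a node wants loaded and cached on its behalf. Inside a
+# V3 node's execute(), a state dict might be requested through the `resources`
+# attribute that is filled in during execution, using the
+# TorchDictFolderFilename key defined below:
+#
+#     lora_sd = cls.resources.get(
+#         TorchDictFolderFilename("loras", "example_lora.safetensors")
+#     )
+#     # -> dict[str, torch.Tensor]; ResourcesLocal caches it for repeated requests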
+ +class TorchDictFolderFilename(ResourceKey): + '''Key for requesting a torch file via file_name from a folder category.''' + Type = dict[str, torch.Tensor] + def __init__(self, folder_name: str, file_name: str): + self.folder_name = folder_name + self.file_name = file_name + + def __hash__(self): + return hash((self.folder_name, self.file_name)) + + def __eq__(self, other: object) -> bool: + if not isinstance(other, TorchDictFolderFilename): + return False + return self.folder_name == other.folder_name and self.file_name == other.file_name + + def __str__(self): + return f"{self.folder_name} -> {self.file_name}" + +class Resources(ABC): + def __init__(self): + ... + + @abstractmethod + def get(self, key: ResourceKey, default: Any=...) -> Any: + pass + +class ResourcesLocal(Resources): + def __init__(self): + super().__init__() + self.local_resources: dict[ResourceKey, Any] = {} + + def get(self, key: ResourceKey, default: Any=...) -> Any: + cached = self.local_resources.get(key, None) + if cached is not None: + logging.info(f"Using cached resource '{key}'") + return cached + logging.info(f"Loading resource '{key}'") + to_return = None + if isinstance(key, TorchDictFolderFilename): + if default is ...: + to_return = comfy.utils.load_torch_file(folder_paths.get_full_path_or_raise(key.folder_name, key.file_name), safe_load=True) + else: + full_path = folder_paths.get_full_path(key.folder_name, key.file_name) + if full_path is not None: + to_return = comfy.utils.load_torch_file(full_path, safe_load=True) + + if to_return is not None: + self.local_resources[key] = to_return + return to_return + if default is not ...: + return default + raise Exception(f"Unsupported resource key type: {type(key)}") + + +class _RESOURCES: + ResourceKey = ResourceKey + TorchDictFolderFilename = TorchDictFolderFilename + Resources = Resources + ResourcesLocal = ResourcesLocal diff --git a/comfy_api/latest/_ui.py b/comfy_api/latest/_ui.py new file mode 100644 index 000000000..6b8a39d58 --- /dev/null +++ b/comfy_api/latest/_ui.py @@ -0,0 +1,457 @@ +from __future__ import annotations + +import json +import os +import random +from io import BytesIO +from typing import Type + +import av +import numpy as np +import torch +import torchaudio +from PIL import Image as PILImage +from PIL.PngImagePlugin import PngInfo + +import folder_paths + +# used for image preview +from comfy.cli_args import args +from comfy_api.latest._io import ComfyNode, FolderType, Image, _UIOutput + + +class SavedResult(dict): + def __init__(self, filename: str, subfolder: str, type: FolderType): + super().__init__(filename=filename, subfolder=subfolder,type=type.value) + + @property + def filename(self) -> str: + return self["filename"] + + @property + def subfolder(self) -> str: + return self["subfolder"] + + @property + def type(self) -> FolderType: + return FolderType(self["type"]) + + +class SavedImages(_UIOutput): + """A UI output class to represent one or more saved images, potentially animated.""" + def __init__(self, results: list[SavedResult], is_animated: bool = False): + super().__init__() + self.results = results + self.is_animated = is_animated + + def as_dict(self) -> dict: + data = {"images": self.results} + if self.is_animated: + data["animated"] = (True,) + return data + + +class SavedAudios(_UIOutput): + """UI wrapper around one or more audio files on disk (FLAC / MP3 / Opus).""" + def __init__(self, results: list[SavedResult]): + super().__init__() + self.results = results + + def as_dict(self) -> dict: + return {"audio": 
self.results} + + +def _get_directory_by_folder_type(folder_type: FolderType) -> str: + if folder_type == FolderType.input: + return folder_paths.get_input_directory() + if folder_type == FolderType.output: + return folder_paths.get_output_directory() + return folder_paths.get_temp_directory() + + +class ImageSaveHelper: + """A helper class with static methods to handle image saving and metadata.""" + + @staticmethod + def _convert_tensor_to_pil(image_tensor: torch.Tensor) -> PILImage.Image: + """Converts a single torch tensor to a PIL Image.""" + return PILImage.fromarray(np.clip(255.0 * image_tensor.cpu().numpy(), 0, 255).astype(np.uint8)) + + @staticmethod + def _create_png_metadata(cls: Type[ComfyNode] | None) -> PngInfo | None: + """Creates a PngInfo object with prompt and extra_pnginfo.""" + if args.disable_metadata or cls is None or not cls.hidden: + return None + metadata = PngInfo() + if cls.hidden.prompt: + metadata.add_text("prompt", json.dumps(cls.hidden.prompt)) + if cls.hidden.extra_pnginfo: + for x in cls.hidden.extra_pnginfo: + metadata.add_text(x, json.dumps(cls.hidden.extra_pnginfo[x])) + return metadata + + @staticmethod + def _create_animated_png_metadata(cls: Type[ComfyNode] | None) -> PngInfo | None: + """Creates a PngInfo object with prompt and extra_pnginfo for animated PNGs (APNG).""" + if args.disable_metadata or cls is None or not cls.hidden: + return None + metadata = PngInfo() + if cls.hidden.prompt: + metadata.add( + b"comf", + "prompt".encode("latin-1", "strict") + + b"\0" + + json.dumps(cls.hidden.prompt).encode("latin-1", "strict"), + after_idat=True, + ) + if cls.hidden.extra_pnginfo: + for x in cls.hidden.extra_pnginfo: + metadata.add( + b"comf", + x.encode("latin-1", "strict") + + b"\0" + + json.dumps(cls.hidden.extra_pnginfo[x]).encode("latin-1", "strict"), + after_idat=True, + ) + return metadata + + @staticmethod + def _create_webp_metadata(pil_image: PILImage.Image, cls: Type[ComfyNode] | None) -> PILImage.Exif: + """Creates EXIF metadata bytes for WebP images.""" + exif_data = pil_image.getexif() + if args.disable_metadata or cls is None or cls.hidden is None: + return exif_data + if cls.hidden.prompt is not None: + exif_data[0x0110] = "prompt:{}".format(json.dumps(cls.hidden.prompt)) # EXIF 0x0110 = Model + if cls.hidden.extra_pnginfo is not None: + inital_exif_tag = 0x010F # EXIF 0x010f = Make + for key, value in cls.hidden.extra_pnginfo.items(): + exif_data[inital_exif_tag] = "{}:{}".format(key, json.dumps(value)) + inital_exif_tag -= 1 + return exif_data + + @staticmethod + def save_images( + images, filename_prefix: str, folder_type: FolderType, cls: Type[ComfyNode] | None, compress_level = 4, + ) -> list[SavedResult]: + """Saves a batch of images as individual PNG files.""" + full_output_folder, filename, counter, subfolder, _ = folder_paths.get_save_image_path( + filename_prefix, _get_directory_by_folder_type(folder_type), images[0].shape[1], images[0].shape[0] + ) + results = [] + metadata = ImageSaveHelper._create_png_metadata(cls) + for batch_number, image_tensor in enumerate(images): + img = ImageSaveHelper._convert_tensor_to_pil(image_tensor) + filename_with_batch_num = filename.replace("%batch_num%", str(batch_number)) + file = f"{filename_with_batch_num}_{counter:05}_.png" + img.save(os.path.join(full_output_folder, file), pnginfo=metadata, compress_level=compress_level) + results.append(SavedResult(file, subfolder, folder_type)) + counter += 1 + return results + + @staticmethod + def get_save_images_ui(images, filename_prefix: str, 
cls: Type[ComfyNode] | None, compress_level=4) -> SavedImages: + """Saves a batch of images and returns a UI object for the node output.""" + return SavedImages( + ImageSaveHelper.save_images( + images, + filename_prefix=filename_prefix, + folder_type=FolderType.output, + cls=cls, + compress_level=compress_level, + ) + ) + + @staticmethod + def save_animated_png( + images, filename_prefix: str, folder_type: FolderType, cls: Type[ComfyNode] | None, fps: float, compress_level: int + ) -> SavedResult: + """Saves a batch of images as a single animated PNG.""" + full_output_folder, filename, counter, subfolder, _ = folder_paths.get_save_image_path( + filename_prefix, _get_directory_by_folder_type(folder_type), images[0].shape[1], images[0].shape[0] + ) + pil_images = [ImageSaveHelper._convert_tensor_to_pil(img) for img in images] + metadata = ImageSaveHelper._create_animated_png_metadata(cls) + file = f"{filename}_{counter:05}_.png" + save_path = os.path.join(full_output_folder, file) + pil_images[0].save( + save_path, + pnginfo=metadata, + compress_level=compress_level, + save_all=True, + duration=int(1000.0 / fps), + append_images=pil_images[1:], + ) + return SavedResult(file, subfolder, folder_type) + + @staticmethod + def get_save_animated_png_ui( + images, filename_prefix: str, cls: Type[ComfyNode] | None, fps: float, compress_level: int + ) -> SavedImages: + """Saves an animated PNG and returns a UI object for the node output.""" + result = ImageSaveHelper.save_animated_png( + images, + filename_prefix=filename_prefix, + folder_type=FolderType.output, + cls=cls, + fps=fps, + compress_level=compress_level, + ) + return SavedImages([result], is_animated=len(images) > 1) + + @staticmethod + def save_animated_webp( + images, + filename_prefix: str, + folder_type: FolderType, + cls: Type[ComfyNode] | None, + fps: float, + lossless: bool, + quality: int, + method: int, + ) -> SavedResult: + """Saves a batch of images as a single animated WebP.""" + full_output_folder, filename, counter, subfolder, _ = folder_paths.get_save_image_path( + filename_prefix, _get_directory_by_folder_type(folder_type), images[0].shape[1], images[0].shape[0] + ) + pil_images = [ImageSaveHelper._convert_tensor_to_pil(img) for img in images] + pil_exif = ImageSaveHelper._create_webp_metadata(pil_images[0], cls) + file = f"{filename}_{counter:05}_.webp" + pil_images[0].save( + os.path.join(full_output_folder, file), + save_all=True, + duration=int(1000.0 / fps), + append_images=pil_images[1:], + exif=pil_exif, + lossless=lossless, + quality=quality, + method=method, + ) + return SavedResult(file, subfolder, folder_type) + + @staticmethod + def get_save_animated_webp_ui( + images, + filename_prefix: str, + cls: Type[ComfyNode] | None, + fps: float, + lossless: bool, + quality: int, + method: int, + ) -> SavedImages: + """Saves an animated WebP and returns a UI object for the node output.""" + result = ImageSaveHelper.save_animated_webp( + images, + filename_prefix=filename_prefix, + folder_type=FolderType.output, + cls=cls, + fps=fps, + lossless=lossless, + quality=quality, + method=method, + ) + return SavedImages([result], is_animated=len(images) > 1) + + +class AudioSaveHelper: + """A helper class with static methods to handle audio saving and metadata.""" + _OPUS_RATES = [8000, 12000, 16000, 24000, 48000] + + @staticmethod + def save_audio( + audio: dict, + filename_prefix: str, + folder_type: FolderType, + cls: Type[ComfyNode] | None, + format: str = "flac", + quality: str = "128k", + ) -> list[SavedResult]: + 
full_output_folder, filename, counter, subfolder, _ = folder_paths.get_save_image_path( + filename_prefix, _get_directory_by_folder_type(folder_type) + ) + + metadata = {} + if not args.disable_metadata and cls is not None: + if cls.hidden.prompt is not None: + metadata["prompt"] = json.dumps(cls.hidden.prompt) + if cls.hidden.extra_pnginfo is not None: + for x in cls.hidden.extra_pnginfo: + metadata[x] = json.dumps(cls.hidden.extra_pnginfo[x]) + + results = [] + for batch_number, waveform in enumerate(audio["waveform"].cpu()): + filename_with_batch_num = filename.replace("%batch_num%", str(batch_number)) + file = f"{filename_with_batch_num}_{counter:05}_.{format}" + output_path = os.path.join(full_output_folder, file) + + # Use original sample rate initially + sample_rate = audio["sample_rate"] + + # Handle Opus sample rate requirements + if format == "opus": + if sample_rate > 48000: + sample_rate = 48000 + elif sample_rate not in AudioSaveHelper._OPUS_RATES: + # Find the next highest supported rate + for rate in sorted(AudioSaveHelper._OPUS_RATES): + if rate > sample_rate: + sample_rate = rate + break + if sample_rate not in AudioSaveHelper._OPUS_RATES: # Fallback if still not supported + sample_rate = 48000 + + # Resample if necessary + if sample_rate != audio["sample_rate"]: + waveform = torchaudio.functional.resample(waveform, audio["sample_rate"], sample_rate) + + # Create output with specified format + output_buffer = BytesIO() + output_container = av.open(output_buffer, mode="w", format=format) + + # Set metadata on the container + for key, value in metadata.items(): + output_container.metadata[key] = value + + # Set up the output stream with appropriate properties + if format == "opus": + out_stream = output_container.add_stream("libopus", rate=sample_rate) + if quality == "64k": + out_stream.bit_rate = 64000 + elif quality == "96k": + out_stream.bit_rate = 96000 + elif quality == "128k": + out_stream.bit_rate = 128000 + elif quality == "192k": + out_stream.bit_rate = 192000 + elif quality == "320k": + out_stream.bit_rate = 320000 + elif format == "mp3": + out_stream = output_container.add_stream("libmp3lame", rate=sample_rate) + if quality == "V0": + # TODO i would really love to support V3 and V5 but there doesn't seem to be a way to set the qscale level, the property below is a bool + out_stream.codec_context.qscale = 1 + elif quality == "128k": + out_stream.bit_rate = 128000 + elif quality == "320k": + out_stream.bit_rate = 320000 + else: # format == "flac": + out_stream = output_container.add_stream("flac", rate=sample_rate) + + frame = av.AudioFrame.from_ndarray( + waveform.movedim(0, 1).reshape(1, -1).float().numpy(), + format="flt", + layout="mono" if waveform.shape[0] == 1 else "stereo", + ) + frame.sample_rate = sample_rate + frame.pts = 0 + output_container.mux(out_stream.encode(frame)) + + # Flush encoder + output_container.mux(out_stream.encode(None)) + + # Close containers + output_container.close() + + # Write the output to file + output_buffer.seek(0) + with open(output_path, "wb") as f: + f.write(output_buffer.getbuffer()) + + results.append(SavedResult(file, subfolder, folder_type)) + counter += 1 + + return results + + @staticmethod + def get_save_audio_ui( + audio, filename_prefix: str, cls: Type[ComfyNode] | None, format: str = "flac", quality: str = "128k", + ) -> SavedAudios: + """Save and instantly wrap for UI.""" + return SavedAudios( + AudioSaveHelper.save_audio( + audio, + filename_prefix=filename_prefix, + folder_type=FolderType.output, + cls=cls, + 
format=format, + quality=quality, + ) + ) + + +class PreviewImage(_UIOutput): + def __init__(self, image: Image.Type, animated: bool = False, cls: Type[ComfyNode] = None, **kwargs): + self.values = ImageSaveHelper.save_images( + image, + filename_prefix="ComfyUI_temp_" + ''.join(random.choice("abcdefghijklmnopqrstupvxyz") for _ in range(5)), + folder_type=FolderType.temp, + cls=cls, + compress_level=1, + ) + self.animated = animated + + def as_dict(self): + return { + "images": self.values, + "animated": (self.animated,) + } + + +class PreviewMask(PreviewImage): + def __init__(self, mask: PreviewMask.Type, animated: bool=False, cls: ComfyNode=None, **kwargs): + preview = mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])).movedim(1, -1).expand(-1, -1, -1, 3) + super().__init__(preview, animated, cls, **kwargs) + + +class PreviewAudio(_UIOutput): + def __init__(self, audio: dict, cls: Type[ComfyNode] = None, **kwargs): + self.values = AudioSaveHelper.save_audio( + audio, + filename_prefix="ComfyUI_temp_" + "".join(random.choice("abcdefghijklmnopqrstuvwxyz") for _ in range(5)), + folder_type=FolderType.temp, + cls=cls, + format="flac", + quality="128k", + ) + + def as_dict(self) -> dict: + return {"audio": self.values} + + +class PreviewVideo(_UIOutput): + def __init__(self, values: list[SavedResult | dict], **kwargs): + self.values = values + + def as_dict(self): + return {"images": self.values, "animated": (True,)} + + +class PreviewUI3D(_UIOutput): + def __init__(self, model_file, camera_info, **kwargs): + self.model_file = model_file + self.camera_info = camera_info + + def as_dict(self): + return {"result": [self.model_file, self.camera_info]} + + +class PreviewText(_UIOutput): + def __init__(self, value: str, **kwargs): + self.value = value + + def as_dict(self): + return {"text": (self.value,)} + + +class _UI: + SavedResult = SavedResult + SavedImages = SavedImages + SavedAudios = SavedAudios + ImageSaveHelper = ImageSaveHelper + AudioSaveHelper = AudioSaveHelper + PreviewImage = PreviewImage + PreviewMask = PreviewMask + PreviewAudio = PreviewAudio + PreviewVideo = PreviewVideo + PreviewUI3D = PreviewUI3D + PreviewText = PreviewText diff --git a/comfy_api/v0_0_2/__init__.py b/comfy_api/v0_0_2/__init__.py index ea83833fb..de0f95001 100644 --- a/comfy_api/v0_0_2/__init__.py +++ b/comfy_api/v0_0_2/__init__.py @@ -6,6 +6,7 @@ from comfy_api.latest import ( ) from typing import Type, TYPE_CHECKING from comfy_api.internal.async_to_sync import create_sync_class +from comfy_api.latest import io, ui, ComfyExtension #noqa: F401 class ComfyAPIAdapter_v0_0_2(ComfyAPI_latest): @@ -40,4 +41,5 @@ __all__ = [ "Input", "InputImpl", "Types", + "ComfyExtension", ] diff --git a/comfy_execution/graph.py b/comfy_execution/graph.py index 60e2ab91e..f4b427265 100644 --- a/comfy_execution/graph.py +++ b/comfy_execution/graph.py @@ -4,9 +4,12 @@ from typing import Type, Literal import nodes import asyncio import inspect -from comfy_execution.graph_utils import is_link +from comfy_execution.graph_utils import is_link, ExecutionBlocker from comfy.comfy_types.node_typing import ComfyNodeABC, InputTypeDict, InputTypeOptions +# NOTE: ExecutionBlocker code got moved to graph_utils.py to prevent torch being imported too soon during unit tests +ExecutionBlocker = ExecutionBlocker + class DependencyCycleError(Exception): pass @@ -294,21 +297,3 @@ class ExecutionList(TopologicalSort): del blocked_by[node_id] to_remove = [node_id for node_id in blocked_by if len(blocked_by[node_id]) == 0] return 
list(blocked_by.keys()) - -class ExecutionBlocker: - """ - Return this from a node and any users will be blocked with the given error message. - If the message is None, execution will be blocked silently instead. - Generally, you should avoid using this functionality unless absolutely necessary. Whenever it's - possible, a lazy input will be more efficient and have a better user experience. - This functionality is useful in two cases: - 1. You want to conditionally prevent an output node from executing. (Particularly a built-in node - like SaveImage. For your own output nodes, I would recommend just adding a BOOL input and using - lazy evaluation to let it conditionally disable itself.) - 2. You have a node with multiple possible outputs, some of which are invalid and should not be used. - (I would recommend not making nodes like this in the future -- instead, make multiple nodes with - different outputs. Unfortunately, there are several popular existing nodes using this pattern.) - """ - def __init__(self, message): - self.message = message - diff --git a/comfy_execution/graph_utils.py b/comfy_execution/graph_utils.py index 8595e942d..496d2c634 100644 --- a/comfy_execution/graph_utils.py +++ b/comfy_execution/graph_utils.py @@ -137,3 +137,19 @@ def add_graph_prefix(graph, outputs, prefix): return new_graph, tuple(new_outputs) +class ExecutionBlocker: + """ + Return this from a node and any users will be blocked with the given error message. + If the message is None, execution will be blocked silently instead. + Generally, you should avoid using this functionality unless absolutely necessary. Whenever it's + possible, a lazy input will be more efficient and have a better user experience. + This functionality is useful in two cases: + 1. You want to conditionally prevent an output node from executing. (Particularly a built-in node + like SaveImage. For your own output nodes, I would recommend just adding a BOOL input and using + lazy evaluation to let it conditionally disable itself.) + 2. You have a node with multiple possible outputs, some of which are invalid and should not be used. + (I would recommend not making nodes like this in the future -- instead, make multiple nodes with + different outputs. Unfortunately, there are several popular existing nodes using this pattern.) 
+ """ + def __init__(self, message): + self.message = message diff --git a/execution.py b/execution.py index cde14c52f..952f0cc5c 100644 --- a/execution.py +++ b/execution.py @@ -32,6 +32,8 @@ from comfy_execution.graph_utils import GraphBuilder, is_link from comfy_execution.validation import validate_node_input from comfy_execution.progress import get_progress_state, reset_progress_state, add_progress_handler, WebUIProgressHandler from comfy_execution.utils import CurrentNodeContext +from comfy_api.internal import _ComfyNodeInternal, _NodeOutputInternal, first_real_override, is_class, make_locked_method_func +from comfy_api.latest import io class ExecutionResult(Enum): @@ -56,7 +58,15 @@ class IsChangedCache: node = self.dynprompt.get_node(node_id) class_type = node["class_type"] class_def = nodes.NODE_CLASS_MAPPINGS[class_type] - if not hasattr(class_def, "IS_CHANGED"): + has_is_changed = False + is_changed_name = None + if issubclass(class_def, _ComfyNodeInternal) and first_real_override(class_def, "fingerprint_inputs") is not None: + has_is_changed = True + is_changed_name = "fingerprint_inputs" + elif hasattr(class_def, "IS_CHANGED"): + has_is_changed = True + is_changed_name = "IS_CHANGED" + if not has_is_changed: self.is_changed[node_id] = False return self.is_changed[node_id] @@ -65,9 +75,9 @@ class IsChangedCache: return self.is_changed[node_id] # Intentionally do not use cached outputs here. We only want constants in IS_CHANGED - input_data_all, _ = get_input_data(node["inputs"], class_def, node_id, None) + input_data_all, _, hidden_inputs = get_input_data(node["inputs"], class_def, node_id, None) try: - is_changed = await _async_map_node_over_list(self.prompt_id, node_id, class_def, input_data_all, "IS_CHANGED") + is_changed = await _async_map_node_over_list(self.prompt_id, node_id, class_def, input_data_all, is_changed_name) is_changed = await resolve_map_node_over_list_results(is_changed) node["is_changed"] = [None if isinstance(x, ExecutionBlocker) else x for x in is_changed] except Exception as e: @@ -126,9 +136,14 @@ class CacheSet: SENSITIVE_EXTRA_DATA_KEYS = ("auth_token_comfy_org", "api_key_comfy_org") def get_input_data(inputs, class_def, unique_id, outputs=None, dynprompt=None, extra_data={}): - valid_inputs = class_def.INPUT_TYPES() + is_v3 = issubclass(class_def, _ComfyNodeInternal) + if is_v3: + valid_inputs, schema = class_def.INPUT_TYPES(include_hidden=False, return_schema=True) + else: + valid_inputs = class_def.INPUT_TYPES() input_data_all = {} missing_keys = {} + hidden_inputs_v3 = {} for x in inputs: input_data = inputs[x] _, input_category, input_info = get_input_info(class_def, x, valid_inputs) @@ -153,22 +168,37 @@ def get_input_data(inputs, class_def, unique_id, outputs=None, dynprompt=None, e elif input_category is not None: input_data_all[x] = [input_data] - if "hidden" in valid_inputs: - h = valid_inputs["hidden"] - for x in h: - if h[x] == "PROMPT": - input_data_all[x] = [dynprompt.get_original_prompt() if dynprompt is not None else {}] - if h[x] == "DYNPROMPT": - input_data_all[x] = [dynprompt] - if h[x] == "EXTRA_PNGINFO": - input_data_all[x] = [extra_data.get('extra_pnginfo', None)] - if h[x] == "UNIQUE_ID": - input_data_all[x] = [unique_id] - if h[x] == "AUTH_TOKEN_COMFY_ORG": - input_data_all[x] = [extra_data.get("auth_token_comfy_org", None)] - if h[x] == "API_KEY_COMFY_ORG": - input_data_all[x] = [extra_data.get("api_key_comfy_org", None)] - return input_data_all, missing_keys + if is_v3: + if schema.hidden: + if io.Hidden.prompt in 
schema.hidden: + hidden_inputs_v3[io.Hidden.prompt] = dynprompt.get_original_prompt() if dynprompt is not None else {} + if io.Hidden.dynprompt in schema.hidden: + hidden_inputs_v3[io.Hidden.dynprompt] = dynprompt + if io.Hidden.extra_pnginfo in schema.hidden: + hidden_inputs_v3[io.Hidden.extra_pnginfo] = extra_data.get('extra_pnginfo', None) + if io.Hidden.unique_id in schema.hidden: + hidden_inputs_v3[io.Hidden.unique_id] = unique_id + if io.Hidden.auth_token_comfy_org in schema.hidden: + hidden_inputs_v3[io.Hidden.auth_token_comfy_org] = extra_data.get("auth_token_comfy_org", None) + if io.Hidden.api_key_comfy_org in schema.hidden: + hidden_inputs_v3[io.Hidden.api_key_comfy_org] = extra_data.get("api_key_comfy_org", None) + else: + if "hidden" in valid_inputs: + h = valid_inputs["hidden"] + for x in h: + if h[x] == "PROMPT": + input_data_all[x] = [dynprompt.get_original_prompt() if dynprompt is not None else {}] + if h[x] == "DYNPROMPT": + input_data_all[x] = [dynprompt] + if h[x] == "EXTRA_PNGINFO": + input_data_all[x] = [extra_data.get('extra_pnginfo', None)] + if h[x] == "UNIQUE_ID": + input_data_all[x] = [unique_id] + if h[x] == "AUTH_TOKEN_COMFY_ORG": + input_data_all[x] = [extra_data.get("auth_token_comfy_org", None)] + if h[x] == "API_KEY_COMFY_ORG": + input_data_all[x] = [extra_data.get("api_key_comfy_org", None)] + return input_data_all, missing_keys, hidden_inputs_v3 map_node_over_list = None #Don't hook this please @@ -184,7 +214,7 @@ async def resolve_map_node_over_list_results(results): raise exc return [x.result() if isinstance(x, asyncio.Task) else x for x in results] -async def _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, func, allow_interrupt=False, execution_block_cb=None, pre_execute_cb=None): +async def _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, func, allow_interrupt=False, execution_block_cb=None, pre_execute_cb=None, hidden_inputs=None): # check if node wants the lists input_is_list = getattr(obj, "INPUT_IS_LIST", False) @@ -214,7 +244,22 @@ async def _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, f if execution_block is None: if pre_execute_cb is not None and index is not None: pre_execute_cb(index) - f = getattr(obj, func) + # V3 + if isinstance(obj, _ComfyNodeInternal) or (is_class(obj) and issubclass(obj, _ComfyNodeInternal)): + # if is just a class, then assign no resources or state, just create clone + if is_class(obj): + type_obj = obj + obj.VALIDATE_CLASS() + class_clone = obj.PREPARE_CLASS_CLONE(hidden_inputs) + # otherwise, use class instance to populate/reuse some fields + else: + type_obj = type(obj) + type_obj.VALIDATE_CLASS() + class_clone = type_obj.PREPARE_CLASS_CLONE(hidden_inputs) + f = make_locked_method_func(type_obj, func, class_clone) + # V1 + else: + f = getattr(obj, func) if inspect.iscoroutinefunction(f): async def async_wrapper(f, prompt_id, unique_id, list_index, args): with CurrentNodeContext(prompt_id, unique_id, list_index): @@ -266,8 +311,8 @@ def merge_result_data(results, obj): output.append([o[i] for o in results]) return output -async def get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=None, pre_execute_cb=None): - return_values = await _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, obj.FUNCTION, allow_interrupt=True, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb) +async def get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=None, pre_execute_cb=None, 
hidden_inputs=None): + return_values = await _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, obj.FUNCTION, allow_interrupt=True, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb, hidden_inputs=hidden_inputs) has_pending_task = any(isinstance(r, asyncio.Task) and not r.done() for r in return_values) if has_pending_task: return return_values, {}, False, has_pending_task @@ -298,6 +343,26 @@ def get_output_from_returns(return_values, obj): result = tuple([result] * len(obj.RETURN_TYPES)) results.append(result) subgraph_results.append((None, result)) + elif isinstance(r, _NodeOutputInternal): + # V3 + if r.ui is not None: + if isinstance(r.ui, dict): + uis.append(r.ui) + else: + uis.append(r.ui.as_dict()) + if r.expand is not None: + has_subgraph = True + new_graph = r.expand + result = r.result + if r.block_execution is not None: + result = tuple([ExecutionBlocker(r.block_execution)] * len(obj.RETURN_TYPES)) + subgraph_results.append((new_graph, result)) + elif r.result is not None: + result = r.result + if r.block_execution is not None: + result = tuple([ExecutionBlocker(r.block_execution)] * len(obj.RETURN_TYPES)) + results.append(result) + subgraph_results.append((None, result)) else: if isinstance(r, ExecutionBlocker): r = tuple([r] * len(obj.RETURN_TYPES)) @@ -381,7 +446,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, has_subgraph = False else: get_progress_state().start_progress(unique_id) - input_data_all, missing_keys = get_input_data(inputs, class_def, unique_id, caches.outputs, dynprompt, extra_data) + input_data_all, missing_keys, hidden_inputs = get_input_data(inputs, class_def, unique_id, caches.outputs, dynprompt, extra_data) if server.client_id is not None: server.last_node_id = display_node_id server.send_sync("executing", { "node": unique_id, "display_node": display_node_id, "prompt_id": prompt_id }, server.client_id) @@ -391,8 +456,12 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, obj = class_def() caches.objects.set(unique_id, obj) - if hasattr(obj, "check_lazy_status"): - required_inputs = await _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, "check_lazy_status", allow_interrupt=True) + if issubclass(class_def, _ComfyNodeInternal): + lazy_status_present = first_real_override(class_def, "check_lazy_status") is not None + else: + lazy_status_present = getattr(obj, "check_lazy_status", None) is not None + if lazy_status_present: + required_inputs = await _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, "check_lazy_status", allow_interrupt=True, hidden_inputs=hidden_inputs) required_inputs = await resolve_map_node_over_list_results(required_inputs) required_inputs = set(sum([r for r in required_inputs if isinstance(r,list)], [])) required_inputs = [x for x in required_inputs if isinstance(x,str) and ( @@ -424,7 +493,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, def pre_execute_cb(call_index): # TODO - How to handle this with async functions without contextvars (which requires Python 3.12)? 
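# [Editor's note, not part of the patch] The hunks above thread V3 hidden inputs through execution:
# get_input_data() now returns a third value, hidden_inputs_v3, keyed by io.Hidden members, and
# _async_map_node_over_list() passes that dict into PREPARE_CLASS_CLONE(), so a V3 node reads hidden
# values from its class clone rather than receiving them as extra keyword arguments. A rough sketch of
# the node-side view, assuming the cls.hidden access pattern used by the ui.py helpers earlier in this
# series (the method name and signature below are illustrative, not taken from the patch):
#
#     class ExampleNode(io.ComfyNode):
#         @classmethod
#         def execute(cls, image):
#             prompt = cls.hidden.prompt          # filled from io.Hidden.prompt
#             extra = cls.hidden.extra_pnginfo    # filled from io.Hidden.extra_pnginfo
#             ...
#
# V1 nodes are unaffected: "PROMPT" / "EXTRA_PNGINFO" and friends keep being injected into
# input_data_all exactly as before.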
GraphBuilder.set_default_prefix(unique_id, call_index, 0) - output_data, output_ui, has_subgraph, has_pending_tasks = await get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb) + output_data, output_ui, has_subgraph, has_pending_tasks = await get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb, hidden_inputs=hidden_inputs) if has_pending_tasks: pending_async_nodes[unique_id] = output_data unblock = execution_list.add_external_block(unique_id) @@ -672,8 +741,14 @@ async def validate_inputs(prompt_id, prompt, item, validated): validate_function_inputs = [] validate_has_kwargs = False - if hasattr(obj_class, "VALIDATE_INPUTS"): - argspec = inspect.getfullargspec(obj_class.VALIDATE_INPUTS) + if issubclass(obj_class, _ComfyNodeInternal): + validate_function_name = "validate_inputs" + validate_function = first_real_override(obj_class, validate_function_name) + else: + validate_function_name = "VALIDATE_INPUTS" + validate_function = getattr(obj_class, validate_function_name, None) + if validate_function is not None: + argspec = inspect.getfullargspec(validate_function) validate_function_inputs = argspec.args validate_has_kwargs = argspec.varkw is not None received_types = {} @@ -848,7 +923,7 @@ async def validate_inputs(prompt_id, prompt, item, validated): continue if len(validate_function_inputs) > 0 or validate_has_kwargs: - input_data_all, _ = get_input_data(inputs, obj_class, unique_id) + input_data_all, _, hidden_inputs = get_input_data(inputs, obj_class, unique_id) input_filtered = {} for x in input_data_all: if x in validate_function_inputs or validate_has_kwargs: @@ -856,8 +931,7 @@ async def validate_inputs(prompt_id, prompt, item, validated): if 'input_types' in validate_function_inputs: input_filtered['input_types'] = [received_types] - #ret = obj_class.VALIDATE_INPUTS(**input_filtered) - ret = await _async_map_node_over_list(prompt_id, unique_id, obj_class, input_filtered, "VALIDATE_INPUTS") + ret = await _async_map_node_over_list(prompt_id, unique_id, obj_class, input_filtered, validate_function_name, hidden_inputs=hidden_inputs) ret = await resolve_map_node_over_list_results(ret) for x in input_filtered: for i, r in enumerate(ret): diff --git a/nodes.py b/nodes.py index 54e530388..da4a46366 100644 --- a/nodes.py +++ b/nodes.py @@ -6,6 +6,7 @@ import os import sys import json import hashlib +import inspect import traceback import math import time @@ -29,6 +30,7 @@ import comfy.controlnet from comfy.comfy_types import IO, ComfyNodeABC, InputTypeDict, FileLocator from comfy_api.internal import register_versions, ComfyAPIWithVersion from comfy_api.version_list import supported_versions +from comfy_api.latest import io, ComfyExtension import comfy.clip_vision @@ -2152,6 +2154,7 @@ async def load_custom_node(module_path: str, ignore=set(), module_parent="custom if os.path.isdir(web_dir): EXTENSION_WEB_DIRS[module_name] = web_dir + # V1 node definition if hasattr(module, "NODE_CLASS_MAPPINGS") and getattr(module, "NODE_CLASS_MAPPINGS") is not None: for name, node_cls in module.NODE_CLASS_MAPPINGS.items(): if name not in ignore: @@ -2160,8 +2163,38 @@ async def load_custom_node(module_path: str, ignore=set(), module_parent="custom if hasattr(module, "NODE_DISPLAY_NAME_MAPPINGS") and getattr(module, "NODE_DISPLAY_NAME_MAPPINGS") is not None: NODE_DISPLAY_NAME_MAPPINGS.update(module.NODE_DISPLAY_NAME_MAPPINGS) return True + # V3 Extension 
Definition + elif hasattr(module, "comfy_entrypoint"): + entrypoint = getattr(module, "comfy_entrypoint") + if not callable(entrypoint): + logging.warning(f"comfy_entrypoint in {module_path} is not callable, skipping.") + return False + try: + if inspect.iscoroutinefunction(entrypoint): + extension = await entrypoint() + else: + extension = entrypoint() + if not isinstance(extension, ComfyExtension): + logging.warning(f"comfy_entrypoint in {module_path} did not return a ComfyExtension, skipping.") + return False + node_list = await extension.get_node_list() + if not isinstance(node_list, list): + logging.warning(f"comfy_entrypoint in {module_path} did not return a list of nodes, skipping.") + return False + for node_cls in node_list: + node_cls: io.ComfyNode + schema = node_cls.GET_SCHEMA() + if schema.node_id not in ignore: + NODE_CLASS_MAPPINGS[schema.node_id] = node_cls + node_cls.RELATIVE_PYTHON_MODULE = "{}.{}".format(module_parent, get_module_name(module_path)) + if schema.display_name is not None: + NODE_DISPLAY_NAME_MAPPINGS[schema.node_id] = schema.display_name + return True + except Exception as e: + logging.warning(f"Error while calling comfy_entrypoint in {module_path}: {e}") + return False else: - logging.warning(f"Skip {module_path} module for custom nodes due to the lack of NODE_CLASS_MAPPINGS.") + logging.warning(f"Skip {module_path} module for custom nodes due to the lack of NODE_CLASS_MAPPINGS or NODES_LIST (need one).") return False except Exception as e: logging.warning(traceback.format_exc()) @@ -2286,7 +2319,7 @@ async def init_builtin_extra_nodes(): "nodes_string.py", "nodes_camera_trajectory.py", "nodes_edit_model.py", - "nodes_tcfg.py" + "nodes_tcfg.py", ] import_failed = [] diff --git a/server.py b/server.py index 3e06d2fbb..0553a0dd7 100644 --- a/server.py +++ b/server.py @@ -30,6 +30,7 @@ from comfy_api import feature_flags import node_helpers from comfyui_version import __version__ from app.frontend_management import FrontendManager +from comfy_api.internal import _ComfyNodeInternal from app.user_manager import UserManager from app.model_manager import ModelFileManager @@ -591,6 +592,8 @@ class PromptServer(): def node_info(node_class): obj_class = nodes.NODE_CLASS_MAPPINGS[node_class] + if issubclass(obj_class, _ComfyNodeInternal): + return obj_class.GET_NODE_INFO_V1() info = {} info['input'] = obj_class.INPUT_TYPES() info['input_order'] = {key: list(value.keys()) for (key, value) in obj_class.INPUT_TYPES().items()} From 5ee381c058d606209dcafb568af20196e7884fc8 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 31 Jul 2025 20:33:27 -0700 Subject: [PATCH 083/243] Fix WanFirstLastFrameToVideo node when no clip vision. 
(#9134) --- comfy_extras/nodes_wan.py | 1 + 1 file changed, 1 insertion(+) diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 0b92c68ac..0067d054d 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -149,6 +149,7 @@ class WanFirstLastFrameToVideo: positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent_image, "concat_mask": mask}) negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent_image, "concat_mask": mask}) + clip_vision_output = None if clip_vision_start_image is not None: clip_vision_output = clip_vision_start_image From 4696d74305e98a96bda5685b7f11d6ba167c2ed3 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Fri, 1 Aug 2025 15:06:18 +0800 Subject: [PATCH 084/243] update template to 0.1.45 (#9135) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8f2f6a56c..3828c5b91 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.44 +comfyui-workflow-templates==0.1.45 comfyui-embedded-docs==0.2.4 torch torchsde From 1e638a140b2f459595fafc73ade5ea5b4024d4b4 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 1 Aug 2025 02:25:38 -0700 Subject: [PATCH 085/243] Tiny wan vae optimizations. (#9136) --- comfy/ldm/wan/vae.py | 13 +++++++++---- comfy/ldm/wan/vae2_2.py | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/comfy/ldm/wan/vae.py b/comfy/ldm/wan/vae.py index 6b07840fc..791596938 100644 --- a/comfy/ldm/wan/vae.py +++ b/comfy/ldm/wan/vae.py @@ -24,12 +24,17 @@ class CausalConv3d(ops.Conv3d): self.padding[1], 2 * self.padding[0], 0) self.padding = (0, 0, 0) - def forward(self, x, cache_x=None): + def forward(self, x, cache_x=None, cache_list=None, cache_idx=None): + if cache_list is not None: + cache_x = cache_list[cache_idx] + cache_list[cache_idx] = None + padding = list(self._padding) if cache_x is not None and self._padding[4] > 0: cache_x = cache_x.to(x.device) x = torch.cat([cache_x, x], dim=2) padding[4] -= cache_x.shape[2] + del cache_x x = F.pad(x, padding) return super().forward(x) @@ -166,7 +171,7 @@ class ResidualBlock(nn.Module): if in_dim != out_dim else nn.Identity() def forward(self, x, feat_cache=None, feat_idx=[0]): - h = self.shortcut(x) + old_x = x for layer in self.residual: if isinstance(layer, CausalConv3d) and feat_cache is not None: idx = feat_idx[0] @@ -178,12 +183,12 @@ class ResidualBlock(nn.Module): cache_x.device), cache_x ], dim=2) - x = layer(x, feat_cache[idx]) + x = layer(x, cache_list=feat_cache, cache_idx=idx) feat_cache[idx] = cache_x feat_idx[0] += 1 else: x = layer(x) - return x + h + return x + self.shortcut(old_x) class AttentionBlock(nn.Module): diff --git a/comfy/ldm/wan/vae2_2.py b/comfy/ldm/wan/vae2_2.py index b9c2d1a26..1f6d584a2 100644 --- a/comfy/ldm/wan/vae2_2.py +++ b/comfy/ldm/wan/vae2_2.py @@ -151,7 +151,7 @@ class ResidualBlock(nn.Module): ], dim=2, ) - x = layer(x, feat_cache[idx]) + x = layer(x, cache_list=feat_cache, cache_idx=idx) feat_cache[idx] = cache_x feat_idx[0] += 1 else: From bff60b5cfc10d1b037a95746226ac6698dc3e373 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Fri, 1 Aug 2025 20:03:22 -0400 Subject: [PATCH 086/243] ComfyUI version 0.3.48 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py 
b/comfyui_version.py index 20a2e892a..7b29e338d 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.47" +__version__ = "0.3.48" diff --git a/pyproject.toml b/pyproject.toml index 244fdd232..256677fad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.47" +version = "0.3.48" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From 3dfefc88d00bde744b729b073058a57e149cddc1 Mon Sep 17 00:00:00 2001 From: Johnpaul Chiwetelu <49923152+Myestery@users.noreply.github.com> Date: Sat, 2 Aug 2025 03:02:06 +0100 Subject: [PATCH 087/243] API for Recently Used Items (#8792) * feat: add file creation time to model file metadata and user file info * fix linting --- app/model_manager.py | 21 ++++++++++++++++----- app/user_manager.py | 4 +++- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/app/model_manager.py b/app/model_manager.py index 74d942fb8..ab36bca74 100644 --- a/app/model_manager.py +++ b/app/model_manager.py @@ -130,10 +130,21 @@ class ModelFileManager: for file_name in filenames: try: - relative_path = os.path.relpath(os.path.join(dirpath, file_name), directory) - result.append(relative_path) - except: - logging.warning(f"Warning: Unable to access {file_name}. Skipping this file.") + full_path = os.path.join(dirpath, file_name) + relative_path = os.path.relpath(full_path, directory) + + # Get file metadata + file_info = { + "name": relative_path, + "pathIndex": pathIndex, + "modified": os.path.getmtime(full_path), # Add modification time + "created": os.path.getctime(full_path), # Add creation time + "size": os.path.getsize(full_path) # Add file size + } + result.append(file_info) + + except Exception as e: + logging.warning(f"Warning: Unable to access {file_name}. Error: {e}. Skipping this file.") continue for d in subdirs: @@ -144,7 +155,7 @@ class ModelFileManager: logging.warning(f"Warning: Unable to access {path}. 
Skipping this path.") continue - return [{"name": f, "pathIndex": pathIndex} for f in result], dirs, time.perf_counter() + return result, dirs, time.perf_counter() def get_model_previews(self, filepath: str) -> list[str | BytesIO]: dirname = os.path.dirname(filepath) diff --git a/app/user_manager.py b/app/user_manager.py index d31da5b9b..0ec3e46ea 100644 --- a/app/user_manager.py +++ b/app/user_manager.py @@ -20,13 +20,15 @@ class FileInfo(TypedDict): path: str size: int modified: int + created: int def get_file_info(path: str, relative_to: str) -> FileInfo: return { "path": os.path.relpath(path, relative_to).replace(os.sep, '/'), "size": os.path.getsize(path), - "modified": os.path.getmtime(path) + "modified": os.path.getmtime(path), + "created": os.path.getctime(path) } From fbcc23945dc377c8623bbee6132f15a93ac0c84a Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Sun, 3 Aug 2025 02:15:29 +0800 Subject: [PATCH 088/243] Update template to 0.1.47 (#9153) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 3828c5b91..ffa7dce65 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.45 +comfyui-workflow-templates==0.1.47 comfyui-embedded-docs==0.2.4 torch torchsde From 5f582a97572e87ebfa655d379e8c8f7611c0249f Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 2 Aug 2025 12:00:13 -0700 Subject: [PATCH 089/243] Make sure all the conds are on the right device. (#9151) --- comfy/model_base.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/comfy/model_base.py b/comfy/model_base.py index 6b7978949..3ff8106d7 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -106,10 +106,12 @@ def model_sampling(model_config, model_type): return ModelSampling(model_config) -def convert_tensor(extra, dtype): +def convert_tensor(extra, dtype, device): if hasattr(extra, "dtype"): if extra.dtype != torch.int and extra.dtype != torch.long: - extra = extra.to(dtype) + extra = extra.to(dtype=dtype, device=device) + else: + extra = extra.to(device=device) return extra @@ -174,15 +176,16 @@ class BaseModel(torch.nn.Module): context = context.to(dtype) extra_conds = {} + device = xc.device for o in kwargs: extra = kwargs[o] if hasattr(extra, "dtype"): - extra = convert_tensor(extra, dtype) + extra = convert_tensor(extra, dtype, device) elif isinstance(extra, list): ex = [] for ext in extra: - ex.append(convert_tensor(ext, dtype)) + ex.append(convert_tensor(ext, dtype, device)) extra = ex extra_conds[o] = extra From 13aaa66ec21c397240a9b972d818430b39112588 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 2 Aug 2025 12:09:23 -0700 Subject: [PATCH 090/243] Make sure context is on the right device. 
(#9154) --- comfy/model_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comfy/model_base.py b/comfy/model_base.py index 3ff8106d7..4556ee138 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -171,12 +171,12 @@ class BaseModel(torch.nn.Module): dtype = self.manual_cast_dtype xc = xc.to(dtype) + device = xc.device t = self.model_sampling.timestep(t).float() if context is not None: - context = context.to(dtype) + context = context.to(dtype=dtype, device=device) extra_conds = {} - device = xc.device for o in kwargs: extra = kwargs[o] From aebac221937b511d46fe601656acdc753435b849 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 3 Aug 2025 04:08:11 -0700 Subject: [PATCH 091/243] Cleanup. (#9160) --- comfy/controlnet.py | 1 - 1 file changed, 1 deletion(-) diff --git a/comfy/controlnet.py b/comfy/controlnet.py index 9a47b86f2..6ed8bd756 100644 --- a/comfy/controlnet.py +++ b/comfy/controlnet.py @@ -43,7 +43,6 @@ if TYPE_CHECKING: def broadcast_image_to(tensor, target_batch_size, batched_number): current_batch_size = tensor.shape[0] - #print(current_batch_size, target_batch_size) if current_batch_size == 1: return tensor From 182f90b5eca2baa25474223759039925b286d562 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 4 Aug 2025 00:11:53 -0700 Subject: [PATCH 092/243] Lower cond vram use by casting at the same time as device transfer. (#9159) --- comfy/conds.py | 14 +++++++------- comfy/model_base.py | 6 +++--- comfy/samplers.py | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/comfy/conds.py b/comfy/conds.py index 2af2a43a3..f2564e7ef 100644 --- a/comfy/conds.py +++ b/comfy/conds.py @@ -10,8 +10,8 @@ class CONDRegular: def _copy_with(self, cond): return self.__class__(cond) - def process_cond(self, batch_size, device, **kwargs): - return self._copy_with(comfy.utils.repeat_to_batch_size(self.cond, batch_size).to(device)) + def process_cond(self, batch_size, **kwargs): + return self._copy_with(comfy.utils.repeat_to_batch_size(self.cond, batch_size)) def can_concat(self, other): if self.cond.shape != other.cond.shape: @@ -29,14 +29,14 @@ class CONDRegular: class CONDNoiseShape(CONDRegular): - def process_cond(self, batch_size, device, area, **kwargs): + def process_cond(self, batch_size, area, **kwargs): data = self.cond if area is not None: dims = len(area) // 2 for i in range(dims): data = data.narrow(i + 2, area[i + dims], area[i]) - return self._copy_with(comfy.utils.repeat_to_batch_size(data, batch_size).to(device)) + return self._copy_with(comfy.utils.repeat_to_batch_size(data, batch_size)) class CONDCrossAttn(CONDRegular): @@ -73,7 +73,7 @@ class CONDConstant(CONDRegular): def __init__(self, cond): self.cond = cond - def process_cond(self, batch_size, device, **kwargs): + def process_cond(self, batch_size, **kwargs): return self._copy_with(self.cond) def can_concat(self, other): @@ -92,10 +92,10 @@ class CONDList(CONDRegular): def __init__(self, cond): self.cond = cond - def process_cond(self, batch_size, device, **kwargs): + def process_cond(self, batch_size, **kwargs): out = [] for c in self.cond: - out.append(comfy.utils.repeat_to_batch_size(c, batch_size).to(device)) + out.append(comfy.utils.repeat_to_batch_size(c, batch_size)) return self._copy_with(out) diff --git a/comfy/model_base.py b/comfy/model_base.py index 4556ee138..3a9c031ea 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -109,9 +109,9 @@ 
def model_sampling(model_config, model_type): def convert_tensor(extra, dtype, device): if hasattr(extra, "dtype"): if extra.dtype != torch.int and extra.dtype != torch.long: - extra = extra.to(dtype=dtype, device=device) + extra = comfy.model_management.cast_to_device(extra, device, dtype) else: - extra = extra.to(device=device) + extra = comfy.model_management.cast_to_device(extra, device, None) return extra @@ -174,7 +174,7 @@ class BaseModel(torch.nn.Module): device = xc.device t = self.model_sampling.timestep(t).float() if context is not None: - context = context.to(dtype=dtype, device=device) + context = comfy.model_management.cast_to_device(context, device, dtype) extra_conds = {} for o in kwargs: diff --git a/comfy/samplers.py b/comfy/samplers.py index e93d2a315..ad2f40cdc 100644 --- a/comfy/samplers.py +++ b/comfy/samplers.py @@ -89,7 +89,7 @@ def get_area_and_mult(conds, x_in, timestep_in): conditioning = {} model_conds = conds["model_conds"] for c in model_conds: - conditioning[c] = model_conds[c].process_cond(batch_size=x_in.shape[0], device=x_in.device, area=area) + conditioning[c] = model_conds[c].process_cond(batch_size=x_in.shape[0], area=area) hooks = conds.get('hooks', None) control = conds.get('control', None) From 140ffc7fdc53e810030f060e421c1f528c2d2ab9 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 4 Aug 2025 00:28:12 -0700 Subject: [PATCH 093/243] Fix broken controlnet from last PR. (#9167) --- comfy/controlnet.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/comfy/controlnet.py b/comfy/controlnet.py index 6ed8bd756..988acdb57 100644 --- a/comfy/controlnet.py +++ b/comfy/controlnet.py @@ -28,6 +28,7 @@ import comfy.model_detection import comfy.model_patcher import comfy.ops import comfy.latent_formats +import comfy.model_base import comfy.cldm.cldm import comfy.t2i_adapter.adapter @@ -264,12 +265,12 @@ class ControlNet(ControlBase): for c in self.extra_conds: temp = cond.get(c, None) if temp is not None: - extra[c] = temp.to(dtype) + extra[c] = comfy.model_base.convert_tensor(temp, dtype, x_noisy.device) timestep = self.model_sampling_current.timestep(t) x_noisy = self.model_sampling_current.calculate_input(t, x_noisy) - control = self.control_model(x=x_noisy.to(dtype), hint=self.cond_hint, timesteps=timestep.to(dtype), context=context.to(dtype), **extra) + control = self.control_model(x=x_noisy.to(dtype), hint=self.cond_hint, timesteps=timestep.to(dtype), context=comfy.model_management.cast_to_device(context, x_noisy.device, dtype), **extra) return self.control_merge(control, control_prev, output_dtype=None) def copy(self): From 7991341e89cab521441641505ac4b0eea292a829 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 4 Aug 2025 01:02:40 -0700 Subject: [PATCH 094/243] Various fixes for broken things from earlier PR. 
(#9168) --- comfy/model_base.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/comfy/model_base.py b/comfy/model_base.py index 3a9c031ea..f9591f292 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -401,7 +401,7 @@ class SD21UNCLIP(BaseModel): unclip_conditioning = kwargs.get("unclip_conditioning", None) device = kwargs["device"] if unclip_conditioning is None: - return torch.zeros((1, self.adm_channels)) + return torch.zeros((1, self.adm_channels), device=device) else: return unclip_adm(unclip_conditioning, device, self.noise_augmentor, kwargs.get("unclip_noise_augment_merge", 0.05), kwargs.get("seed", 0) - 10) @@ -409,7 +409,7 @@ def sdxl_pooled(args, noise_augmentor): if "unclip_conditioning" in args: return unclip_adm(args.get("unclip_conditioning", None), args["device"], noise_augmentor, seed=args.get("seed", 0) - 10)[:,:1280] else: - return args["pooled_output"] + return args["pooled_output"].to(device=args["device"]) class SDXLRefiner(BaseModel): def __init__(self, model_config, model_type=ModelType.EPS, device=None): @@ -615,9 +615,11 @@ class IP2P: if image is None: image = torch.zeros_like(noise) + else: + image = image.to(device=device) if image.shape[1:] != noise.shape[1:]: - image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center") + image = utils.common_upscale(image, noise.shape[-1], noise.shape[-2], "bilinear", "center") image = utils.resize_to_batch_size(image, noise.shape[0]) return self.process_ip2p_image_in(image) @@ -696,7 +698,7 @@ class StableCascade_B(BaseModel): #size of prior doesn't really matter if zeros because it gets resized but I still want it to get batched prior = kwargs.get("stable_cascade_prior", torch.zeros((1, 16, (noise.shape[2] * 4) // 42, (noise.shape[3] * 4) // 42), dtype=noise.dtype, layout=noise.layout, device=noise.device)) - out["effnet"] = comfy.conds.CONDRegular(prior) + out["effnet"] = comfy.conds.CONDRegular(prior.to(device=noise.device)) out["sca"] = comfy.conds.CONDRegular(torch.zeros((1,))) return out @@ -1161,10 +1163,10 @@ class WAN21_Vace(WAN21): vace_frames_out = [] for j in range(len(vace_frames)): - vf = vace_frames[j].clone() + vf = vace_frames[j].to(device=noise.device, dtype=noise.dtype, copy=True) for i in range(0, vf.shape[1], 16): vf[:, i:i + 16] = self.process_latent_in(vf[:, i:i + 16]) - vf = torch.cat([vf, mask[j]], dim=1) + vf = torch.cat([vf, mask[j].to(device=noise.device, dtype=noise.dtype)], dim=1) vace_frames_out.append(vf) vace_frames = torch.stack(vace_frames_out, dim=1) From 84f9759424ccbd8de710960c79f0f1d28eef2776 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 4 Aug 2025 01:20:12 -0700 Subject: [PATCH 095/243] Add some warnings and prevent crash when cond devices don't match. 
(#9169) --- comfy/conds.py | 7 +++++++ comfy/model_base.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/comfy/conds.py b/comfy/conds.py index f2564e7ef..5af3e93ea 100644 --- a/comfy/conds.py +++ b/comfy/conds.py @@ -1,6 +1,7 @@ import torch import math import comfy.utils +import logging class CONDRegular: @@ -16,6 +17,9 @@ class CONDRegular: def can_concat(self, other): if self.cond.shape != other.cond.shape: return False + if self.cond.device != other.cond.device: + logging.warning("WARNING: conds not on same device, skipping concat.") + return False return True def concat(self, others): @@ -51,6 +55,9 @@ class CONDCrossAttn(CONDRegular): diff = mult_min // min(s1[1], s2[1]) if diff > 4: #arbitrary limit on the padding because it's probably going to impact performance negatively if it's too much return False + if self.cond.device != other.cond.device: + logging.warning("WARNING: conds not on same device: skipping concat.") + return False return True def concat(self, others): diff --git a/comfy/model_base.py b/comfy/model_base.py index f9591f292..2db81e244 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -409,7 +409,7 @@ def sdxl_pooled(args, noise_augmentor): if "unclip_conditioning" in args: return unclip_adm(args.get("unclip_conditioning", None), args["device"], noise_augmentor, seed=args.get("seed", 0) - 10)[:,:1280] else: - return args["pooled_output"].to(device=args["device"]) + return args["pooled_output"] class SDXLRefiner(BaseModel): def __init__(self, model_config, model_type=ModelType.EPS, device=None): From 03895dea7c4a6cc93fa362cd11ca450217d74b13 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 4 Aug 2025 01:33:04 -0700 Subject: [PATCH 096/243] Fix another issue with the PR. (#9170) --- comfy/model_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/model_base.py b/comfy/model_base.py index 2db81e244..a06686436 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -162,7 +162,7 @@ class BaseModel(torch.nn.Module): xc = self.model_sampling.calculate_input(sigma, x) if c_concat is not None: - xc = torch.cat([xc] + [c_concat], dim=1) + xc = torch.cat([xc] + [comfy.model_management.cast_to_device(c_concat, xc.device, xc.dtype)], dim=1) context = c_crossattn dtype = self.get_dtype() From c012400240d4867cd63a45220eb791b91ad47617 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 4 Aug 2025 19:53:25 -0700 Subject: [PATCH 097/243] Initial support for qwen image model. 
(#9179) --- comfy/ldm/qwen_image/model.py | 399 ++++++++++++++++++++++++++++++ comfy/model_base.py | 12 + comfy/model_detection.py | 7 +- comfy/sd.py | 12 +- comfy/supported_models.py | 32 ++- comfy/text_encoders/llama.py | 26 ++ comfy/text_encoders/qwen_image.py | 71 ++++++ nodes.py | 2 +- 8 files changed, 557 insertions(+), 4 deletions(-) create mode 100644 comfy/ldm/qwen_image/model.py create mode 100644 comfy/text_encoders/qwen_image.py diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py new file mode 100644 index 000000000..ff631a60f --- /dev/null +++ b/comfy/ldm/qwen_image/model.py @@ -0,0 +1,399 @@ +# https://github.com/QwenLM/Qwen-Image (Apache 2.0) +import torch +import torch.nn as nn +import torch.nn.functional as F +from typing import Optional, Tuple +from einops import repeat + +from comfy.ldm.lightricks.model import TimestepEmbedding, Timesteps +from comfy.ldm.modules.attention import optimized_attention_masked +from comfy.ldm.flux.layers import EmbedND + + +class GELU(nn.Module): + def __init__(self, dim_in: int, dim_out: int, approximate: str = "none", bias: bool = True, dtype=None, device=None, operations=None): + super().__init__() + self.proj = operations.Linear(dim_in, dim_out, bias=bias, dtype=dtype, device=device) + self.approximate = approximate + + def forward(self, hidden_states): + hidden_states = self.proj(hidden_states) + hidden_states = F.gelu(hidden_states, approximate=self.approximate) + return hidden_states + + +class FeedForward(nn.Module): + def __init__( + self, + dim: int, + dim_out: Optional[int] = None, + mult: int = 4, + dropout: float = 0.0, + inner_dim=None, + bias: bool = True, + dtype=None, device=None, operations=None + ): + super().__init__() + if inner_dim is None: + inner_dim = int(dim * mult) + dim_out = dim_out if dim_out is not None else dim + + self.net = nn.ModuleList([]) + self.net.append(GELU(dim, inner_dim, approximate="tanh", bias=bias, dtype=dtype, device=device, operations=operations)) + self.net.append(nn.Dropout(dropout)) + self.net.append(operations.Linear(inner_dim, dim_out, bias=bias, dtype=dtype, device=device)) + + def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor: + for module in self.net: + hidden_states = module(hidden_states) + return hidden_states + + +def apply_rotary_emb(x, freqs_cis): + if x.shape[1] == 0: + return x + + t_ = x.reshape(*x.shape[:-1], -1, 1, 2) + t_out = freqs_cis[..., 0] * t_[..., 0] + freqs_cis[..., 1] * t_[..., 1] + return t_out.reshape(*x.shape) + + +class QwenTimestepProjEmbeddings(nn.Module): + def __init__(self, embedding_dim, pooled_projection_dim, dtype=None, device=None, operations=None): + super().__init__() + self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0, scale=1000) + self.timestep_embedder = TimestepEmbedding( + in_channels=256, + time_embed_dim=embedding_dim, + dtype=dtype, + device=device, + operations=operations + ) + + def forward(self, timestep, hidden_states): + timesteps_proj = self.time_proj(timestep) + timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_states.dtype)) + return timesteps_emb + + +class Attention(nn.Module): + def __init__( + self, + query_dim: int, + dim_head: int = 64, + heads: int = 8, + dropout: float = 0.0, + bias: bool = False, + eps: float = 1e-5, + out_bias: bool = True, + out_dim: int = None, + out_context_dim: int = None, + dtype=None, + device=None, + operations=None + ): + super().__init__() + self.inner_dim = out_dim if out_dim is not None 
else dim_head * heads + self.inner_kv_dim = self.inner_dim + self.heads = heads + self.dim_head = dim_head + self.out_dim = out_dim if out_dim is not None else query_dim + self.out_context_dim = out_context_dim if out_context_dim is not None else query_dim + self.dropout = dropout + + # Q/K normalization + self.norm_q = operations.RMSNorm(dim_head, eps=eps, elementwise_affine=True, dtype=dtype, device=device) + self.norm_k = operations.RMSNorm(dim_head, eps=eps, elementwise_affine=True, dtype=dtype, device=device) + self.norm_added_q = operations.RMSNorm(dim_head, eps=eps, dtype=dtype, device=device) + self.norm_added_k = operations.RMSNorm(dim_head, eps=eps, dtype=dtype, device=device) + + # Image stream projections + self.to_q = operations.Linear(query_dim, self.inner_dim, bias=bias, dtype=dtype, device=device) + self.to_k = operations.Linear(query_dim, self.inner_kv_dim, bias=bias, dtype=dtype, device=device) + self.to_v = operations.Linear(query_dim, self.inner_kv_dim, bias=bias, dtype=dtype, device=device) + + # Text stream projections + self.add_q_proj = operations.Linear(query_dim, self.inner_dim, bias=bias, dtype=dtype, device=device) + self.add_k_proj = operations.Linear(query_dim, self.inner_kv_dim, bias=bias, dtype=dtype, device=device) + self.add_v_proj = operations.Linear(query_dim, self.inner_kv_dim, bias=bias, dtype=dtype, device=device) + + # Output projections + self.to_out = nn.ModuleList([ + operations.Linear(self.inner_dim, self.out_dim, bias=out_bias, dtype=dtype, device=device), + nn.Dropout(dropout) + ]) + self.to_add_out = operations.Linear(self.inner_dim, self.out_context_dim, bias=out_bias, dtype=dtype, device=device) + + def forward( + self, + hidden_states: torch.FloatTensor, # Image stream + encoder_hidden_states: torch.FloatTensor = None, # Text stream + encoder_hidden_states_mask: torch.FloatTensor = None, + attention_mask: Optional[torch.FloatTensor] = None, + image_rotary_emb: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + seq_txt = encoder_hidden_states.shape[1] + + img_query = self.to_q(hidden_states).unflatten(-1, (self.heads, -1)) + img_key = self.to_k(hidden_states).unflatten(-1, (self.heads, -1)) + img_value = self.to_v(hidden_states).unflatten(-1, (self.heads, -1)) + + txt_query = self.add_q_proj(encoder_hidden_states).unflatten(-1, (self.heads, -1)) + txt_key = self.add_k_proj(encoder_hidden_states).unflatten(-1, (self.heads, -1)) + txt_value = self.add_v_proj(encoder_hidden_states).unflatten(-1, (self.heads, -1)) + + img_query = self.norm_q(img_query) + img_key = self.norm_k(img_key) + txt_query = self.norm_added_q(txt_query) + txt_key = self.norm_added_k(txt_key) + + joint_query = torch.cat([txt_query, img_query], dim=1) + joint_key = torch.cat([txt_key, img_key], dim=1) + joint_value = torch.cat([txt_value, img_value], dim=1) + + joint_query = apply_rotary_emb(joint_query, image_rotary_emb) + joint_key = apply_rotary_emb(joint_key, image_rotary_emb) + + joint_query = joint_query.flatten(start_dim=2) + joint_key = joint_key.flatten(start_dim=2) + joint_value = joint_value.flatten(start_dim=2) + + joint_hidden_states = optimized_attention_masked(joint_query, joint_key, joint_value, self.heads, attention_mask) + + txt_attn_output = joint_hidden_states[:, :seq_txt, :] + img_attn_output = joint_hidden_states[:, seq_txt:, :] + + img_attn_output = self.to_out[0](img_attn_output) + img_attn_output = self.to_out[1](img_attn_output) + txt_attn_output = self.to_add_out(txt_attn_output) + + return img_attn_output, txt_attn_output 
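# [Editor's note, not part of the patch] Attention.forward above is a joint dual-stream attention:
# text and image tokens keep separate QKV projections and Q/K RMSNorm, are concatenated (text first)
# into one sequence, rotary embeddings are applied to the joint sequence, a single attention pass runs
# over it, and the output is split back at seq_txt with separate output projections per stream.
# Roughly, for batch b, text length Lt, image length Li, heads h, head dim d:
#
#     joint_q = torch.cat([txt_q, img_q], dim=1)            # (b, Lt + Li, h, d)
#     out = optimized_attention_masked(joint_q.flatten(2),  # (b, Lt + Li, h * d)
#                                      joint_k.flatten(2),
#                                      joint_v.flatten(2), h)
#     txt_out, img_out = out[:, :Lt], out[:, Lt:]
#
# so both streams attend to each other in one pass instead of two separate cross-attention calls.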
+ + +class QwenImageTransformerBlock(nn.Module): + def __init__( + self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + eps: float = 1e-6, + dtype=None, + device=None, + operations=None + ): + super().__init__() + self.dim = dim + self.num_attention_heads = num_attention_heads + self.attention_head_dim = attention_head_dim + + self.img_mod = nn.Sequential( + nn.SiLU(), + operations.Linear(dim, 6 * dim, bias=True, dtype=dtype, device=device), + ) + self.img_norm1 = operations.LayerNorm(dim, elementwise_affine=False, eps=eps, dtype=dtype, device=device) + self.img_norm2 = operations.LayerNorm(dim, elementwise_affine=False, eps=eps, dtype=dtype, device=device) + self.img_mlp = FeedForward(dim=dim, dim_out=dim, dtype=dtype, device=device, operations=operations) + + self.txt_mod = nn.Sequential( + nn.SiLU(), + operations.Linear(dim, 6 * dim, bias=True, dtype=dtype, device=device), + ) + self.txt_norm1 = operations.LayerNorm(dim, elementwise_affine=False, eps=eps, dtype=dtype, device=device) + self.txt_norm2 = operations.LayerNorm(dim, elementwise_affine=False, eps=eps, dtype=dtype, device=device) + self.txt_mlp = FeedForward(dim=dim, dim_out=dim, dtype=dtype, device=device, operations=operations) + + self.attn = Attention( + query_dim=dim, + dim_head=attention_head_dim, + heads=num_attention_heads, + out_dim=dim, + bias=True, + eps=eps, + dtype=dtype, + device=device, + operations=operations, + ) + + def _modulate(self, x, mod_params): + shift, scale, gate = mod_params.chunk(3, dim=-1) + return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1), gate.unsqueeze(1) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + encoder_hidden_states_mask: torch.Tensor, + temb: torch.Tensor, + image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + img_mod_params = self.img_mod(temb) + txt_mod_params = self.txt_mod(temb) + img_mod1, img_mod2 = img_mod_params.chunk(2, dim=-1) + txt_mod1, txt_mod2 = txt_mod_params.chunk(2, dim=-1) + + img_normed = self.img_norm1(hidden_states) + img_modulated, img_gate1 = self._modulate(img_normed, img_mod1) + txt_normed = self.txt_norm1(encoder_hidden_states) + txt_modulated, txt_gate1 = self._modulate(txt_normed, txt_mod1) + + img_attn_output, txt_attn_output = self.attn( + hidden_states=img_modulated, + encoder_hidden_states=txt_modulated, + encoder_hidden_states_mask=encoder_hidden_states_mask, + image_rotary_emb=image_rotary_emb, + ) + + hidden_states = hidden_states + img_gate1 * img_attn_output + encoder_hidden_states = encoder_hidden_states + txt_gate1 * txt_attn_output + + img_normed2 = self.img_norm2(hidden_states) + img_modulated2, img_gate2 = self._modulate(img_normed2, img_mod2) + hidden_states = hidden_states + img_gate2 * self.img_mlp(img_modulated2) + + txt_normed2 = self.txt_norm2(encoder_hidden_states) + txt_modulated2, txt_gate2 = self._modulate(txt_normed2, txt_mod2) + encoder_hidden_states = encoder_hidden_states + txt_gate2 * self.txt_mlp(txt_modulated2) + + return encoder_hidden_states, hidden_states + + +class LastLayer(nn.Module): + def __init__( + self, + embedding_dim: int, + conditioning_embedding_dim: int, + elementwise_affine=False, + eps=1e-6, + bias=True, + dtype=None, device=None, operations=None + ): + super().__init__() + self.silu = nn.SiLU() + self.linear = operations.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias, dtype=dtype, device=device) + self.norm = operations.LayerNorm(embedding_dim, eps, 
elementwise_affine=False, bias=bias, dtype=dtype, device=device) + + def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torch.Tensor: + emb = self.linear(self.silu(conditioning_embedding)) + scale, shift = torch.chunk(emb, 2, dim=1) + x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :] + return x + + +class QwenImageTransformer2DModel(nn.Module): + def __init__( + self, + patch_size: int = 2, + in_channels: int = 64, + out_channels: Optional[int] = 16, + num_layers: int = 60, + attention_head_dim: int = 128, + num_attention_heads: int = 24, + joint_attention_dim: int = 3584, + pooled_projection_dim: int = 768, + guidance_embeds: bool = False, + axes_dims_rope: Tuple[int, int, int] = (16, 56, 56), + image_model=None, + dtype=None, + device=None, + operations=None, + ): + super().__init__() + self.dtype = dtype + self.patch_size = patch_size + self.out_channels = out_channels or in_channels + self.inner_dim = num_attention_heads * attention_head_dim + + self.pe_embedder = EmbedND(dim=attention_head_dim, theta=10000, axes_dim=list(axes_dims_rope)) + + self.time_text_embed = QwenTimestepProjEmbeddings( + embedding_dim=self.inner_dim, + pooled_projection_dim=pooled_projection_dim, + dtype=dtype, + device=device, + operations=operations + ) + + self.txt_norm = operations.RMSNorm(joint_attention_dim, eps=1e-6, dtype=dtype, device=device) + self.img_in = operations.Linear(in_channels, self.inner_dim, dtype=dtype, device=device) + self.txt_in = operations.Linear(joint_attention_dim, self.inner_dim, dtype=dtype, device=device) + + self.transformer_blocks = nn.ModuleList([ + QwenImageTransformerBlock( + dim=self.inner_dim, + num_attention_heads=num_attention_heads, + attention_head_dim=attention_head_dim, + dtype=dtype, + device=device, + operations=operations + ) + for _ in range(num_layers) + ]) + + self.norm_out = LastLayer(self.inner_dim, self.inner_dim, dtype=dtype, device=device, operations=operations) + self.proj_out = operations.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True, dtype=dtype, device=device) + self.gradient_checkpointing = False + + def pos_embeds(self, x, context): + bs, c, t, h, w = x.shape + patch_size = self.patch_size + h_len = ((h + (patch_size // 2)) // patch_size) + w_len = ((w + (patch_size // 2)) // patch_size) + + img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype) + img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1) + img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) + img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs) + + txt_start = round(max(h_len, w_len)) + txt_ids = torch.linspace(txt_start, txt_start + context.shape[1], steps=context.shape[1], device=x.device, dtype=x.dtype).reshape(1, -1, 1).repeat(bs, 1, 3) + ids = torch.cat((txt_ids, img_ids), dim=1) + return self.pe_embedder(ids).squeeze(1).unsqueeze(2).to(x.dtype) + + def forward( + self, + x, + timesteps, + context, + attention_mask=None, + guidance: torch.Tensor = None, + **kwargs + ): + timestep = timesteps + encoder_hidden_states = context + encoder_hidden_states_mask = attention_mask + + image_rotary_emb = self.pos_embeds(x, context) + + orig_shape = x.shape + hidden_states = x.view(orig_shape[0], orig_shape[1], orig_shape[-2] // 2, 2, orig_shape[-1] // 2, 2) + hidden_states = hidden_states.permute(0, 2, 4, 1, 3, 5) + hidden_states = 
hidden_states.reshape(orig_shape[0], (orig_shape[-2] // 2) * (orig_shape[-1] // 2), orig_shape[1] * 4) + + hidden_states = self.img_in(hidden_states) + encoder_hidden_states = self.txt_norm(encoder_hidden_states) + encoder_hidden_states = self.txt_in(encoder_hidden_states) + + if guidance is not None: + guidance = guidance * 1000 + + temb = ( + self.time_text_embed(timestep, hidden_states) + if guidance is None + else self.time_text_embed(timestep, guidance, hidden_states) + ) + + for block in self.transformer_blocks: + encoder_hidden_states, hidden_states = block( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + encoder_hidden_states_mask=encoder_hidden_states_mask, + temb=temb, + image_rotary_emb=image_rotary_emb, + ) + + hidden_states = self.norm_out(hidden_states, temb) + hidden_states = self.proj_out(hidden_states) + + hidden_states = hidden_states.view(orig_shape[0], orig_shape[-2] // 2, orig_shape[-1] // 2, orig_shape[1], 2, 2) + hidden_states = hidden_states.permute(0, 3, 1, 4, 2, 5) + return hidden_states.reshape(orig_shape) diff --git a/comfy/model_base.py b/comfy/model_base.py index a06686436..8a2d9cbe6 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -42,6 +42,7 @@ import comfy.ldm.hidream.model import comfy.ldm.chroma.model import comfy.ldm.ace.model import comfy.ldm.omnigen.omnigen2 +import comfy.ldm.qwen_image.model import comfy.model_management import comfy.patcher_extension @@ -1308,3 +1309,14 @@ class Omnigen2(BaseModel): if ref_latents is not None: out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()), ref_latents)) // 16]) return out + +class QwenImage(BaseModel): + def __init__(self, model_config, model_type=ModelType.FLUX, device=None): + super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.qwen_image.model.QwenImageTransformer2DModel) + + def extra_conds(self, **kwargs): + out = super().extra_conds(**kwargs) + cross_attn = kwargs.get("cross_attn", None) + if cross_attn is not None: + out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) + return out diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 9fc1f42de..8b57ebd2f 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -481,6 +481,11 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["timestep_scale"] = 1000.0 return dit_config + if '{}txt_norm.weight'.format(key_prefix) in state_dict_keys: # Qwen Image + dit_config = {} + dit_config["image_model"] = "qwen_image" + return dit_config + if '{}input_blocks.0.0.weight'.format(key_prefix) not in state_dict_keys: return None @@ -867,7 +872,7 @@ def convert_diffusers_mmdit(state_dict, output_prefix=""): depth_single_blocks = count_blocks(state_dict, 'single_transformer_blocks.{}.') hidden_size = state_dict["x_embedder.bias"].shape[0] sd_map = comfy.utils.flux_to_diffusers({"depth": depth, "depth_single_blocks": depth_single_blocks, "hidden_size": hidden_size}, output_prefix=output_prefix) - elif 'transformer_blocks.0.attn.add_q_proj.weight' in state_dict: #SD3 + elif 'transformer_blocks.0.attn.add_q_proj.weight' in state_dict and 'pos_embed.proj.weight' in state_dict: #SD3 num_blocks = count_blocks(state_dict, 'transformer_blocks.{}.') depth = state_dict["pos_embed.proj.weight"].shape[0] // 64 sd_map = comfy.utils.mmdit_to_diffusers({"depth": depth, "num_blocks": num_blocks}, output_prefix=output_prefix) diff --git a/comfy/sd.py b/comfy/sd.py index e0498e585..bb5d61fb3 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ 
-47,6 +47,7 @@ import comfy.text_encoders.wan import comfy.text_encoders.hidream import comfy.text_encoders.ace import comfy.text_encoders.omnigen2 +import comfy.text_encoders.qwen_image import comfy.model_patcher import comfy.lora @@ -771,6 +772,7 @@ class CLIPType(Enum): CHROMA = 15 ACE = 16 OMNIGEN2 = 17 + QWEN_IMAGE = 18 def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}): @@ -791,6 +793,7 @@ class TEModel(Enum): T5_XXL_OLD = 8 GEMMA_2_2B = 9 QWEN25_3B = 10 + QWEN25_7B = 11 def detect_te_model(sd): if "text_model.encoder.layers.30.mlp.fc1.weight" in sd: @@ -812,7 +815,11 @@ def detect_te_model(sd): if 'model.layers.0.post_feedforward_layernorm.weight' in sd: return TEModel.GEMMA_2_2B if 'model.layers.0.self_attn.k_proj.bias' in sd: - return TEModel.QWEN25_3B + weight = sd['model.layers.0.self_attn.k_proj.bias'] + if weight.shape[0] == 256: + return TEModel.QWEN25_3B + if weight.shape[0] == 512: + return TEModel.QWEN25_7B if "model.layers.0.post_attention_layernorm.weight" in sd: return TEModel.LLAMA3_8 return None @@ -917,6 +924,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip elif te_model == TEModel.QWEN25_3B: clip_target.clip = comfy.text_encoders.omnigen2.te(**llama_detect(clip_data)) clip_target.tokenizer = comfy.text_encoders.omnigen2.Omnigen2Tokenizer + elif te_model == TEModel.QWEN25_7B: + clip_target.clip = comfy.text_encoders.qwen_image.te(**llama_detect(clip_data)) + clip_target.tokenizer = comfy.text_encoders.qwen_image.QwenImageTokenizer else: # clip_l if clip_type == CLIPType.SD3: diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 8f3f4652d..880055bd3 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -19,6 +19,7 @@ import comfy.text_encoders.lumina2 import comfy.text_encoders.wan import comfy.text_encoders.ace import comfy.text_encoders.omnigen2 +import comfy.text_encoders.qwen_image from . import supported_models_base from . 
import latent_formats @@ -1229,7 +1230,36 @@ class Omnigen2(supported_models_base.BASE): hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_3b.transformer.".format(pref)) return supported_models_base.ClipTarget(comfy.text_encoders.omnigen2.Omnigen2Tokenizer, comfy.text_encoders.omnigen2.te(**hunyuan_detect)) +class QwenImage(supported_models_base.BASE): + unet_config = { + "image_model": "qwen_image", + } -models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma, ACEStep, Omnigen2] + sampling_settings = { + "multiplier": 1.0, + "shift": 2.6, + } + + memory_usage_factor = 1.8 #TODO + + unet_extra_config = {} + latent_format = latent_formats.Wan21 + + supported_inference_dtypes = [torch.bfloat16, torch.float32] + + vae_key_prefix = ["vae."] + text_encoder_key_prefix = ["text_encoders."] + + def get_model(self, state_dict, prefix="", device=None): + out = model_base.QwenImage(self, device=device) + return out + + def clip_target(self, state_dict={}): + pref = self.text_encoder_key_prefix[0] + hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref)) + return supported_models_base.ClipTarget(comfy.text_encoders.qwen_image.QwenImageTokenizer, comfy.text_encoders.qwen_image.te(**hunyuan_detect)) + + +models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma, ACEStep, Omnigen2, QwenImage] models += [SVD_img2vid] diff --git a/comfy/text_encoders/llama.py b/comfy/text_encoders/llama.py index 7fbd0f604..1da6a0c94 100644 --- a/comfy/text_encoders/llama.py +++ b/comfy/text_encoders/llama.py @@ -43,6 +43,23 @@ class Qwen25_3BConfig: mlp_activation = "silu" qkv_bias = True +@dataclass +class Qwen25_7BVLI_Config: + vocab_size: int = 152064 + hidden_size: int = 3584 + intermediate_size: int = 18944 + num_hidden_layers: int = 28 + num_attention_heads: int = 28 + num_key_value_heads: int = 4 + max_position_embeddings: int = 128000 + rms_norm_eps: float = 1e-6 + rope_theta: float = 1000000.0 + transformer_type: str = "llama" + head_dim = 128 + rms_norm_add = False + mlp_activation = "silu" + qkv_bias = True + @dataclass class Gemma2_2B_Config: vocab_size: int = 256000 @@ -348,6 +365,15 @@ class Qwen25_3B(BaseLlama, torch.nn.Module): self.model = Llama2_(config, device=device, dtype=dtype, ops=operations) self.dtype = dtype +class Qwen25_7BVLI(BaseLlama, torch.nn.Module): + def __init__(self, config_dict, 
dtype, device, operations): + super().__init__() + config = Qwen25_7BVLI_Config(**config_dict) + self.num_layers = config.num_hidden_layers + + self.model = Llama2_(config, device=device, dtype=dtype, ops=operations) + self.dtype = dtype + class Gemma2_2B(BaseLlama, torch.nn.Module): def __init__(self, config_dict, dtype, device, operations): super().__init__() diff --git a/comfy/text_encoders/qwen_image.py b/comfy/text_encoders/qwen_image.py new file mode 100644 index 000000000..ce5c98097 --- /dev/null +++ b/comfy/text_encoders/qwen_image.py @@ -0,0 +1,71 @@ +from transformers import Qwen2Tokenizer +from comfy import sd1_clip +import comfy.text_encoders.llama +import os +import torch +import numbers + +class Qwen25_7BVLITokenizer(sd1_clip.SDTokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "qwen25_tokenizer") + super().__init__(tokenizer_path, pad_with_end=False, embedding_size=3584, embedding_key='qwen25_7b', tokenizer_class=Qwen2Tokenizer, has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, pad_token=151643, tokenizer_data=tokenizer_data) + + +class QwenImageTokenizer(sd1_clip.SD1Tokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="qwen25_7b", tokenizer=Qwen25_7BVLITokenizer) + self.llama_template = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" + + def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None,**kwargs): + if llama_template is None: + llama_text = self.llama_template.format(text) + else: + llama_text = llama_template.format(text) + return super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, **kwargs) + + +class Qwen25_7BVLIModel(sd1_clip.SDClipModel): + def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=True, model_options={}): + super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Qwen25_7BVLI, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options) + + +class QwenImageTEModel(sd1_clip.SD1ClipModel): + def __init__(self, device="cpu", dtype=None, model_options={}): + super().__init__(device=device, dtype=dtype, name="qwen25_7b", clip_model=Qwen25_7BVLIModel, model_options=model_options) + + def encode_token_weights(self, token_weight_pairs): + out, pooled, extra = super().encode_token_weights(token_weight_pairs) + tok_pairs = token_weight_pairs["qwen25_7b"][0] + count_im_start = 0 + for i, v in enumerate(tok_pairs): + elem = v[0] + if not torch.is_tensor(elem): + if isinstance(elem, numbers.Integral): + if elem == 151644 and count_im_start < 2: + template_end = i + count_im_start += 1 + + if out.shape[1] > (template_end + 3): + if tok_pairs[template_end + 1][0] == 872: + if tok_pairs[template_end + 2][0] == 198: + template_end += 3 + + out = out[:, template_end:] + + extra["attention_mask"] = extra["attention_mask"][:, template_end:] + if extra["attention_mask"].sum() == torch.numel(extra["attention_mask"]): + extra.pop("attention_mask") # 
attention mask is useless if no masked elements + + return out, pooled, extra + + +def te(dtype_llama=None, llama_scaled_fp8=None): + class QwenImageTEModel_(QwenImageTEModel): + def __init__(self, device="cpu", dtype=None, model_options={}): + if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options: + model_options = model_options.copy() + model_options["scaled_fp8"] = llama_scaled_fp8 + if dtype_llama is not None: + dtype = dtype_llama + super().__init__(device=device, dtype=dtype, model_options=model_options) + return QwenImageTEModel_ diff --git a/nodes.py b/nodes.py index da4a46366..9bedbcaca 100644 --- a/nodes.py +++ b/nodes.py @@ -925,7 +925,7 @@ class CLIPLoader: @classmethod def INPUT_TYPES(s): return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), ), - "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2"], ), + "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image"], ), }, "optional": { "device": (["default", "cpu"], {"advanced": True}), From f69609bbd6c20f4814e313f8974656b187a9bee2 Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Mon, 4 Aug 2025 22:52:25 -0700 Subject: [PATCH 098/243] Add Veo3 video generation node with audio support (#9110) - Create new Veo3VideoGenerationNode that extends VeoVideoGenerationNode - Add support for generateAudio parameter (only for Veo3 models) - Support new Veo3 models: veo-3.0-generate-001, veo-3.0-fast-generate-001 - Fix Veo3 duration constraint to 8 seconds only - Update original node to be clearly Veo 2 only - Update API paths to use model parameter: /proxy/veo/{model}/generate - Regenerate API types from staging to include generateAudio parameter - Fix TripoModelVersion enum reference after regeneration - Mark generated API types file in .gitattributes --- .gitattributes | 1 + comfy_api_nodes/apis/__init__.py | 2656 ++++++++++++++++++++++++++++- comfy_api_nodes/apis/tripo_api.py | 2 +- comfy_api_nodes/nodes_veo2.py | 98 +- 4 files changed, 2664 insertions(+), 93 deletions(-) diff --git a/.gitattributes b/.gitattributes index 4391de678..5b3c15bb4 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1,3 @@ /web/assets/** linguist-generated /web/** linguist-vendored +comfy_api_nodes/apis/__init__.py linguist-generated diff --git a/comfy_api_nodes/apis/__init__.py b/comfy_api_nodes/apis/__init__.py index 086028abe..54298e8a9 100644 --- a/comfy_api_nodes/apis/__init__.py +++ b/comfy_api_nodes/apis/__init__.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: filtered-openapi.yaml -# timestamp: 2025-07-06T09:47:31+00:00 +# timestamp: 2025-07-30T08:54:00+00:00 from __future__ import annotations @@ -37,6 +37,99 @@ class AuditLog(BaseModel): ) +class BFLAsyncResponse(BaseModel): + id: str = Field(..., title='Id') + polling_url: str = Field(..., title='Polling Url') + + +class BFLAsyncWebhookResponse(BaseModel): + id: str = Field(..., title='Id') + status: str = Field(..., title='Status') + webhook_url: str = Field(..., title='Webhook Url') + + +class CannyHighThreshold(RootModel[int]): + root: int = Field( + ..., + description='High threshold for Canny edge detection', + ge=0, + le=500, + title='Canny High Threshold', + ) + + +class CannyLowThreshold(RootModel[int]): + root: int = Field( + ..., + description='Low threshold for Canny edge detection', + 
ge=0, + le=500, + title='Canny Low Threshold', + ) + + +class Guidance(RootModel[float]): + root: float = Field( + ..., + description='Guidance strength for the image generation process', + ge=1.0, + le=100.0, + title='Guidance', + ) + + +class Steps(RootModel[int]): + root: int = Field( + ..., + description='Number of steps for the image generation process', + ge=15, + le=50, + title='Steps', + ) + + +class WebhookUrl(RootModel[AnyUrl]): + root: AnyUrl = Field( + ..., description='URL to receive webhook notifications', title='Webhook Url' + ) + + +class BFLFluxKontextMaxGenerateRequest(BaseModel): + guidance: Optional[float] = Field( + 3, description='The guidance scale for generation', ge=1.0, le=20.0 + ) + input_image: str = Field(..., description='Base64 encoded image to be edited') + prompt: str = Field( + ..., description='The text prompt describing what to edit on the image' + ) + steps: Optional[int] = Field( + 50, description='Number of inference steps', ge=1, le=50 + ) + + +class BFLFluxKontextMaxGenerateResponse(BaseModel): + id: str = Field(..., description='Job ID for tracking') + polling_url: str = Field(..., description='URL to poll for results') + + +class BFLFluxKontextProGenerateRequest(BaseModel): + guidance: Optional[float] = Field( + 3, description='The guidance scale for generation', ge=1.0, le=20.0 + ) + input_image: str = Field(..., description='Base64 encoded image to be edited') + prompt: str = Field( + ..., description='The text prompt describing what to edit on the image' + ) + steps: Optional[int] = Field( + 50, description='Number of inference steps', ge=1, le=50 + ) + + +class BFLFluxKontextProGenerateResponse(BaseModel): + id: str = Field(..., description='Job ID for tracking') + polling_url: str = Field(..., description='URL to poll for results') + + class OutputFormat(str, Enum): jpeg = 'jpeg' png = 'png' @@ -68,6 +161,67 @@ class BFLFluxPro11GenerateResponse(BaseModel): polling_url: str = Field(..., description='URL to poll for results') +class Bottom(RootModel[int]): + root: int = Field( + ..., + description='Number of pixels to expand at the bottom of the image', + ge=0, + le=2048, + title='Bottom', + ) + + +class Guidance2(RootModel[float]): + root: float = Field( + ..., + description='Guidance strength for the image generation process', + ge=1.5, + le=100.0, + title='Guidance', + ) + + +class Left(RootModel[int]): + root: int = Field( + ..., + description='Number of pixels to expand on the left side of the image', + ge=0, + le=2048, + title='Left', + ) + + +class Right(RootModel[int]): + root: int = Field( + ..., + description='Number of pixels to expand on the right side of the image', + ge=0, + le=2048, + title='Right', + ) + + +class Steps2(RootModel[int]): + root: int = Field( + ..., + description='Number of steps for the image generation process', + examples=[50], + ge=15, + le=50, + title='Steps', + ) + + +class Top(RootModel[int]): + root: int = Field( + ..., + description='Number of pixels to expand at the top of the image', + ge=0, + le=2048, + title='Top', + ) + + class BFLFluxProGenerateRequest(BaseModel): guidance_scale: Optional[float] = Field( None, description='The guidance scale for generation.', ge=1.0, le=20.0 @@ -96,7 +250,71 @@ class BFLFluxProGenerateResponse(BaseModel): polling_url: str = Field(..., description='URL to poll for the generation result.') +class BFLOutputFormat(str, Enum): + jpeg = 'jpeg' + png = 'png' + + +class BFLValidationError(BaseModel): + loc: List[Union[str, int]] = Field(..., title='Location') + msg: str = 
Field(..., title='Message') + type: str = Field(..., title='Error Type') + + class Status(str, Enum): + success = 'success' + not_found = 'not_found' + error = 'error' + + +class ClaimMyNodeRequest(BaseModel): + GH_TOKEN: str = Field( + ..., description='GitHub token to verify if the user owns the repo of the node' + ) + + +class ComfyNode(BaseModel): + category: Optional[str] = Field( + None, + description='UI category where the node is listed, used for grouping nodes.', + ) + comfy_node_name: Optional[str] = Field( + None, description='Unique identifier for the node' + ) + deprecated: Optional[bool] = Field( + None, + description='Indicates if the node is deprecated. Deprecated nodes are hidden in the UI.', + ) + description: Optional[str] = Field( + None, description="Brief description of the node's functionality or purpose." + ) + experimental: Optional[bool] = Field( + None, + description='Indicates if the node is experimental, subject to changes or removal.', + ) + function: Optional[str] = Field( + None, description='Name of the entry-point function to execute the node.' + ) + input_types: Optional[str] = Field(None, description='Defines input parameters') + output_is_list: Optional[List[bool]] = Field( + None, description='Boolean values indicating if each output is a list.' + ) + return_names: Optional[str] = Field( + None, description='Names of the outputs for clarity in workflows.' + ) + return_types: Optional[str] = Field( + None, description='Specifies the types of outputs produced by the node.' + ) + + +class ComfyNodeCloudBuildInfo(BaseModel): + build_id: Optional[str] = None + location: Optional[str] = None + project_id: Optional[str] = None + project_number: Optional[str] = None + + +class Status1(str, Enum): in_progress = 'in_progress' completed = 'completed' incomplete = 'incomplete' @@ -113,7 +331,7 @@ class ComputerToolCall(BaseModel): description='An identifier used when responding to the tool call with output.\n', ) id: str = Field(..., description='The unique ID of the computer call.') - status: Status = Field( + status: Status1 = Field( ..., description='The status of the item. One of `in_progress`, `completed`, or\n`incomplete`. Populated when items are returned via API.\n', ) @@ -156,6 +374,7 @@ class Customer(BaseModel): None, description='The date and time the user was created' ) email: Optional[str] = Field(None, description='The email address for this user') + has_fund: Optional[bool] = Field(None, description='Whether the user has funds') id: str = Field(..., description='The firebase UID of the user') is_admin: Optional[bool] = Field(None, description='Whether the user is an admin') metronome_id: Optional[str] = Field(None, description='The Metronome customer ID') @@ -194,6 +413,16 @@ class Type2(str, Enum): message = 'message' +class Error(BaseModel): + details: Optional[List[str]] = Field( + None, + description='Optional detailed information about the error or hints for resolving it.', + ) + message: Optional[str] = Field( + None, description='A clear and concise description of the error.' 
+ ) + + class ErrorResponse(BaseModel): error: str message: str @@ -221,7 +450,7 @@ class Result(BaseModel): ) -class Status1(str, Enum): +class Status2(str, Enum): in_progress = 'in_progress' searching = 'searching' completed = 'completed' @@ -241,7 +470,7 @@ class FileSearchToolCall(BaseModel): results: Optional[List[Result]] = Field( None, description='The results of the file search tool call.\n' ) - status: Status1 = Field( + status: Status2 = Field( ..., description='The status of the file search tool call. One of `in_progress`, \n`searching`, `incomplete` or `failed`,\n', ) @@ -266,7 +495,7 @@ class FunctionTool(BaseModel): type: Literal['FunctionTool'] = Field(..., description='The type of tool') -class Status2(str, Enum): +class Status3(str, Enum): in_progress = 'in_progress' completed = 'completed' incomplete = 'incomplete' @@ -288,7 +517,7 @@ class FunctionToolCall(BaseModel): None, description='The unique ID of the function tool call.\n' ) name: str = Field(..., description='The name of the function to run.\n') - status: Optional[Status2] = Field( + status: Optional[Status3] = Field( None, description='The status of the item. One of `in_progress`, `completed`, or\n`incomplete`. Populated when items are returned via API.\n', ) @@ -442,6 +671,95 @@ class GeminiVideoMetadata(BaseModel): startOffset: Optional[GeminiOffset] = None +class GitCommitSummary(BaseModel): + author: Optional[str] = Field(None, description='The author of the commit') + branch_name: Optional[str] = Field( + None, description='The branch where the commit was made' + ) + commit_hash: Optional[str] = Field(None, description='The hash of the commit') + commit_name: Optional[str] = Field(None, description='The name of the commit') + status_summary: Optional[Dict[str, str]] = Field( + None, description='A map of operating system to status pairs' + ) + timestamp: Optional[datetime] = Field( + None, description='The timestamp when the commit was made' + ) + + +class GithubEnterprise(BaseModel): + avatar_url: str = Field(..., description='URL to the enterprise avatar') + created_at: datetime = Field(..., description='When the enterprise was created') + description: Optional[str] = Field(None, description='The enterprise description') + html_url: str = Field(..., description='The HTML URL of the enterprise') + id: int = Field(..., description='The enterprise ID') + name: str = Field(..., description='The enterprise name') + node_id: str = Field(..., description='The enterprise node ID') + slug: str = Field(..., description='The enterprise slug') + updated_at: datetime = Field( + ..., description='When the enterprise was last updated' + ) + website_url: Optional[str] = Field(None, description='The enterprise website URL') + + +class RepositorySelection(str, Enum): + selected = 'selected' + all = 'all' + + +class GithubOrganization(BaseModel): + avatar_url: str = Field(..., description="URL to the organization's avatar") + description: Optional[str] = Field(None, description='The organization description') + events_url: str = Field(..., description="The API URL of the organization's events") + hooks_url: str = Field(..., description="The API URL of the organization's hooks") + id: int = Field(..., description='The organization ID') + issues_url: str = Field(..., description="The API URL of the organization's issues") + login: str = Field(..., description="The organization's login name") + members_url: str = Field( + ..., description="The API URL of the organization's members" + ) + node_id: str = Field(..., 
description='The organization node ID') + public_members_url: str = Field( + ..., description="The API URL of the organization's public members" + ) + repos_url: str = Field( + ..., description="The API URL of the organization's repositories" + ) + url: str = Field(..., description='The API URL of the organization') + + +class State(str, Enum): + uploaded = 'uploaded' + open = 'open' + + +class Action(str, Enum): + published = 'published' + unpublished = 'unpublished' + created = 'created' + edited = 'edited' + deleted = 'deleted' + prereleased = 'prereleased' + released = 'released' + + +class Type7(str, Enum): + Bot = 'Bot' + User = 'User' + Organization = 'Organization' + + +class GithubUser(BaseModel): + avatar_url: str = Field(..., description="URL to the user's avatar") + gravatar_id: Optional[str] = Field(None, description="The user's gravatar ID") + html_url: str = Field(..., description='The HTML URL of the user') + id: int = Field(..., description="The user's ID") + login: str = Field(..., description="The user's login name") + node_id: str = Field(..., description="The user's node ID") + site_admin: bool = Field(..., description='Whether the user is a site admin') + type: Type7 = Field(..., description='The type of user') + url: str = Field(..., description='The API URL of the user') + + class IdeogramColorPalette1(BaseModel): name: str = Field(..., description='Name of the preset color palette') @@ -689,7 +1007,7 @@ class Includable(str, Enum): computer_call_output_output_image_url = 'computer_call_output.output.image_url' -class Type7(str, Enum): +class Type8(str, Enum): input_file = 'input_file' @@ -703,7 +1021,7 @@ class InputFileContent(BaseModel): filename: Optional[str] = Field( None, description='The name of the file to be sent to the model.' ) - type: Type7 = Field( + type: Type8 = Field( ..., description='The type of the input item. Always `input_file`.' ) @@ -714,7 +1032,7 @@ class Detail(str, Enum): auto = 'auto' -class Type8(str, Enum): +class Type9(str, Enum): input_image = 'input_image' @@ -730,7 +1048,7 @@ class InputImageContent(BaseModel): None, description='The URL of the image to be sent to the model. A fully qualified URL or base64 encoded image in a data URL.', ) - type: Type8 = Field( + type: Type9 = Field( ..., description='The type of the input item. Always `input_image`.' ) @@ -741,17 +1059,17 @@ class Role3(str, Enum): developer = 'developer' -class Type9(str, Enum): +class Type10(str, Enum): message = 'message' -class Type10(str, Enum): +class Type11(str, Enum): input_text = 'input_text' class InputTextContent(BaseModel): text: str = Field(..., description='The text input to the model.') - type: Type10 = Field( + type: Type11 = Field( ..., description='The type of the input item. Always `input_text`.' 
) @@ -923,7 +1241,7 @@ class ResourcePackType(str, Enum): constant_period = 'constant_period' -class Status4(str, Enum): +class Status5(str, Enum): toBeOnline = 'toBeOnline' online = 'online' expired = 'expired' @@ -949,7 +1267,7 @@ class ResourcePackSubscribeInfo(BaseModel): None, description='Resource package type (decreasing_total=decreasing total, constant_period=constant periodicity)', ) - status: Optional[Status4] = Field(None, description='Resource Package Status') + status: Optional[Status5] = Field(None, description='Resource Package Status') total_quantity: Optional[float] = Field(None, description='Total quantity') @@ -1113,7 +1431,7 @@ class LumaError(BaseModel): detail: Optional[str] = Field(None, description='The error message') -class Type11(str, Enum): +class Type12(str, Enum): generation = 'generation' @@ -1153,7 +1471,7 @@ class LumaImageRef(BaseModel): ) -class Type12(str, Enum): +class Type13(str, Enum): image = 'image' @@ -1223,6 +1541,36 @@ class LumaVideoModelOutputResolution( root: Union[LumaVideoModelOutputResolution1, str] +class MachineStats(BaseModel): + cpu_capacity: Optional[str] = Field(None, description='Total CPU on the machine.') + disk_capacity: Optional[str] = Field( + None, description='Total disk capacity on the machine.' + ) + gpu_type: Optional[str] = Field( + None, description='The GPU type. eg. NVIDIA Tesla K80' + ) + initial_cpu: Optional[str] = Field( + None, description='Initial CPU available before the job starts.' + ) + initial_disk: Optional[str] = Field( + None, description='Initial disk available before the job starts.' + ) + initial_ram: Optional[str] = Field( + None, description='Initial RAM available before the job starts.' + ) + machine_name: Optional[str] = Field(None, description='Name of the machine.') + memory_capacity: Optional[str] = Field( + None, description='Total memory on the machine.' + ) + os_version: Optional[str] = Field( + None, description='The operating system version. eg. Ubuntu Linux 20.04' + ) + pip_freeze: Optional[str] = Field(None, description='The pip freeze output') + vram_time_series: Optional[Dict[str, Any]] = Field( + None, description='Time series of VRAM usage.' + ) + + class MinimaxBaseResponse(BaseModel): status_code: int = Field( ..., @@ -1251,7 +1599,7 @@ class MinimaxFileRetrieveResponse(BaseModel): file: File -class Status5(str, Enum): +class Status6(str, Enum): Queueing = 'Queueing' Preparing = 'Preparing' Processing = 'Processing' @@ -1265,7 +1613,7 @@ class MinimaxTaskResultResponse(BaseModel): None, description='After the task status changes to Success, this field returns the file ID corresponding to the generated video.', ) - status: Status5 = Field( + status: Status6 = Field( ..., description="Task status: 'Queueing' (in queue), 'Preparing' (task is preparing), 'Processing' (generating), 'Success' (task completed successfully), or 'Fail' (task failed).", ) @@ -1326,6 +1674,22 @@ class MinimaxVideoGenerationResponse(BaseModel): ) +class Modality(str, Enum): + MODALITY_UNSPECIFIED = 'MODALITY_UNSPECIFIED' + TEXT = 'TEXT' + IMAGE = 'IMAGE' + VIDEO = 'VIDEO' + AUDIO = 'AUDIO' + DOCUMENT = 'DOCUMENT' + + +class ModalityTokenCount(BaseModel): + modality: Optional[Modality] = None + tokenCount: Optional[int] = Field( + None, description='Number of tokens for the given modality.' 
+ ) + + class Truncation(str, Enum): disabled = 'disabled' auto = 'auto' @@ -1391,13 +1755,13 @@ class MoonvalleyTextToVideoInferenceParams(BaseModel): 0, description='Index of the conditioning frame' ) cooldown_steps: Optional[int] = Field( - None, description='Number of cooldown steps (calculated based on num_frames)' + 75, description='Number of cooldown steps (calculated based on num_frames)' ) fps: Optional[int] = Field( 24, description='Frames per second of the generated video' ) guidance_scale: Optional[float] = Field( - 12.5, description='Guidance scale for generation control' + 10, description='Guidance scale for generation control' ) height: Optional[int] = Field( 1080, description='Height of the generated video in pixels' @@ -1421,7 +1785,7 @@ class MoonvalleyTextToVideoInferenceParams(BaseModel): True, description='Whether to use timestep transformation' ) warmup_steps: Optional[int] = Field( - None, description='Number of warmup steps (calculated based on num_frames)' + 0, description='Number of warmup steps (calculated based on num_frames)' ) width: Optional[int] = Field( 1920, description='Width of the generated video in pixels' @@ -1463,10 +1827,10 @@ class MoonvalleyVideoToVideoInferenceParams(BaseModel): 0, description='Index of the conditioning frame' ) cooldown_steps: Optional[int] = Field( - None, description='Number of cooldown steps (calculated based on num_frames)' + 36, description='Number of cooldown steps (calculated based on num_frames)' ) guidance_scale: Optional[float] = Field( - 12.5, description='Guidance scale for generation control' + 15, description='Guidance scale for generation control' ) negative_prompt: Optional[str] = Field(None, description='Negative prompt text') seed: Optional[int] = Field( @@ -1486,7 +1850,7 @@ class MoonvalleyVideoToVideoInferenceParams(BaseModel): True, description='Whether to use timestep transformation' ) warmup_steps: Optional[int] = Field( - None, description='Number of warmup steps (calculated based on num_frames)' + 24, description='Number of warmup steps (calculated based on num_frames)' ) @@ -1507,6 +1871,34 @@ class MoonvalleyVideoToVideoRequest(BaseModel): ) +class NodeStatus(str, Enum): + NodeStatusActive = 'NodeStatusActive' + NodeStatusDeleted = 'NodeStatusDeleted' + NodeStatusBanned = 'NodeStatusBanned' + + +class NodeVersionIdentifier(BaseModel): + node_id: str = Field(..., description='The unique identifier of the node') + version: str = Field(..., description='The version of the node') + + +class NodeVersionStatus(str, Enum): + NodeVersionStatusActive = 'NodeVersionStatusActive' + NodeVersionStatusDeleted = 'NodeVersionStatusDeleted' + NodeVersionStatusBanned = 'NodeVersionStatusBanned' + NodeVersionStatusPending = 'NodeVersionStatusPending' + NodeVersionStatusFlagged = 'NodeVersionStatusFlagged' + + +class NodeVersionUpdateRequest(BaseModel): + changelog: Optional[str] = Field( + None, description='The changelog describing the version changes.' + ) + deprecated: Optional[bool] = Field( + None, description='Whether the version is deprecated.' 
+ ) + + class Moderation(str, Enum): low = 'low' auto = 'auto' @@ -1723,38 +2115,57 @@ class Object(str, Enum): response = 'response' -class Status6(str, Enum): +class Status7(str, Enum): completed = 'completed' failed = 'failed' in_progress = 'in_progress' incomplete = 'incomplete' -class Type13(str, Enum): +class Type14(str, Enum): output_audio = 'output_audio' class OutputAudioContent(BaseModel): data: str = Field(..., description='Base64-encoded audio data') transcript: str = Field(..., description='Transcript of the audio') - type: Type13 = Field(..., description='The type of output content') + type: Type14 = Field(..., description='The type of output content') class Role4(str, Enum): assistant = 'assistant' -class Type14(str, Enum): +class Type15(str, Enum): message = 'message' -class Type15(str, Enum): +class Type16(str, Enum): output_text = 'output_text' class OutputTextContent(BaseModel): text: str = Field(..., description='The text content') - type: Type15 = Field(..., description='The type of output content') + type: Type16 = Field(..., description='The type of output content') + + +class PersonalAccessToken(BaseModel): + createdAt: Optional[datetime] = Field( + None, description='[Output Only]The date and time the token was created.' + ) + description: Optional[str] = Field( + None, + description="Optional. A more detailed description of the token's intended use.", + ) + id: Optional[UUID] = Field(None, description='Unique identifier for the GitCommit') + name: Optional[str] = Field( + None, + description='Required. The name of the token. Can be a simple description.', + ) + token: Optional[str] = Field( + None, + description='[Output Only]. The personal access token. Only returned during creation.', + ) class AspectRatio1(RootModel[float]): @@ -1961,7 +2372,7 @@ class PixverseVideoResponse(BaseModel): Resp: Optional[Resp1] = None -class Status7(int, Enum): +class Status8(int, Enum): integer_1 = 1 integer_5 = 5 integer_6 = 6 @@ -1980,7 +2391,7 @@ class Resp2(BaseModel): resolution_ratio: Optional[int] = None seed: Optional[int] = None size: Optional[int] = None - status: Optional[Status7] = Field( + status: Optional[Status8] = Field( None, description='Video generation status codes:\n* 1 - Generation successful\n* 5 - Generating\n* 6 - Deleted\n* 7 - Contents moderation failed\n* 8 - Generation failed\n', ) @@ -1994,6 +2405,17 @@ class PixverseVideoResultResponse(BaseModel): Resp: Optional[Resp2] = None +class PublisherStatus(str, Enum): + PublisherStatusActive = 'PublisherStatusActive' + PublisherStatusBanned = 'PublisherStatusBanned' + + +class PublisherUser(BaseModel): + email: Optional[str] = Field(None, description='The email address for this user.') + id: Optional[str] = Field(None, description='The unique id for this user.') + name: Optional[str] = Field(None, description='The name for this user.') + + class RgbItem(RootModel[int]): root: int = Field(..., ge=0, le=255) @@ -2020,13 +2442,13 @@ class ReasoningEffort(str, Enum): high = 'high' -class Status8(str, Enum): +class Status9(str, Enum): in_progress = 'in_progress' completed = 'completed' incomplete = 'incomplete' -class Type16(str, Enum): +class Type17(str, Enum): summary_text = 'summary_text' @@ -2035,12 +2457,12 @@ class SummaryItem(BaseModel): ..., description='A short summary of the reasoning used by the model when generating\nthe response.\n', ) - type: Type16 = Field( + type: Type17 = Field( ..., description='The type of the object. 
Always `summary_text`.\n' ) -class Type17(str, Enum): +class Type18(str, Enum): reasoning = 'reasoning' @@ -2048,16 +2470,31 @@ class ReasoningItem(BaseModel): id: str = Field( ..., description='The unique identifier of the reasoning content.\n' ) - status: Optional[Status8] = Field( + status: Optional[Status9] = Field( None, description='The status of the item. One of `in_progress`, `completed`, or\n`incomplete`. Populated when items are returned via API.\n', ) summary: List[SummaryItem] = Field(..., description='Reasoning text contents.\n') - type: Type17 = Field( + type: Type18 = Field( ..., description='The type of the object. Always `reasoning`.\n' ) +class RecraftImageColor(BaseModel): + rgb: Optional[List[int]] = None + std: Optional[List[float]] = None + weight: Optional[float] = None + + +class RecraftImageFeatures(BaseModel): + nsfw_score: Optional[float] = None + + +class RecraftImageFormat(str, Enum): + webp = 'webp' + png = 'png' + + class Controls(BaseModel): artistic_level: Optional[int] = Field( None, @@ -2111,12 +2548,143 @@ class RecraftImageGenerationResponse(BaseModel): data: List[Datum3] = Field(..., description='Array of generated image information') +class RecraftImageStyle(str, Enum): + digital_illustration = 'digital_illustration' + icon = 'icon' + realistic_image = 'realistic_image' + vector_illustration = 'vector_illustration' + + +class RecraftImageSubStyle(str, Enum): + field_2d_art_poster = '2d_art_poster' + field_3d = '3d' + field_80s = '80s' + glow = 'glow' + grain = 'grain' + hand_drawn = 'hand_drawn' + infantile_sketch = 'infantile_sketch' + kawaii = 'kawaii' + pixel_art = 'pixel_art' + psychedelic = 'psychedelic' + seamless = 'seamless' + voxel = 'voxel' + watercolor = 'watercolor' + broken_line = 'broken_line' + colored_outline = 'colored_outline' + colored_shapes = 'colored_shapes' + colored_shapes_gradient = 'colored_shapes_gradient' + doodle_fill = 'doodle_fill' + doodle_offset_fill = 'doodle_offset_fill' + offset_fill = 'offset_fill' + outline = 'outline' + outline_gradient = 'outline_gradient' + uneven_fill = 'uneven_fill' + field_70s = '70s' + cartoon = 'cartoon' + doodle_line_art = 'doodle_line_art' + engraving = 'engraving' + flat_2 = 'flat_2' + kawaii_1 = 'kawaii' + line_art = 'line_art' + linocut = 'linocut' + seamless_1 = 'seamless' + b_and_w = 'b_and_w' + enterprise = 'enterprise' + hard_flash = 'hard_flash' + hdr = 'hdr' + motion_blur = 'motion_blur' + natural_light = 'natural_light' + studio_portrait = 'studio_portrait' + line_circuit = 'line_circuit' + field_2d_art_poster_2 = '2d_art_poster_2' + engraving_color = 'engraving_color' + flat_air_art = 'flat_air_art' + hand_drawn_outline = 'hand_drawn_outline' + handmade_3d = 'handmade_3d' + stickers_drawings = 'stickers_drawings' + plastic = 'plastic' + pictogram = 'pictogram' + + +class RecraftResponseFormat(str, Enum): + url = 'url' + b64_json = 'b64_json' + + +class RecraftTextLayoutItem(BaseModel): + bbox: List[List[float]] + text: str + + +class RecraftTransformModel(str, Enum): + refm1 = 'refm1' + recraft20b = 'recraft20b' + recraftv2 = 'recraftv2' + recraftv3 = 'recraftv3' + flux1_1pro = 'flux1_1pro' + flux1dev = 'flux1dev' + imagen3 = 'imagen3' + hidream_i1_dev = 'hidream_i1_dev' + + +class RecraftUserControls(BaseModel): + artistic_level: Optional[int] = None + background_color: Optional[RecraftImageColor] = None + colors: Optional[List[RecraftImageColor]] = None + no_text: Optional[bool] = None + + +class Attention(str, Enum): + low = 'low' + medium = 'medium' + high = 'high' + + 
+class Project(str, Enum): + comfyui = 'comfyui' + comfyui_frontend = 'comfyui_frontend' + desktop = 'desktop' + + +class ReleaseNote(BaseModel): + attention: Attention = Field( + ..., description='The attention level for this release' + ) + content: str = Field( + ..., description='The content of the release note in markdown format' + ) + id: int = Field(..., description='Unique identifier for the release note') + project: Project = Field( + ..., description='The project this release note belongs to' + ) + published_at: datetime = Field( + ..., description='When the release note was published' + ) + version: str = Field(..., description='The version of the release') + + class RenderingSpeed(str, Enum): BALANCED = 'BALANCED' TURBO = 'TURBO' QUALITY = 'QUALITY' +class Type19(str, Enum): + response_completed = 'response.completed' + + +class Type20(str, Enum): + response_content_part_added = 'response.content_part.added' + + +class Type21(str, Enum): + response_content_part_done = 'response.content_part.done' + + +class Type22(str, Enum): + response_created = 'response.created' + + class ResponseErrorCode(str, Enum): server_error = 'server_error' rate_limit_exceeded = 'rate_limit_exceeded' @@ -2138,12 +2706,27 @@ class ResponseErrorCode(str, Enum): image_file_not_found = 'image_file_not_found' -class Type18(str, Enum): +class Type23(str, Enum): + error = 'error' + + +class ResponseErrorEvent(BaseModel): + code: str = Field(..., description='The error code.\n') + message: str = Field(..., description='The error message.\n') + param: str = Field(..., description='The error parameter.\n') + type: Type23 = Field(..., description='The type of the event. Always `error`.\n') + + +class Type24(str, Enum): + response_failed = 'response.failed' + + +class Type25(str, Enum): json_object = 'json_object' class ResponseFormatJsonObject(BaseModel): - type: Type18 = Field( + type: Type25 = Field( ..., description='The type of response format being defined. Always `json_object`.', ) @@ -2156,16 +2739,32 @@ class ResponseFormatJsonSchemaSchema(BaseModel): ) -class Type19(str, Enum): +class Type26(str, Enum): text = 'text' class ResponseFormatText(BaseModel): - type: Type19 = Field( + type: Type26 = Field( ..., description='The type of response format being defined. Always `text`.' 
) +class Type27(str, Enum): + response_in_progress = 'response.in_progress' + + +class Type28(str, Enum): + response_incomplete = 'response.incomplete' + + +class Type29(str, Enum): + response_output_item_added = 'response.output_item.added' + + +class Type30(str, Enum): + response_output_item_done = 'response.output_item.done' + + class Truncation1(str, Enum): auto = 'auto' disabled = 'disabled' @@ -2200,10 +2799,6 @@ class Rodin3DCheckStatusRequest(BaseModel): ) -class Rodin3DCheckStatusResponse(BaseModel): - pass - - class Rodin3DDownloadRequest(BaseModel): task_uuid: str = Field(..., description='Task UUID') @@ -2235,6 +2830,13 @@ class RodinResourceItem(BaseModel): url: Optional[str] = Field(None, description='Download url') +class RodinStatusOptions(str, Enum): + Done = 'Done' + Failed = 'Failed' + Generating = 'Generating' + Waiting = 'Waiting' + + class RodinTierType(str, Enum): Regular = 'Regular' Sketch = 'Sketch' @@ -2325,6 +2927,7 @@ class RunwayTextToImageAspectRatioEnum(str, Enum): field_1808_768 = '1808:768' field_2112_912 = '2112:912' + class Model4(str, Enum): gen4_image = 'gen4_image' @@ -2350,6 +2953,38 @@ class RunwayTextToImageResponse(BaseModel): id: Optional[str] = Field(None, description='Task ID') +class Name(str, Enum): + content_moderation = 'content_moderation' + + +class StabilityContentModerationResponse(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new) you file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: Name = Field( + ..., + description='Our content moderation system has flagged some part of your request and subsequently denied it. You were not charged for this request. While this may at times be frustrating, it is necessary to maintain the integrity of our platform and ensure a safe experience for all users. 
If you would like to provide feedback, please use the [Support Form](https://kb.stability.ai/knowledge-base/kb-tickets/new).', + ) + + +class StabilityCreativity(RootModel[float]): + root: float = Field( + ..., + description='Controls the likelihood of creating additional details not heavily conditioned by the init image.', + ge=0.2, + le=0.5, + ) + + class StabilityError(BaseModel): errors: List[str] = Field( ..., @@ -2371,7 +3006,17 @@ class StabilityError(BaseModel): ) -class Status9(str, Enum): +class StabilityGenerationID(RootModel[str]): + root: str = Field( + ..., + description='The `id` of a generation, typically used for async generations, that can be used to check the status of the generation or retrieve the result.', + examples=['a6dc6c6e20acda010fe14d71f180658f2896ed9b4ec25aa99a6ff06c796987c4'], + max_length=64, + min_length=64, + ) + + +class Status10(str, Enum): in_progress = 'in-progress' @@ -2379,10 +3024,860 @@ class StabilityGetResultResponse202(BaseModel): id: Optional[str] = Field( None, description='The ID of the generation result.', examples=[1234567890] ) - status: Optional[Status9] = None + status: Optional[Status10] = None -class Type20(str, Enum): +class AspectRatio3(str, Enum): + field_21_9 = '21:9' + field_16_9 = '16:9' + field_3_2 = '3:2' + field_5_4 = '5:4' + field_1_1 = '1:1' + field_4_5 = '4:5' + field_2_3 = '2:3' + field_9_16 = '9:16' + field_9_21 = '9:21' + + +class Mode(str, Enum): + text_to_image = 'text-to-image' + image_to_image = 'image-to-image' + + +class Model5(str, Enum): + sd3_5_large = 'sd3.5-large' + sd3_5_large_turbo = 'sd3.5-large-turbo' + sd3_5_medium = 'sd3.5-medium' + + +class OutputFormat3(str, Enum): + png = 'png' + jpeg = 'jpeg' + + +class StylePreset(str, Enum): + enhance = 'enhance' + anime = 'anime' + photographic = 'photographic' + digital_art = 'digital-art' + comic_book = 'comic-book' + fantasy_art = 'fantasy-art' + line_art = 'line-art' + analog_film = 'analog-film' + neon_punk = 'neon-punk' + isometric = 'isometric' + low_poly = 'low-poly' + origami = 'origami' + modeling_compound = 'modeling-compound' + cinematic = 'cinematic' + field_3d_model = '3d-model' + pixel_art = 'pixel-art' + tile_texture = 'tile-texture' + + +class StabilityImageGenerationSD3Request(BaseModel): + aspect_ratio: Optional[AspectRatio3] = Field( + '1:1', + description='Controls the aspect ratio of the generated image. Defaults to 1:1.\n\n> **Important:** This parameter is only valid for **text-to-image** requests.', + ) + cfg_scale: Optional[float] = Field( + None, + description='How strictly the diffusion process adheres to the prompt text (higher values keep your image closer to your prompt). The _Large_ and _Medium_ models use a default of `4`. 
The _Turbo_ model uses a default of `1`.', + ge=1.0, + le=10.0, + ) + image: Optional[StrictBytes] = Field( + None, + description='The image to use as the starting point for the generation.\n\nSupported formats:\n\n\n\n - jpeg\n - png\n - webp\n\nSupported dimensions:\n\n\n\n - Every side must be at least 64 pixels\n\n> **Important:** This parameter is only valid for **image-to-image** requests.', + ) + mode: Optional[Mode] = Field( + 'text-to-image', + description='Controls whether this is a text-to-image or image-to-image generation, which affects which parameters are required:\n- **text-to-image** requires only the `prompt` parameter\n- **image-to-image** requires the `prompt`, `image`, and `strength` parameters', + title='GenerationMode', + ) + model: Optional[Model5] = Field( + 'sd3.5-large', + description='The model to use for generation.\n\n- `sd3.5-large` requires 6.5 credits per generation\n- `sd3.5-large-turbo` requires 4 credits per generation\n- `sd3.5-medium` requires 3.5 credits per generation\n- As of the April 17, 2025, `sd3-large`, `sd3-large-turbo` and `sd3-medium`\n\n\n\n are re-routed to their `sd3.5-[model version]` equivalent, at the same price.', + ) + negative_prompt: Optional[str] = Field( + None, + description='Keywords of what you **do not** wish to see in the output image.\nThis is an advanced feature.', + max_length=10000, + ) + output_format: Optional[OutputFormat3] = Field( + 'png', description='Dictates the `content-type` of the generated image.' + ) + prompt: str = Field( + ..., + description='What you wish to see in the output image. A strong, descriptive prompt that clearly defines\nelements, colors, and subjects will lead to better results.', + max_length=10000, + min_length=1, + ) + seed: Optional[float] = Field( + 0, + description="A specific value that is used to guide the 'randomness' of the generation. (Omit this parameter or pass `0` to use a random seed.)", + ge=0.0, + le=4294967294.0, + ) + strength: Optional[float] = Field( + None, + description='Sometimes referred to as _denoising_, this parameter controls how much influence the\n`image` parameter has on the generated image. A value of 0 would yield an image that\nis identical to the input. A value of 1 would be as if you passed in no image at all.\n\n> **Important:** This parameter is only valid for **image-to-image** requests.', + ge=0.0, + le=1.0, + ) + style_preset: Optional[StylePreset] = Field( + None, description='Guides the image model towards a particular style.' 
+ ) + + +class FinishReason(str, Enum): + SUCCESS = 'SUCCESS' + CONTENT_FILTERED = 'CONTENT_FILTERED' + + +class StabilityImageGenrationSD3Response200(BaseModel): + finish_reason: FinishReason = Field( + ..., + description='The reason the generation finished.\n\n- `SUCCESS` = successful generation.\n- `CONTENT_FILTERED` = successful generation, however the output violated our content moderation\npolicy and has been blurred as a result.', + examples=['SUCCESS'], + ) + image: str = Field( + ..., + description='The generated image, encoded to base64.', + examples=['AAAAIGZ0eXBpc29tAAACAGlzb21pc28yYXZjMW1...'], + ) + seed: Optional[float] = Field( + 0, + description='The seed used as random noise for this generation.', + examples=[343940597], + ge=0.0, + le=4294967294.0, + ) + + +class StabilityImageGenrationSD3Response400(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationSD3Response413(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationSD3Response422(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationSD3Response429(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. 
Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationSD3Response500(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class OutputFormat4(str, Enum): + jpeg = 'jpeg' + png = 'png' + webp = 'webp' + + +class StabilityImageGenrationUpscaleConservativeRequest(BaseModel): + creativity: Optional[StabilityCreativity] = Field( + default_factory=lambda: StabilityCreativity.model_validate(0.35) + ) + image: StrictBytes = Field( + ..., + description='The image you wish to upscale.\n\nSupported Formats:\n- jpeg\n- png\n- webp\n\nValidation Rules:\n- Every side must be at least 64 pixels\n- Total pixel count must be between 4,096 and 9,437,184 pixels\n- The aspect ratio must be between 1:2.5 and 2.5:1', + examples=['./some/image.png'], + ) + negative_prompt: Optional[str] = Field( + None, + description='A blurb of text describing what you **do not** wish to see in the output image.\nThis is an advanced feature.', + max_length=10000, + ) + output_format: Optional[OutputFormat4] = Field( + 'png', description='Dictates the `content-type` of the generated image.' + ) + prompt: str = Field( + ..., + description="What you wish to see in the output image. A strong, descriptive prompt that clearly defines\nelements, colors, and subjects will lead to better results.\n\nTo control the weight of a given word use the format `(word:weight)`,\nwhere `word` is the word you'd like to control the weight of and `weight`\nis a value between 0 and 1. For example: `The sky was a crisp (blue:0.3) and (green:0.8)`\nwould convey a sky that was blue and green, but more green than blue.", + max_length=10000, + min_length=1, + ) + seed: Optional[float] = Field( + 0, + description="A specific value that is used to guide the 'randomness' of the generation. 
(Omit this parameter or pass `0` to use a random seed.)", + ge=0.0, + le=4294967294.0, + ) + + +class StabilityImageGenrationUpscaleConservativeResponse200(BaseModel): + finish_reason: FinishReason = Field( + ..., + description='The reason the generation finished.\n\n- `SUCCESS` = successful generation.\n- `CONTENT_FILTERED` = successful generation, however the output violated our content moderation\npolicy and has been blurred as a result.', + examples=['SUCCESS'], + ) + image: str = Field( + ..., + description='The generated image, encoded to base64.', + examples=['AAAAIGZ0eXBpc29tAAACAGlzb21pc28yYXZjMW1...'], + ) + seed: Optional[float] = Field( + 0, + description='The seed used as random noise for this generation.', + examples=[343940597], + ge=0.0, + le=4294967294.0, + ) + + +class StabilityImageGenrationUpscaleConservativeResponse400(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationUpscaleConservativeResponse413(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationUpscaleConservativeResponse422(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationUpscaleConservativeResponse429(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. 
Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationUpscaleConservativeResponse500(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationUpscaleCreativeRequest(BaseModel): + creativity: Optional[float] = Field( + 0.3, + description='Indicates how creative the model should be when upscaling an image.\nHigher values will result in more details being added to the image during upscaling.', + ge=0.1, + le=0.5, + ) + image: StrictBytes = Field( + ..., + description='The image you wish to upscale.\n\nSupported Formats:\n- jpeg\n- png\n- webp\n\nValidation Rules:\n- Every side must be at least 64 pixels\n- Total pixel count must be between 4,096 and 1,048,576 pixels', + examples=['./some/image.png'], + ) + negative_prompt: Optional[str] = Field( + None, + description='A blurb of text describing what you **do not** wish to see in the output image.\nThis is an advanced feature.', + max_length=10000, + ) + output_format: Optional[OutputFormat4] = Field( + 'png', description='Dictates the `content-type` of the generated image.' + ) + prompt: str = Field( + ..., + description="What you wish to see in the output image. A strong, descriptive prompt that clearly defines\nelements, colors, and subjects will lead to better results.\n\nTo control the weight of a given word use the format `(word:weight)`,\nwhere `word` is the word you'd like to control the weight of and `weight`\nis a value between 0 and 1. For example: `The sky was a crisp (blue:0.3) and (green:0.8)`\nwould convey a sky that was blue and green, but more green than blue.", + max_length=10000, + min_length=1, + ) + seed: Optional[float] = Field( + 0, + description="A specific value that is used to guide the 'randomness' of the generation. (Omit this parameter or pass `0` to use a random seed.)", + ge=0.0, + le=4294967294.0, + ) + style_preset: Optional[StylePreset] = Field( + None, description='Guides the image model towards a particular style.' + ) + + +class StabilityImageGenrationUpscaleCreativeResponse200(BaseModel): + id: StabilityGenerationID + + +class StabilityImageGenrationUpscaleCreativeResponse400(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. 
Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationUpscaleCreativeResponse413(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationUpscaleCreativeResponse422(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationUpscaleCreativeResponse429(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationUpscaleCreativeResponse500(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. 
Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationUpscaleFastRequest(BaseModel): + image: StrictBytes = Field( + ..., + description='The image you wish to upscale.\n\nSupported Formats:\n- jpeg\n- png\n- webp\n\nValidation Rules:\n- Width must be between 32 and 1,536 pixels\n- Height must be between 32 and 1,536 pixels\n- Total pixel count must be between 1,024 and 1,048,576 pixels', + examples=['./some/image.png'], + ) + output_format: Optional[OutputFormat4] = Field( + 'png', description='Dictates the `content-type` of the generated image.' + ) + + +class StabilityImageGenrationUpscaleFastResponse200(BaseModel): + finish_reason: FinishReason = Field( + ..., + description='The reason the generation finished.\n\n- `SUCCESS` = successful generation.\n- `CONTENT_FILTERED` = successful generation, however the output violated our content moderation\npolicy and has been blurred as a result.', + examples=['SUCCESS'], + ) + image: str = Field( + ..., + description='The generated image, encoded to base64.', + examples=['AAAAIGZ0eXBpc29tAAACAGlzb21pc28yYXZjMW1...'], + ) + seed: Optional[float] = Field( + 0, + description='The seed used as random noise for this generation.', + examples=[343940597], + ge=0.0, + le=4294967294.0, + ) + + +class StabilityImageGenrationUpscaleFastResponse400(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationUpscaleFastResponse413(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationUpscaleFastResponse422(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. 
Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationUpscaleFastResponse429(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationUpscaleFastResponse500(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityStabilityClientID(RootModel[str]): + root: str = Field( + ..., + description='The name of your application, used to help us communicate app-specific debugging or moderation issues to you.', + examples=['my-awesome-app'], + max_length=256, + ) + + +class StabilityStabilityClientUserID(RootModel[str]): + root: str = Field( + ..., + description='A unique identifier for your end user. Used to help us communicate user-specific debugging or moderation issues to you. 
Feel free to obfuscate this value to protect user privacy.', + examples=['DiscordUser#9999'], + max_length=256, + ) + + +class StabilityStabilityClientVersion(RootModel[str]): + root: str = Field( + ..., + description='The version of your application, used to help us communicate version-specific debugging or moderation issues to you.', + examples=['1.2.1'], + max_length=256, + ) + + +class StorageFile(BaseModel): + file_path: Optional[str] = Field(None, description='Path to the file in storage') + id: Optional[UUID] = Field( + None, description='Unique identifier for the storage file' + ) + public_url: Optional[str] = Field(None, description='Public URL') + + +class StripeAddress(BaseModel): + city: Optional[str] = None + country: Optional[str] = None + line1: Optional[str] = None + line2: Optional[str] = None + postal_code: Optional[str] = None + state: Optional[str] = None + + +class StripeAmountDetails(BaseModel): + tip: Optional[Dict[str, Any]] = None + + +class StripeBillingDetails(BaseModel): + address: Optional[StripeAddress] = None + email: Optional[str] = None + name: Optional[str] = None + phone: Optional[str] = None + tax_id: Optional[Any] = None + + +class Checks(BaseModel): + address_line1_check: Optional[Any] = None + address_postal_code_check: Optional[Any] = None + cvc_check: Optional[str] = None + + +class ExtendedAuthorization(BaseModel): + status: Optional[str] = None + + +class IncrementalAuthorization(BaseModel): + status: Optional[str] = None + + +class Multicapture(BaseModel): + status: Optional[str] = None + + +class NetworkToken(BaseModel): + used: Optional[bool] = None + + +class Overcapture(BaseModel): + maximum_amount_capturable: Optional[int] = None + status: Optional[str] = None + + +class StripeCardDetails(BaseModel): + amount_authorized: Optional[int] = None + authorization_code: Optional[Any] = None + brand: Optional[str] = None + checks: Optional[Checks] = None + country: Optional[str] = None + exp_month: Optional[int] = None + exp_year: Optional[int] = None + extended_authorization: Optional[ExtendedAuthorization] = None + fingerprint: Optional[str] = None + funding: Optional[str] = None + incremental_authorization: Optional[IncrementalAuthorization] = None + installments: Optional[Any] = None + last4: Optional[str] = None + mandate: Optional[Any] = None + multicapture: Optional[Multicapture] = None + network: Optional[str] = None + network_token: Optional[NetworkToken] = None + network_transaction_id: Optional[str] = None + overcapture: Optional[Overcapture] = None + regulated_status: Optional[str] = None + three_d_secure: Optional[Any] = None + wallet: Optional[Any] = None + + +class Object1(str, Enum): + charge = 'charge' + + +class Object2(str, Enum): + event = 'event' + + +class Type31(str, Enum): + payment_intent_succeeded = 'payment_intent.succeeded' + + +class StripeOutcome(BaseModel): + advice_code: Optional[Any] = None + network_advice_code: Optional[Any] = None + network_decline_code: Optional[Any] = None + network_status: Optional[str] = None + reason: Optional[Any] = None + risk_level: Optional[str] = None + risk_score: Optional[int] = None + seller_message: Optional[str] = None + type: Optional[str] = None + + +class Object3(str, Enum): + payment_intent = 'payment_intent' + + +class StripePaymentMethodDetails(BaseModel): + card: Optional[StripeCardDetails] = None + type: Optional[str] = None + + +class Card(BaseModel): + installments: Optional[Any] = None + mandate_options: Optional[Any] = None + network: Optional[Any] = None + 
request_three_d_secure: Optional[str] = None + + +class StripePaymentMethodOptions(BaseModel): + card: Optional[Card] = None + + +class StripeRefundList(BaseModel): + data: Optional[List[Dict[str, Any]]] = None + has_more: Optional[bool] = None + object: Optional[str] = None + total_count: Optional[int] = None + url: Optional[str] = None + + +class StripeRequestInfo(BaseModel): + id: Optional[str] = None + idempotency_key: Optional[str] = None + + +class StripeShipping(BaseModel): + address: Optional[StripeAddress] = None + carrier: Optional[str] = None + name: Optional[str] = None + phone: Optional[str] = None + tracking_number: Optional[str] = None + + +class Type32(str, Enum): json_schema = 'json_schema' @@ -2400,19 +3895,19 @@ class TextResponseFormatJsonSchema(BaseModel): False, description='Whether to enable strict schema adherence when generating the output.\nIf set to true, the model will always follow the exact schema defined\nin the `schema` field. Only a subset of JSON Schema is supported when\n`strict` is `true`. To learn more, read the [Structured Outputs\nguide](/docs/guides/structured-outputs).\n', ) - type: Type20 = Field( + type: Type32 = Field( ..., description='The type of response format being defined. Always `json_schema`.', ) -class Type21(str, Enum): +class Type33(str, Enum): function = 'function' class ToolChoiceFunction(BaseModel): name: str = Field(..., description='The name of the function to call.') - type: Type21 = Field( + type: Type33 = Field( ..., description='For function calling, the type is always `function`.' ) @@ -2423,7 +3918,7 @@ class ToolChoiceOptions(str, Enum): required = 'required' -class Type22(str, Enum): +class Type34(str, Enum): file_search = 'file_search' web_search_preview = 'web_search_preview' computer_use_preview = 'computer_use_preview' @@ -2431,7 +3926,7 @@ class Type22(str, Enum): class ToolChoiceTypes(BaseModel): - type: Type22 = Field( + type: Type34 = Field( ..., description='The type of hosted tool the model should to use. Learn more about\n[built-in tools](/docs/guides/tools).\n\nAllowed values are:\n- `file_search`\n- `web_search_preview`\n- `computer_use_preview`\n', ) @@ -2499,9 +3994,9 @@ class TripoModelStyle(str, Enum): class TripoModelVersion(str, Enum): - V2_5 = 'v2.5-20250123' - V2_0 = 'v2.0-20240919' - V1_4 = 'v1.4-20240625' + v2_5_20250123 = 'v2.5-20250123' + v2_0_20240919 = 'v2.0-20240919' + v1_4_20240625 = 'v1.4-20240625' class TripoMultiviewMode(str, Enum): @@ -2547,13 +4042,13 @@ class Code1(int, Enum): integer_0 = 0 -class Data8(BaseModel): +class Data9(BaseModel): task_id: str = Field(..., description='used for getTask') class TripoSuccessTask(BaseModel): code: Code1 - data: Data8 + data: Data9 class Topology(str, Enum): @@ -2570,7 +4065,7 @@ class Output(BaseModel): topology: Optional[Topology] = None -class Status10(str, Enum): +class Status11(str, Enum): queued = 'queued' running = 'running' success = 'success' @@ -2586,7 +4081,7 @@ class TripoTask(BaseModel): input: Dict[str, Any] output: Output progress: int = Field(..., ge=0, le=100) - status: Status10 + status: Status11 task_id: str type: str @@ -2650,6 +4145,18 @@ class TripoTypeTextureModel(str, Enum): texture_model = 'texture_model' +class User(BaseModel): + email: Optional[str] = Field(None, description='The email address for this user.') + id: Optional[str] = Field(None, description='The unique id for this user.') + isAdmin: Optional[bool] = Field( + None, description='Indicates if the user has admin privileges.' 
+ ) + isApproved: Optional[bool] = Field( + None, description='Indicates if the user is approved.' + ) + name: Optional[str] = Field(None, description='The name for this user.') + + class Veo2GenVidPollRequest(BaseModel): operationName: str = Field( ..., @@ -2660,7 +4167,7 @@ class Veo2GenVidPollRequest(BaseModel): ) -class Error(BaseModel): +class Error1(BaseModel): code: Optional[int] = Field(None, description='Error code') message: Optional[str] = Field(None, description='Error message') @@ -2692,7 +4199,7 @@ class Response(BaseModel): class Veo2GenVidPollResponse(BaseModel): done: Optional[bool] = None - error: Optional[Error] = Field( + error: Optional[Error1] = Field( None, description='Error details if operation failed' ) name: Optional[str] = None @@ -2753,13 +4260,102 @@ class Veo2GenVidResponse(BaseModel): ) +class VeoGenVidPollRequest(BaseModel): + operationName: str = Field( + ..., + description='Full operation name (from predict response)', + examples=[ + 'projects/PROJECT_ID/locations/us-central1/publishers/google/models/MODEL_ID/operations/OPERATION_ID' + ], + ) + + +class Response1(BaseModel): + field_type: Optional[str] = Field( + None, + alias='@type', + examples=[ + 'type.googleapis.com/cloud.ai.large_models.vision.GenerateVideoResponse' + ], + ) + raiMediaFilteredCount: Optional[int] = Field( + None, description='Count of media filtered by responsible AI policies' + ) + raiMediaFilteredReasons: Optional[List[str]] = Field( + None, description='Reasons why media was filtered by responsible AI policies' + ) + videos: Optional[List[Video]] = None + + +class VeoGenVidPollResponse(BaseModel): + done: Optional[bool] = None + error: Optional[Error1] = Field( + None, description='Error details if operation failed' + ) + name: Optional[str] = None + response: Optional[Response1] = Field( + None, description='The actual prediction response if done is true' + ) + + +class Image2(BaseModel): + bytesBase64Encoded: str + gcsUri: Optional[str] = None + mimeType: Optional[str] = None + + +class Image3(BaseModel): + bytesBase64Encoded: Optional[str] = None + gcsUri: str + mimeType: Optional[str] = None + + +class Instance1(BaseModel): + image: Optional[Union[Image2, Image3]] = Field( + None, description='Optional image to guide video generation' + ) + prompt: str = Field(..., description='Text description of the video') + + +class Parameters1(BaseModel): + aspectRatio: Optional[str] = Field(None, examples=['16:9']) + durationSeconds: Optional[int] = None + enhancePrompt: Optional[bool] = None + generateAudio: Optional[bool] = Field( + None, + description='Generate audio for the video. 
Only supported by veo 3 models.', + ) + negativePrompt: Optional[str] = None + personGeneration: Optional[PersonGeneration1] = None + sampleCount: Optional[int] = None + seed: Optional[int] = None + storageUri: Optional[str] = Field( + None, description='Optional Cloud Storage URI to upload the video' + ) + + +class VeoGenVidRequest(BaseModel): + instances: Optional[List[Instance1]] = None + parameters: Optional[Parameters1] = None + + +class VeoGenVidResponse(BaseModel): + name: str = Field( + ..., + description='Operation resource name', + examples=[ + 'projects/PROJECT_ID/locations/us-central1/publishers/google/models/MODEL_ID/operations/a1b07c8e-7b5a-4aba-bb34-3e1ccb8afcc8' + ], + ) + + class SearchContextSize(str, Enum): low = 'low' medium = 'medium' high = 'high' -class Type23(str, Enum): +class Type35(str, Enum): web_search_preview = 'web_search_preview' web_search_preview_2025_03_11 = 'web_search_preview_2025_03_11' @@ -2775,30 +4371,348 @@ class WebSearchPreviewTool(BaseModel): ) -class Status11(str, Enum): +class Status12(str, Enum): in_progress = 'in_progress' searching = 'searching' completed = 'completed' failed = 'failed' -class Type24(str, Enum): +class Type36(str, Enum): web_search_call = 'web_search_call' class WebSearchToolCall(BaseModel): id: str = Field(..., description='The unique ID of the web search tool call.\n') - status: Status11 = Field( + status: Status12 = Field( ..., description='The status of the web search tool call.\n' ) - type: Type24 = Field( + type: Type36 = Field( ..., description='The type of the web search tool call. Always `web_search_call`.\n', ) -class CreateModelResponseProperties(ModelResponseProperties): - pass +class WorkflowRunStatus(str, Enum): + WorkflowRunStatusStarted = 'WorkflowRunStatusStarted' + WorkflowRunStatusFailed = 'WorkflowRunStatusFailed' + WorkflowRunStatusCompleted = 'WorkflowRunStatusCompleted' + + +class ActionJobResult(BaseModel): + action_job_id: Optional[str] = Field( + None, description='Identifier of the job this result belongs to' + ) + action_run_id: Optional[str] = Field( + None, description='Identifier of the run this result belongs to' + ) + author: Optional[str] = Field(None, description='The author of the commit') + avg_vram: Optional[int] = Field( + None, description='The average VRAM used by the job' + ) + branch_name: Optional[str] = Field( + None, description='Name of the relevant git branch' + ) + comfy_run_flags: Optional[str] = Field( + None, description='The comfy run flags. E.g. `--low-vram`' + ) + commit_hash: Optional[str] = Field(None, description='The hash of the commit') + commit_id: Optional[str] = Field(None, description='The ID of the commit') + commit_message: Optional[str] = Field(None, description='The message of the commit') + commit_time: Optional[int] = Field( + None, description='The Unix timestamp when the commit was made' + ) + cuda_version: Optional[str] = Field(None, description='CUDA version used') + end_time: Optional[int] = Field( + None, description='The end time of the job as a Unix timestamp.' + ) + git_repo: Optional[str] = Field(None, description='The repository name') + id: Optional[UUID] = Field(None, description='Unique identifier for the job result') + job_trigger_user: Optional[str] = Field( + None, description='The user who triggered the job.' 
+ ) + machine_stats: Optional[MachineStats] = None + operating_system: Optional[str] = Field(None, description='Operating system used') + peak_vram: Optional[int] = Field(None, description='The peak VRAM used by the job') + pr_number: Optional[str] = Field(None, description='The pull request number') + python_version: Optional[str] = Field(None, description='PyTorch version used') + pytorch_version: Optional[str] = Field(None, description='PyTorch version used') + start_time: Optional[int] = Field( + None, description='The start time of the job as a Unix timestamp.' + ) + status: Optional[WorkflowRunStatus] = None + storage_file: Optional[StorageFile] = None + workflow_name: Optional[str] = Field(None, description='Name of the workflow') + + +class BFLCannyInputs(BaseModel): + canny_high_threshold: Optional[CannyHighThreshold] = Field( + default_factory=lambda: CannyHighThreshold.model_validate(200), + description='High threshold for Canny edge detection', + title='Canny High Threshold', + ) + canny_low_threshold: Optional[CannyLowThreshold] = Field( + default_factory=lambda: CannyLowThreshold.model_validate(50), + description='Low threshold for Canny edge detection', + title='Canny Low Threshold', + ) + control_image: Optional[str] = Field( + None, + description='Base64 encoded image to use as control input if no preprocessed image is provided', + title='Control Image', + ) + guidance: Optional[Guidance] = Field( + default_factory=lambda: Guidance.model_validate(30), + description='Guidance strength for the image generation process', + title='Guidance', + ) + output_format: Optional[BFLOutputFormat] = Field( + 'jpeg', + description="Output format for the generated image. Can be 'jpeg' or 'png'.", + ) + preprocessed_image: Optional[str] = Field( + None, + description='Optional pre-processed image that will bypass the control preprocessing step', + title='Preprocessed Image', + ) + prompt: str = Field( + ..., + description='Text prompt for image generation', + examples=['ein fantastisches bild'], + title='Prompt', + ) + prompt_upsampling: Optional[bool] = Field( + False, + description='Whether to perform upsampling on the prompt', + title='Prompt Upsampling', + ) + safety_tolerance: Optional[int] = Field( + 2, + description='Tolerance level for input and output moderation. Between 0 and 6, 0 being most strict, 6 being least strict.', + ge=0, + le=6, + title='Safety Tolerance', + ) + seed: Optional[int] = Field( + None, + description='Optional seed for reproducibility', + examples=[42], + title='Seed', + ) + steps: Optional[Steps] = Field( + default_factory=lambda: Steps.model_validate(50), + description='Number of steps for the image generation process', + title='Steps', + ) + webhook_secret: Optional[str] = Field( + None, + description='Optional secret for webhook signature verification', + title='Webhook Secret', + ) + webhook_url: Optional[WebhookUrl] = Field( + None, description='URL to receive webhook notifications', title='Webhook Url' + ) + + +class BFLDepthInputs(BaseModel): + control_image: Optional[str] = Field( + None, + description='Base64 encoded image to use as control input', + title='Control Image', + ) + guidance: Optional[Guidance] = Field( + default_factory=lambda: Guidance.model_validate(15), + description='Guidance strength for the image generation process', + title='Guidance', + ) + output_format: Optional[BFLOutputFormat] = Field( + 'jpeg', + description="Output format for the generated image. 
Can be 'jpeg' or 'png'.", + ) + preprocessed_image: Optional[str] = Field( + None, + description='Optional pre-processed image that will bypass the control preprocessing step', + title='Preprocessed Image', + ) + prompt: str = Field( + ..., + description='Text prompt for image generation', + examples=['ein fantastisches bild'], + title='Prompt', + ) + prompt_upsampling: Optional[bool] = Field( + False, + description='Whether to perform upsampling on the prompt', + title='Prompt Upsampling', + ) + safety_tolerance: Optional[int] = Field( + 2, + description='Tolerance level for input and output moderation. Between 0 and 6, 0 being most strict, 6 being least strict.', + ge=0, + le=6, + title='Safety Tolerance', + ) + seed: Optional[int] = Field( + None, + description='Optional seed for reproducibility', + examples=[42], + title='Seed', + ) + steps: Optional[Steps] = Field( + default_factory=lambda: Steps.model_validate(50), + description='Number of steps for the image generation process', + title='Steps', + ) + webhook_secret: Optional[str] = Field( + None, + description='Optional secret for webhook signature verification', + title='Webhook Secret', + ) + webhook_url: Optional[WebhookUrl] = Field( + None, description='URL to receive webhook notifications', title='Webhook Url' + ) + + +class BFLFluxProExpandInputs(BaseModel): + bottom: Optional[Bottom] = Field( + 0, + description='Number of pixels to expand at the bottom of the image', + title='Bottom', + ) + guidance: Optional[Guidance2] = Field( + default_factory=lambda: Guidance2.model_validate(60), + description='Guidance strength for the image generation process', + title='Guidance', + ) + image: str = Field( + ..., + description='A Base64-encoded string representing the image you wish to expand.', + title='Image', + ) + left: Optional[Left] = Field( + 0, + description='Number of pixels to expand on the left side of the image', + title='Left', + ) + output_format: Optional[BFLOutputFormat] = Field( + 'jpeg', + description="Output format for the generated image. Can be 'jpeg' or 'png'.", + ) + prompt: Optional[str] = Field( + '', + description='The description of the changes you want to make. This text guides the expansion process, allowing you to specify features, styles, or modifications for the expanded areas.', + examples=['ein fantastisches bild'], + title='Prompt', + ) + prompt_upsampling: Optional[bool] = Field( + False, + description='Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation', + title='Prompt Upsampling', + ) + right: Optional[Right] = Field( + 0, + description='Number of pixels to expand on the right side of the image', + title='Right', + ) + safety_tolerance: Optional[int] = Field( + 2, + description='Tolerance level for input and output moderation. 
Between 0 and 6, 0 being most strict, 6 being least strict.', + examples=[2], + ge=0, + le=6, + title='Safety Tolerance', + ) + seed: Optional[int] = Field( + None, description='Optional seed for reproducibility', title='Seed' + ) + steps: Optional[Steps2] = Field( + default_factory=lambda: Steps2.model_validate(50), + description='Number of steps for the image generation process', + examples=[50], + title='Steps', + ) + top: Optional[Top] = Field( + 0, description='Number of pixels to expand at the top of the image', title='Top' + ) + webhook_secret: Optional[str] = Field( + None, + description='Optional secret for webhook signature verification', + title='Webhook Secret', + ) + webhook_url: Optional[WebhookUrl] = Field( + None, description='URL to receive webhook notifications', title='Webhook Url' + ) + + +class BFLFluxProFillInputs(BaseModel): + guidance: Optional[Guidance2] = Field( + default_factory=lambda: Guidance2.model_validate(60), + description='Guidance strength for the image generation process', + title='Guidance', + ) + image: str = Field( + ..., + description='A Base64-encoded string representing the image you wish to modify. Can contain alpha mask if desired.', + title='Image', + ) + mask: Optional[str] = Field( + None, + description='A Base64-encoded string representing a mask for the areas you want to modify in the image. The mask should be the same dimensions as the image and in black and white. Black areas (0%) indicate no modification, while white areas (100%) specify areas for inpainting. Optional if you provide an alpha mask in the original image. Validation: The endpoint verifies that the dimensions of the mask match the original image.', + title='Mask', + ) + output_format: Optional[BFLOutputFormat] = Field( + 'jpeg', + description="Output format for the generated image. Can be 'jpeg' or 'png'.", + ) + prompt: Optional[str] = Field( + '', + description='The description of the changes you want to make. This text guides the inpainting process, allowing you to specify features, styles, or modifications for the masked area.', + examples=['ein fantastisches bild'], + title='Prompt', + ) + prompt_upsampling: Optional[bool] = Field( + False, + description='Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation', + title='Prompt Upsampling', + ) + safety_tolerance: Optional[int] = Field( + 2, + description='Tolerance level for input and output moderation. 
Between 0 and 6, 0 being most strict, 6 being least strict.', + examples=[2], + ge=0, + le=6, + title='Safety Tolerance', + ) + seed: Optional[int] = Field( + None, description='Optional seed for reproducibility', title='Seed' + ) + steps: Optional[Steps2] = Field( + default_factory=lambda: Steps2.model_validate(50), + description='Number of steps for the image generation process', + examples=[50], + title='Steps', + ) + webhook_secret: Optional[str] = Field( + None, + description='Optional secret for webhook signature verification', + title='Webhook Secret', + ) + webhook_url: Optional[WebhookUrl] = Field( + None, description='URL to receive webhook notifications', title='Webhook Url' + ) + + +class BFLHTTPValidationError(BaseModel): + detail: Optional[List[BFLValidationError]] = Field(None, title='Detail') + + +class BulkNodeVersionsRequest(BaseModel): + node_versions: List[NodeVersionIdentifier] = Field( + ..., description='List of node ID and version pairs to retrieve' + ) + + +CreateModelResponseProperties = ModelResponseProperties class GeminiInlineData(BaseModel): @@ -2841,6 +4755,125 @@ class GeminiSystemInstructionContent(BaseModel): ) +class GeminiUsageMetadata(BaseModel): + cachedContentTokenCount: Optional[int] = Field( + None, + description='Output only. Number of tokens in the cached part in the input (the cached content).', + ) + candidatesTokenCount: Optional[int] = Field( + None, description='Number of tokens in the response(s).' + ) + candidatesTokensDetails: Optional[List[ModalityTokenCount]] = Field( + None, description='Breakdown of candidate tokens by modality.' + ) + promptTokenCount: Optional[int] = Field( + None, + description='Number of tokens in the request. When cachedContent is set, this is still the total effective prompt size meaning this includes the number of tokens in the cached content.', + ) + promptTokensDetails: Optional[List[ModalityTokenCount]] = Field( + None, description='Breakdown of prompt tokens by modality.' + ) + thoughtsTokenCount: Optional[int] = Field( + None, description='Number of tokens present in thoughts output.' + ) + toolUsePromptTokenCount: Optional[int] = Field( + None, description='Number of tokens present in tool-use prompt(s).' 
+ ) + + +class GithubInstallation(BaseModel): + access_tokens_url: str = Field(..., description='The API URL for access tokens') + account: GithubUser + app_id: int = Field(..., description='The GitHub App ID') + created_at: datetime = Field(..., description='When the installation was created') + events: List[str] = Field( + ..., description='The events the installation subscribes to' + ) + html_url: str = Field(..., description='The HTML URL of the installation') + id: int = Field(..., description='The installation ID') + permissions: Dict[str, Any] = Field(..., description='The installation permissions') + repositories_url: str = Field(..., description='The API URL for repositories') + repository_selection: RepositorySelection = Field( + ..., description='Repository selection for the installation' + ) + single_file_name: Optional[str] = Field( + None, description='The single file name if applicable' + ) + target_id: int = Field(..., description='The target ID') + target_type: str = Field(..., description='The target type') + updated_at: datetime = Field( + ..., description='When the installation was last updated' + ) + + +class GithubReleaseAsset(BaseModel): + browser_download_url: str = Field(..., description='The browser download URL') + content_type: str = Field(..., description='The content type of the asset') + created_at: datetime = Field(..., description='When the asset was created') + download_count: int = Field(..., description='The number of downloads') + id: int = Field(..., description='The asset ID') + label: Optional[str] = Field(None, description='The label of the asset') + name: str = Field(..., description='The name of the asset') + node_id: str = Field(..., description='The asset node ID') + size: int = Field(..., description='The size of the asset in bytes') + state: State = Field(..., description='The state of the asset') + updated_at: datetime = Field(..., description='When the asset was last updated') + uploader: GithubUser + + +class Release(BaseModel): + assets: List[GithubReleaseAsset] = Field(..., description='Array of release assets') + assets_url: Optional[str] = Field(None, description='The URL to the release assets') + author: GithubUser + body: Optional[str] = Field(None, description='The release notes/body') + created_at: datetime = Field(..., description='When the release was created') + draft: bool = Field(..., description='Whether the release is a draft') + html_url: str = Field(..., description='The HTML URL of the release') + id: int = Field(..., description='The ID of the release') + name: Optional[str] = Field(None, description='The name of the release') + node_id: str = Field(..., description='The node ID of the release') + prerelease: bool = Field(..., description='Whether the release is a prerelease') + published_at: Optional[datetime] = Field( + None, description='When the release was published' + ) + tag_name: str = Field(..., description='The tag name of the release') + tarball_url: str = Field(..., description='URL to the tarball') + target_commitish: str = Field( + ..., description='The branch or commit the release was created from' + ) + upload_url: Optional[str] = Field( + None, description='The URL to upload release assets' + ) + url: str = Field(..., description='The API URL of the release') + zipball_url: str = Field(..., description='URL to the zipball') + + +class GithubRepository(BaseModel): + clone_url: str = Field(..., description='The clone URL of the repository') + created_at: datetime = Field(..., description='When the 
repository was created') + default_branch: str = Field(..., description='The default branch of the repository') + description: Optional[str] = Field(None, description='The repository description') + fork: bool = Field(..., description='Whether the repository is a fork') + full_name: str = Field( + ..., description='The full name of the repository (owner/repo)' + ) + git_url: str = Field(..., description='The git URL of the repository') + html_url: str = Field(..., description='The HTML URL of the repository') + id: int = Field(..., description='The repository ID') + name: str = Field(..., description='The name of the repository') + node_id: str = Field(..., description='The repository node ID') + owner: GithubUser + private: bool = Field(..., description='Whether the repository is private') + pushed_at: datetime = Field( + ..., description='When the repository was last pushed to' + ) + ssh_url: str = Field(..., description='The SSH URL of the repository') + updated_at: datetime = Field( + ..., description='When the repository was last updated' + ) + url: str = Field(..., description='The API URL of the repository') + + class IdeogramV3EditRequest(BaseModel): color_palette: Optional[IdeogramColorPalette] = None image: Optional[StrictBytes] = Field( @@ -3276,6 +5309,52 @@ class MoonvalleyTextToImageRequest(BaseModel): webhook_url: Optional[str] = None +class NodeVersion(BaseModel): + changelog: Optional[str] = Field( + None, description='Summary of changes made in this version' + ) + comfy_node_extract_status: Optional[str] = Field( + None, description='The status of comfy node extraction process.' + ) + createdAt: Optional[datetime] = Field( + None, description='The date and time the version was created.' + ) + dependencies: Optional[List[str]] = Field( + None, description='A list of pip dependencies required by the node.' + ) + deprecated: Optional[bool] = Field( + None, description='Indicates if this version is deprecated.' + ) + downloadUrl: Optional[str] = Field( + None, description='[Output Only] URL to download this version of the node' + ) + id: Optional[str] = None + node_id: Optional[str] = Field( + None, description='The unique identifier of the node.' + ) + status: Optional[NodeVersionStatus] = None + status_reason: Optional[str] = Field( + None, description='The reason for the status change.' + ) + supported_accelerators: Optional[List[str]] = Field( + None, + description='List of accelerators (e.g. CUDA, DirectML, ROCm) that this node supports', + ) + supported_comfyui_frontend_version: Optional[str] = Field( + None, description='Supported versions of ComfyUI frontend' + ) + supported_comfyui_version: Optional[str] = Field( + None, description='Supported versions of ComfyUI' + ) + supported_os: Optional[List[str]] = Field( + None, description='List of operating systems that this node supports' + ) + version: Optional[str] = Field( + None, + description='The version identifier, following semantic versioning. 
Must be unique for the node.', + ) + + class OutputContent(RootModel[Union[OutputTextContent, OutputAudioContent]]): root: Union[OutputTextContent, OutputAudioContent] @@ -3283,7 +5362,7 @@ class OutputContent(RootModel[Union[OutputTextContent, OutputAudioContent]]): class OutputMessage(BaseModel): content: List[OutputContent] = Field(..., description='The content of the message') role: Role4 = Field(..., description='The role of the message') - type: Type14 = Field(..., description='The type of output item') + type: Type15 = Field(..., description='The type of output item') class PikaBodyGenerate22I2vGenerate22I2vPost(BaseModel): @@ -3333,6 +5412,16 @@ class PikaHTTPValidationError(BaseModel): detail: Optional[List[PikaValidationError]] = Field(None, title='Detail') +class PublisherMember(BaseModel): + id: Optional[str] = Field( + None, description='The unique identifier for the publisher member.' + ) + role: Optional[str] = Field( + None, description='The role of the user in the publisher.' + ) + user: Optional[PublisherUser] = None + + class Reasoning(BaseModel): effort: Optional[ReasoningEffort] = 'medium' generate_summary: Optional[GenerateSummary] = Field( @@ -3345,13 +5434,88 @@ class Reasoning(BaseModel): ) +class RecraftImage(BaseModel): + b64_json: Optional[str] = None + features: Optional[RecraftImageFeatures] = None + image_id: UUID + revised_prompt: Optional[str] = None + url: Optional[str] = None + + +class RecraftProcessImageRequest(BaseModel): + image: StrictBytes + image_format: Optional[RecraftImageFormat] = None + response_format: Optional[RecraftResponseFormat] = None + + +class RecraftProcessImageResponse(BaseModel): + created: int + credits: int + image: RecraftImage + + +class RecraftTextLayout(RootModel[List[RecraftTextLayoutItem]]): + root: List[RecraftTextLayoutItem] + + +class RecraftTransformImageWithMaskRequest(BaseModel): + block_nsfw: Optional[bool] = None + calculate_features: Optional[bool] = None + image: StrictBytes + image_format: Optional[RecraftImageFormat] = None + mask: StrictBytes + model: Optional[RecraftTransformModel] = None + n: Optional[int] = None + negative_prompt: Optional[str] = None + prompt: str + response_format: Optional[RecraftResponseFormat] = None + style: Optional[RecraftImageStyle] = None + style_id: Optional[UUID] = None + substyle: Optional[RecraftImageSubStyle] = None + text_layout: Optional[RecraftTextLayout] = None + + +class ResponseContentPartAddedEvent(BaseModel): + content_index: int = Field( + ..., description='The index of the content part that was added.' + ) + item_id: str = Field( + ..., description='The ID of the output item that the content part was added to.' + ) + output_index: int = Field( + ..., + description='The index of the output item that the content part was added to.', + ) + part: OutputContent + type: Type20 = Field( + ..., description='The type of the event. Always `response.content_part.added`.' + ) + + +class ResponseContentPartDoneEvent(BaseModel): + content_index: int = Field( + ..., description='The index of the content part that is done.' + ) + item_id: str = Field( + ..., description='The ID of the output item that the content part was added to.' + ) + output_index: int = Field( + ..., + description='The index of the output item that the content part was added to.', + ) + part: OutputContent + type: Type21 = Field( + ..., description='The type of the event. Always `response.content_part.done`.' 
+ ) + + class ResponseError(BaseModel): code: ResponseErrorCode message: str = Field(..., description='A human-readable description of the error.') class Rodin3DDownloadResponse(BaseModel): - list: Optional[RodinResourceItem] = None + list: Optional[List[RodinResourceItem]] = None class Rodin3DGenerateRequest(BaseModel): @@ -3371,6 +5535,11 @@ class Rodin3DGenerateResponse(BaseModel): uuid: Optional[str] = Field(None, description='Task UUID') +class RodinCheckStatusJobItem(BaseModel): + status: Optional[RodinStatusOptions] = None + uuid: Optional[str] = Field(None, description='sub uuid') + + class RunwayImageToVideoRequest(BaseModel): duration: RunwayDurationEnum model: RunwayModelEnum @@ -3384,6 +5553,109 @@ class RunwayImageToVideoRequest(BaseModel): ) +class StripeCharge(BaseModel): + amount: Optional[int] = None + amount_captured: Optional[int] = None + amount_refunded: Optional[int] = None + application: Optional[str] = None + application_fee: Optional[str] = None + application_fee_amount: Optional[int] = None + balance_transaction: Optional[str] = None + billing_details: Optional[StripeBillingDetails] = None + calculated_statement_descriptor: Optional[str] = None + captured: Optional[bool] = None + created: Optional[int] = None + currency: Optional[str] = None + customer: Optional[str] = None + description: Optional[str] = None + destination: Optional[Any] = None + dispute: Optional[Any] = None + disputed: Optional[bool] = None + failure_balance_transaction: Optional[Any] = None + failure_code: Optional[Any] = None + failure_message: Optional[Any] = None + fraud_details: Optional[Dict[str, Any]] = None + id: Optional[str] = None + invoice: Optional[Any] = None + livemode: Optional[bool] = None + metadata: Optional[Dict[str, Any]] = None + object: Optional[Object1] = None + on_behalf_of: Optional[Any] = None + order: Optional[Any] = None + outcome: Optional[StripeOutcome] = None + paid: Optional[bool] = None + payment_intent: Optional[str] = None + payment_method: Optional[str] = None + payment_method_details: Optional[StripePaymentMethodDetails] = None + radar_options: Optional[Dict[str, Any]] = None + receipt_email: Optional[str] = None + receipt_number: Optional[str] = None + receipt_url: Optional[str] = None + refunded: Optional[bool] = None + refunds: Optional[StripeRefundList] = None + review: Optional[Any] = None + shipping: Optional[StripeShipping] = None + source: Optional[Any] = None + source_transfer: Optional[Any] = None + statement_descriptor: Optional[Any] = None + statement_descriptor_suffix: Optional[Any] = None + status: Optional[str] = None + transfer_data: Optional[Any] = None + transfer_group: Optional[Any] = None + + +class StripeChargeList(BaseModel): + data: Optional[List[StripeCharge]] = None + has_more: Optional[bool] = None + object: Optional[str] = None + total_count: Optional[int] = None + url: Optional[str] = None + + +class StripePaymentIntent(BaseModel): + amount: Optional[int] = None + amount_capturable: Optional[int] = None + amount_details: Optional[StripeAmountDetails] = None + amount_received: Optional[int] = None + application: Optional[str] = None + application_fee_amount: Optional[int] = None + automatic_payment_methods: Optional[Any] = None + canceled_at: Optional[int] = None + cancellation_reason: Optional[str] = None + capture_method: Optional[str] = None + charges: Optional[StripeChargeList] = None + client_secret: Optional[str] = None + confirmation_method: Optional[str] = None + created: Optional[int] = None + currency: Optional[str] = 
None + customer: Optional[str] = None + description: Optional[str] = None + id: Optional[str] = None + invoice: Optional[str] = None + last_payment_error: Optional[Any] = None + latest_charge: Optional[str] = None + livemode: Optional[bool] = None + metadata: Optional[Dict[str, Any]] = None + next_action: Optional[Any] = None + object: Optional[Object3] = None + on_behalf_of: Optional[Any] = None + payment_method: Optional[str] = None + payment_method_configuration_details: Optional[Any] = None + payment_method_options: Optional[StripePaymentMethodOptions] = None + payment_method_types: Optional[List[str]] = None + processing: Optional[Any] = None + receipt_email: Optional[str] = None + review: Optional[Any] = None + setup_future_usage: Optional[Any] = None + shipping: Optional[StripeShipping] = None + source: Optional[Any] = None + statement_descriptor: Optional[Any] = None + statement_descriptor_suffix: Optional[Any] = None + status: Optional[str] = None + transfer_data: Optional[Any] = None + transfer_group: Optional[Any] = None + + class TextResponseFormatConfiguration( RootModel[ Union[ @@ -3411,6 +5683,22 @@ class Tool( ] = Field(..., discriminator='type') +class BulkNodeVersionResult(BaseModel): + error_message: Optional[str] = Field( + None, + description='Error message if retrieval failed (only present if status is error)', + ) + identifier: NodeVersionIdentifier + node_version: Optional[NodeVersion] = None + status: Status = Field(..., description='Status of the retrieval operation') + + +class BulkNodeVersionsResponse(BaseModel): + node_versions: List[BulkNodeVersionResult] = Field( + ..., description='List of retrieved node versions with their status' + ) + + class EasyInputMessage(BaseModel): content: Union[str, InputMessageContentList] = Field( ..., @@ -3439,6 +5727,16 @@ class GeminiGenerateContentRequest(BaseModel): videoMetadata: Optional[GeminiVideoMetadata] = None +class GithubReleaseWebhook(BaseModel): + action: Action = Field(..., description='The action performed on the release') + enterprise: Optional[GithubEnterprise] = None + installation: Optional[GithubInstallation] = None + organization: Optional[GithubOrganization] = None + release: Release = Field(..., description='The release object') + repository: GithubRepository + sender: GithubUser + + class ImagenGenerateImageRequest(BaseModel): instances: List[ImagenImageGenerationInstance] parameters: ImagenImageGenerationParameters @@ -3447,8 +5745,8 @@ class ImagenGenerateImageRequest(BaseModel): class InputMessage(BaseModel): content: Optional[InputMessageContentList] = None role: Optional[Role3] = None - status: Optional[Status2] = None - type: Optional[Type9] = None + status: Optional[Status3] = None + type: Optional[Type10] = None class Item( @@ -3519,6 +5817,70 @@ class OutputItem( ] +class Publisher(BaseModel): + createdAt: Optional[datetime] = Field( + None, description='The date and time the publisher was created.' + ) + description: Optional[str] = None + id: Optional[str] = Field( + None, + description="The unique identifier for the publisher. It's akin to a username. Should be lowercase.", + ) + logo: Optional[str] = Field(None, description="URL to the publisher's logo.") + members: Optional[List[PublisherMember]] = Field( + None, description='A list of members in the publisher.' 
+ ) + name: Optional[str] = None + source_code_repo: Optional[str] = None + status: Optional[PublisherStatus] = None + support: Optional[str] = None + website: Optional[str] = None + + +class RecraftGenerateImageResponse(BaseModel): + created: int + credits: int + data: List[RecraftImage] + + +class RecraftImageToImageRequest(BaseModel): + block_nsfw: Optional[bool] = None + calculate_features: Optional[bool] = None + controls: Optional[RecraftUserControls] = None + image: StrictBytes + image_format: Optional[RecraftImageFormat] = None + model: Optional[RecraftTransformModel] = None + n: Optional[int] = None + negative_prompt: Optional[str] = None + prompt: str + response_format: Optional[RecraftResponseFormat] = None + strength: float + style: Optional[RecraftImageStyle] = None + style_id: Optional[UUID] = None + substyle: Optional[RecraftImageSubStyle] = None + text_layout: Optional[RecraftTextLayout] = None + + +class ResponseOutputItemAddedEvent(BaseModel): + item: OutputItem + output_index: int = Field( + ..., description='The index of the output item that was added.\n' + ) + type: Type29 = Field( + ..., description='The type of the event. Always `response.output_item.added`.\n' + ) + + +class ResponseOutputItemDoneEvent(BaseModel): + item: OutputItem + output_index: int = Field( + ..., description='The index of the output item that was marked done.\n' + ) + type: Type30 = Field( + ..., description='The type of the event. Always `response.output_item.done`.\n' + ) + + class Text(BaseModel): format: Optional[TextResponseFormatConfiguration] = None @@ -3552,6 +5914,28 @@ class ResponseProperties(BaseModel): ) +class Rodin3DCheckStatusResponse(BaseModel): + jobs: Optional[List[RodinCheckStatusJobItem]] = Field( + None, description='Details for the generation status.' + ) + + +class Data8(BaseModel): + object: Optional[StripePaymentIntent] = None + + +class StripeEvent(BaseModel): + api_version: Optional[str] = None + created: Optional[int] = None + data: Data8 + id: str + livemode: Optional[bool] = None + object: Object2 + pending_webhooks: Optional[int] = None + request: Optional[StripeRequestInfo] = None + type: Type31 + + class GeminiCandidate(BaseModel): citationMetadata: Optional[GeminiCitationMetadata] = None content: Optional[GeminiContent] = None @@ -3562,12 +5946,67 @@ class GeminiCandidate(BaseModel): class GeminiGenerateContentResponse(BaseModel): candidates: Optional[List[GeminiCandidate]] = None promptFeedback: Optional[GeminiPromptFeedback] = None + usageMetadata: Optional[GeminiUsageMetadata] = None class InputItem(RootModel[Union[EasyInputMessage, Item]]): root: Union[EasyInputMessage, Item] +class Node(BaseModel): + author: Optional[str] = None + banner_url: Optional[str] = Field(None, description="URL to the node's banner.") + category: Optional[str] = Field(None, description='The category of the node.') + created_at: Optional[datetime] = Field( + None, description='The date and time when the node was created' + ) + description: Optional[str] = None + downloads: Optional[int] = Field( + None, description='The number of downloads of the node.' + ) + github_stars: Optional[int] = Field( + None, description='Number of stars on the GitHub repository.' + ) + icon: Optional[str] = Field(None, description="URL to the node's icon.") + id: Optional[str] = Field(None, description='The unique identifier of the node.') + latest_version: Optional[NodeVersion] = None + license: Optional[str] = Field( + None, description="The path to the LICENSE file in the node's repository." 
+ ) + name: Optional[str] = Field(None, description='The display name of the node.') + preempted_comfy_node_names: Optional[List[str]] = Field( + None, description='A list of Comfy node names that are preempted by this node.' + ) + publisher: Optional[Publisher] = None + rating: Optional[float] = Field(None, description='The average rating of the node.') + repository: Optional[str] = Field(None, description="URL to the node's repository.") + search_ranking: Optional[int] = Field( + None, + description="A numerical value representing the node's search ranking, used for sorting search results.", + ) + status: Optional[NodeStatus] = None + status_detail: Optional[str] = Field( + None, description='The status detail of the node.' + ) + supported_accelerators: Optional[List[str]] = Field( + None, + description='List of accelerators (e.g. CUDA, DirectML, ROCm) that this node supports', + ) + supported_comfyui_frontend_version: Optional[str] = Field( + None, description='Supported versions of ComfyUI frontend' + ) + supported_comfyui_version: Optional[str] = Field( + None, description='Supported versions of ComfyUI' + ) + supported_os: Optional[List[str]] = Field( + None, description='List of operating systems that this node supports' + ) + tags: Optional[List[str]] = None + translations: Optional[Dict[str, Dict[str, Any]]] = Field( + None, description='Translations of node metadata in different languages.' + ) + + class OpenAICreateResponse(CreateModelResponseProperties, ResponseProperties): include: Optional[List[Includable]] = Field( None, @@ -3615,8 +6054,73 @@ class OpenAIResponse(ModelResponseProperties, ResponseProperties): parallel_tool_calls: Optional[bool] = Field( True, description='Whether to allow the model to run tool calls in parallel.\n' ) - status: Optional[Status6] = Field( + status: Optional[Status7] = Field( None, description='The status of the response generation. One of `completed`, `failed`, `in_progress`, or `incomplete`.', ) usage: Optional[ResponseUsage] = None + + +class ResponseCompletedEvent(BaseModel): + response: OpenAIResponse + type: Type19 = Field( + ..., description='The type of the event. Always `response.completed`.' + ) + + +class ResponseCreatedEvent(BaseModel): + response: OpenAIResponse + type: Type22 = Field( + ..., description='The type of the event. Always `response.created`.' + ) + + +class ResponseFailedEvent(BaseModel): + response: OpenAIResponse + type: Type24 = Field( + ..., description='The type of the event. Always `response.failed`.\n' + ) + + +class ResponseInProgressEvent(BaseModel): + response: OpenAIResponse + type: Type27 = Field( + ..., description='The type of the event. Always `response.in_progress`.\n' + ) + + +class ResponseIncompleteEvent(BaseModel): + response: OpenAIResponse + type: Type28 = Field( + ..., description='The type of the event. 
Always `response.incomplete`.\n' + ) + + +class OpenAIResponseStreamEvent( + RootModel[ + Union[ + ResponseCreatedEvent, + ResponseInProgressEvent, + ResponseCompletedEvent, + ResponseFailedEvent, + ResponseIncompleteEvent, + ResponseOutputItemAddedEvent, + ResponseOutputItemDoneEvent, + ResponseContentPartAddedEvent, + ResponseContentPartDoneEvent, + ResponseErrorEvent, + ] + ] +): + root: Union[ + ResponseCreatedEvent, + ResponseInProgressEvent, + ResponseCompletedEvent, + ResponseFailedEvent, + ResponseIncompleteEvent, + ResponseOutputItemAddedEvent, + ResponseOutputItemDoneEvent, + ResponseContentPartAddedEvent, + ResponseContentPartDoneEvent, + ResponseErrorEvent, + ] = Field(..., description='Events that can be emitted during response streaming') diff --git a/comfy_api_nodes/apis/tripo_api.py b/comfy_api_nodes/apis/tripo_api.py index 626e8d277..9f43d4d09 100644 --- a/comfy_api_nodes/apis/tripo_api.py +++ b/comfy_api_nodes/apis/tripo_api.py @@ -127,7 +127,7 @@ class TripoTextToModelRequest(BaseModel): type: TripoTaskType = Field(TripoTaskType.TEXT_TO_MODEL, description='Type of task') prompt: str = Field(..., description='The text prompt describing the model to generate', max_length=1024) negative_prompt: Optional[str] = Field(None, description='The negative text prompt', max_length=1024) - model_version: Optional[TripoModelVersion] = TripoModelVersion.V2_5 + model_version: Optional[TripoModelVersion] = TripoModelVersion.v2_5_20250123 face_limit: Optional[int] = Field(None, description='The number of faces to limit the generation to') texture: Optional[bool] = Field(True, description='Whether to apply texture to the generated model') pbr: Optional[bool] = Field(True, description='Whether to apply PBR to the generated model') diff --git a/comfy_api_nodes/nodes_veo2.py b/comfy_api_nodes/nodes_veo2.py index df846d5dd..97bfe20e6 100644 --- a/comfy_api_nodes/nodes_veo2.py +++ b/comfy_api_nodes/nodes_veo2.py @@ -8,10 +8,10 @@ from typing import Optional from comfy.comfy_types.node_typing import IO, ComfyNodeABC from comfy_api.input_impl.video_types import VideoFromFile from comfy_api_nodes.apis import ( - Veo2GenVidRequest, - Veo2GenVidResponse, - Veo2GenVidPollRequest, - Veo2GenVidPollResponse + VeoGenVidRequest, + VeoGenVidResponse, + VeoGenVidPollRequest, + VeoGenVidPollResponse ) from comfy_api_nodes.apis.client import ( ApiEndpoint, @@ -35,7 +35,7 @@ def convert_image_to_base64(image: torch.Tensor): return tensor_to_base64_string(scaled_image) -def get_video_url_from_response(poll_response: Veo2GenVidPollResponse) -> Optional[str]: +def get_video_url_from_response(poll_response: VeoGenVidPollResponse) -> Optional[str]: if ( poll_response.response and hasattr(poll_response.response, "videos") @@ -130,6 +130,14 @@ class VeoVideoGenerationNode(ComfyNodeABC): "default": None, "tooltip": "Optional reference image to guide video generation", }), + "model": ( + IO.COMBO, + { + "options": ["veo-2.0-generate-001"], + "default": "veo-2.0-generate-001", + "tooltip": "Veo 2 model to use for video generation", + }, + ), }, "hidden": { "auth_token": "AUTH_TOKEN_COMFY_ORG", @@ -141,7 +149,7 @@ class VeoVideoGenerationNode(ComfyNodeABC): RETURN_TYPES = (IO.VIDEO,) FUNCTION = "generate_video" CATEGORY = "api node/video/Veo" - DESCRIPTION = "Generates videos from text prompts using Google's Veo API" + DESCRIPTION = "Generates videos from text prompts using Google's Veo 2 API" API_NODE = True def generate_video( @@ -154,6 +162,8 @@ class VeoVideoGenerationNode(ComfyNodeABC): person_generation="ALLOW", 
seed=0, image=None, + model="veo-2.0-generate-001", + generate_audio=False, unique_id: Optional[str] = None, **kwargs, ): @@ -188,16 +198,19 @@ class VeoVideoGenerationNode(ComfyNodeABC): parameters["negativePrompt"] = negative_prompt if seed > 0: parameters["seed"] = seed + # Only add generateAudio for Veo 3 models + if "veo-3.0" in model: + parameters["generateAudio"] = generate_audio # Initial request to start video generation initial_operation = SynchronousOperation( endpoint=ApiEndpoint( - path="/proxy/veo/generate", + path=f"/proxy/veo/{model}/generate", method=HttpMethod.POST, - request_model=Veo2GenVidRequest, - response_model=Veo2GenVidResponse + request_model=VeoGenVidRequest, + response_model=VeoGenVidResponse ), - request=Veo2GenVidRequest( + request=VeoGenVidRequest( instances=instances, parameters=parameters ), @@ -223,16 +236,16 @@ class VeoVideoGenerationNode(ComfyNodeABC): # Define the polling operation poll_operation = PollingOperation( poll_endpoint=ApiEndpoint( - path="/proxy/veo/poll", + path=f"/proxy/veo/{model}/poll", method=HttpMethod.POST, - request_model=Veo2GenVidPollRequest, - response_model=Veo2GenVidPollResponse + request_model=VeoGenVidPollRequest, + response_model=VeoGenVidPollResponse ), completed_statuses=["completed"], failed_statuses=[], # No failed statuses, we'll handle errors after polling status_extractor=status_extractor, progress_extractor=progress_extractor, - request=Veo2GenVidPollRequest( + request=VeoGenVidPollRequest( operationName=operation_name ), auth_kwargs=kwargs, @@ -298,11 +311,64 @@ class VeoVideoGenerationNode(ComfyNodeABC): return (VideoFromFile(video_io),) -# Register the node +class Veo3VideoGenerationNode(VeoVideoGenerationNode): + """ + Generates videos from text prompts using Google's Veo 3 API. + + Supported models: + - veo-3.0-generate-001 + - veo-3.0-fast-generate-001 + + This node extends the base Veo node with Veo 3 specific features including + audio generation and fixed 8-second duration. + """ + + @classmethod + def INPUT_TYPES(s): + parent_input = super().INPUT_TYPES() + + # Update model options for Veo 3 + parent_input["optional"]["model"] = ( + IO.COMBO, + { + "options": ["veo-3.0-generate-001", "veo-3.0-fast-generate-001"], + "default": "veo-3.0-generate-001", + "tooltip": "Veo 3 model to use for video generation", + }, + ) + + # Add generateAudio parameter + parent_input["optional"]["generate_audio"] = ( + IO.BOOLEAN, + { + "default": False, + "tooltip": "Generate audio for the video. 
Supported by all Veo 3 models.", + } + ) + + # Update duration constraints for Veo 3 (only 8 seconds supported) + parent_input["optional"]["duration_seconds"] = ( + IO.INT, + { + "default": 8, + "min": 8, + "max": 8, + "step": 1, + "display": "number", + "tooltip": "Duration of the output video in seconds (Veo 3 only supports 8 seconds)", + }, + ) + + return parent_input + + +# Register the nodes NODE_CLASS_MAPPINGS = { "VeoVideoGenerationNode": VeoVideoGenerationNode, + "Veo3VideoGenerationNode": Veo3VideoGenerationNode, } NODE_DISPLAY_NAME_MAPPINGS = { - "VeoVideoGenerationNode": "Google Veo2 Video Generation", + "VeoVideoGenerationNode": "Google Veo 2 Video Generation", + "Veo3VideoGenerationNode": "Google Veo 3 Video Generation", } From 5be6fd09ffb46cfdff240fb5b96dd8c06b2a0344 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Tue, 5 Aug 2025 15:48:56 +0800 Subject: [PATCH 099/243] Update template to 0.1.48 (#9182) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ffa7dce65..470060ab4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.47 +comfyui-workflow-templates==0.1.48 comfyui-embedded-docs==0.2.4 torch torchsde From d044a243986700aae552acdebf7e767ae8282e37 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 5 Aug 2025 03:12:27 -0700 Subject: [PATCH 100/243] Fix default shift and any latent size for qwen image model. (#9186) --- comfy/ldm/qwen_image/model.py | 9 +++++---- comfy/supported_models.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index ff631a60f..c15ab8e40 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -8,7 +8,7 @@ from einops import repeat from comfy.ldm.lightricks.model import TimestepEmbedding, Timesteps from comfy.ldm.modules.attention import optimized_attention_masked from comfy.ldm.flux.layers import EmbedND - +import comfy.ldm.common_dit class GELU(nn.Module): def __init__(self, dim_in: int, dim_out: int, approximate: str = "none", bias: bool = True, dtype=None, device=None, operations=None): @@ -364,8 +364,9 @@ class QwenImageTransformer2DModel(nn.Module): image_rotary_emb = self.pos_embeds(x, context) - orig_shape = x.shape - hidden_states = x.view(orig_shape[0], orig_shape[1], orig_shape[-2] // 2, 2, orig_shape[-1] // 2, 2) + hidden_states = comfy.ldm.common_dit.pad_to_patch_size(x, (1, self.patch_size, self.patch_size)) + orig_shape = hidden_states.shape + hidden_states = hidden_states.view(orig_shape[0], orig_shape[1], orig_shape[-2] // 2, 2, orig_shape[-1] // 2, 2) hidden_states = hidden_states.permute(0, 2, 4, 1, 3, 5) hidden_states = hidden_states.reshape(orig_shape[0], (orig_shape[-2] // 2) * (orig_shape[-1] // 2), orig_shape[1] * 4) @@ -396,4 +397,4 @@ class QwenImageTransformer2DModel(nn.Module): hidden_states = hidden_states.view(orig_shape[0], orig_shape[-2] // 2, orig_shape[-1] // 2, orig_shape[1], 2, 2) hidden_states = hidden_states.permute(0, 3, 1, 4, 2, 5) - return hidden_states.reshape(orig_shape) + return hidden_states.reshape(orig_shape)[:, :, :, :x.shape[-2], :x.shape[-1]] diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 880055bd3..156ff9e26 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1237,7 +1237,7 @@ class QwenImage(supported_models_base.BASE): 
sampling_settings = { "multiplier": 1.0, - "shift": 2.6, + "shift": 1.15, } memory_usage_factor = 1.8 #TODO From da1ad9b5163fb848f3ec87ccc4fd0c8069f6eff0 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Tue, 5 Aug 2025 19:24:12 +0800 Subject: [PATCH 101/243] Update template to 0.1.51 (#9187) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 470060ab4..9a6b04cf2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.48 +comfyui-workflow-templates==0.1.51 comfyui-embedded-docs==0.2.4 torch torchsde From 32a95bba8ac91e8a610c35ce4d9963d2453118c1 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Tue, 5 Aug 2025 07:33:02 -0400 Subject: [PATCH 102/243] ComfyUI version 0.3.49 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 7b29e338d..5e2d09c81 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.48" +__version__ = "0.3.49" diff --git a/pyproject.toml b/pyproject.toml index 256677fad..3c530ba85 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.48" +version = "0.3.49" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From d8c51ba15aef6b0df86a7ea0203881be55d7579b Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 5 Aug 2025 04:41:18 -0700 Subject: [PATCH 103/243] Add Qwen Image model to readme. (#9191) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 2abd8e600..119098f5c 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,7 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith - [Lumina Image 2.0](https://comfyanonymous.github.io/ComfyUI_examples/lumina2/) - [HiDream](https://comfyanonymous.github.io/ComfyUI_examples/hidream/) - [Cosmos Predict2](https://comfyanonymous.github.io/ComfyUI_examples/cosmos_predict2/) + - [Qwen Image](https://comfyanonymous.github.io/ComfyUI_examples/qwen_image/) - Image Editing Models - [Omnigen 2](https://comfyanonymous.github.io/ComfyUI_examples/omnigen/) - [Flux Kontext](https://comfyanonymous.github.io/ComfyUI_examples/flux/#flux-kontext-image-editing-model) From 9126c0cfe49508a64c429f97b45664b241aab3f2 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 6 Aug 2025 01:07:04 -0700 Subject: [PATCH 104/243] Qwen Image model merging node. 
(#9202) --- .../nodes_model_merging_model_specific.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/comfy_extras/nodes_model_merging_model_specific.py b/comfy_extras/nodes_model_merging_model_specific.py index 2c93cd84f..55eb3ccfe 100644 --- a/comfy_extras/nodes_model_merging_model_specific.py +++ b/comfy_extras/nodes_model_merging_model_specific.py @@ -314,6 +314,29 @@ class ModelMergeCosmosPredict2_14B(comfy_extras.nodes_model_merging.ModelMergeBl return {"required": arg_dict} +class ModelMergeQwenImage(comfy_extras.nodes_model_merging.ModelMergeBlocks): + CATEGORY = "advanced/model_merging/model_specific" + + @classmethod + def INPUT_TYPES(s): + arg_dict = { "model1": ("MODEL",), + "model2": ("MODEL",)} + + argument = ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}) + + arg_dict["pos_embeds."] = argument + arg_dict["img_in."] = argument + arg_dict["txt_norm."] = argument + arg_dict["txt_in."] = argument + arg_dict["time_text_embed."] = argument + + for i in range(60): + arg_dict["transformer_blocks.{}.".format(i)] = argument + + arg_dict["proj_out."] = argument + + return {"required": arg_dict} + NODE_CLASS_MAPPINGS = { "ModelMergeSD1": ModelMergeSD1, "ModelMergeSD2": ModelMergeSD1, #SD1 and SD2 have the same blocks @@ -329,4 +352,5 @@ NODE_CLASS_MAPPINGS = { "ModelMergeWAN2_1": ModelMergeWAN2_1, "ModelMergeCosmosPredict2_2B": ModelMergeCosmosPredict2_2B, "ModelMergeCosmosPredict2_14B": ModelMergeCosmosPredict2_14B, + "ModelMergeQwenImage": ModelMergeQwenImage, } From 4c3e57b0ae9fb7ff1322977915efe7e98544d15d Mon Sep 17 00:00:00 2001 From: flybirdxx <1119577418@qq.com> Date: Thu, 7 Aug 2025 01:23:11 +0800 Subject: [PATCH 105/243] Fixed an issue where qwenLora could not be loaded properly. (#9208) --- comfy/lora.py | 9 +++++++++ comfy/weight_adapter/lora.py | 5 +++++ 2 files changed, 14 insertions(+) diff --git a/comfy/lora.py b/comfy/lora.py index 387d5c52a..6686b7229 100644 --- a/comfy/lora.py +++ b/comfy/lora.py @@ -293,6 +293,15 @@ def model_lora_keys_unet(model, key_map={}): key_lora = k[len("diffusion_model."):-len(".weight")] key_map["{}".format(key_lora)] = k + if isinstance(model, comfy.model_base.QwenImage): + for k in sdk: + if k.startswith("diffusion_model.") and k.endswith(".weight"): #QwenImage lora format + key_lora = k[len("diffusion_model."):-len(".weight")] + # Direct mapping for transformer_blocks format (QwenImage LoRA format) + key_map["{}".format(key_lora)] = k + # Support transformer prefix format + key_map["transformer.{}".format(key_lora)] = k + return key_map diff --git a/comfy/weight_adapter/lora.py b/comfy/weight_adapter/lora.py index 729dbd9e6..47aa17d13 100644 --- a/comfy/weight_adapter/lora.py +++ b/comfy/weight_adapter/lora.py @@ -96,6 +96,7 @@ class LoRAAdapter(WeightAdapterBase): diffusers3_lora = "{}.lora.up.weight".format(x) mochi_lora = "{}.lora_B".format(x) transformers_lora = "{}.lora_linear_layer.up.weight".format(x) + qwen_default_lora = "{}.lora_B.default.weight".format(x) A_name = None if regular_lora in lora.keys(): @@ -122,6 +123,10 @@ class LoRAAdapter(WeightAdapterBase): A_name = transformers_lora B_name = "{}.lora_linear_layer.down.weight".format(x) mid_name = None + elif qwen_default_lora in lora.keys(): + A_name = qwen_default_lora + B_name = "{}.lora_A.default.weight".format(x) + mid_name = None if A_name is not None: mid = None From 32691b16f4e1a897e461e77f9d6dceba2d6f0cd1 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Thu, 7 Aug 2025 01:26:29 +0800 Subject: [PATCH 106/243] Update template to 
0.1.52 (#9206) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 9a6b04cf2..d6926d610 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.51 +comfyui-workflow-templates==0.1.52 comfyui-embedded-docs==0.2.4 torch torchsde From 37d620a6b85f61b824363ed8170db373726ca45a Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Wed, 6 Aug 2025 16:52:39 -0700 Subject: [PATCH 107/243] Update frontend to v1.24.3 (#9175) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d6926d610..2f4692b03 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.23.4 +comfyui-frontend-package==1.24.4 comfyui-workflow-templates==0.1.52 comfyui-embedded-docs==0.2.4 torch From 05df2df489f6b237f63c5f7d42a943ae2be417e9 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 7 Aug 2025 08:20:40 -0700 Subject: [PATCH 108/243] Fix RepeatLatentBatch not working on multi dim latents. (#9227) --- nodes.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nodes.py b/nodes.py index 9bedbcaca..9448f9c1b 100644 --- a/nodes.py +++ b/nodes.py @@ -1229,12 +1229,12 @@ class RepeatLatentBatch: s = samples.copy() s_in = samples["samples"] - s["samples"] = s_in.repeat((amount, 1,1,1)) + s["samples"] = s_in.repeat((amount,) + ((1,) * (s_in.ndim - 1))) if "noise_mask" in samples and samples["noise_mask"].shape[0] > 1: masks = samples["noise_mask"] if masks.shape[0] < s_in.shape[0]: - masks = masks.repeat(math.ceil(s_in.shape[0] / masks.shape[0]), 1, 1, 1)[:s_in.shape[0]] - s["noise_mask"] = samples["noise_mask"].repeat((amount, 1,1,1)) + masks = masks.repeat((math.ceil(s_in.shape[0] / masks.shape[0]),) + ((1,) * (masks.ndim - 1)))[:s_in.shape[0]] + s["noise_mask"] = samples["noise_mask"].repeat((amount,) + ((1,) * (samples["noise_mask"].ndim - 1))) if "batch_index" in s: offset = max(s["batch_index"]) - min(s["batch_index"]) + 1 s["batch_index"] = s["batch_index"] + [x + (i * offset) for i in range(1, amount) for x in s["batch_index"]] From 42974a448c39af50c5f72d8c70267f9fe2971cd2 Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Thu, 7 Aug 2025 14:54:09 -0700 Subject: [PATCH 109/243] _ui.py import torchaudio safety check (#9234) * Added safety around torchaudio import in _ui.py * Trusted cursor too much, fixed torchaudio bool --- comfy_api/latest/_ui.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/comfy_api/latest/_ui.py b/comfy_api/latest/_ui.py index 6b8a39d58..61597038f 100644 --- a/comfy_api/latest/_ui.py +++ b/comfy_api/latest/_ui.py @@ -9,7 +9,11 @@ from typing import Type import av import numpy as np import torch -import torchaudio +try: + import torchaudio + TORCH_AUDIO_AVAILABLE = True +except ImportError: + TORCH_AUDIO_AVAILABLE = False from PIL import Image as PILImage from PIL.PngImagePlugin import PngInfo @@ -302,6 +306,8 @@ class AudioSaveHelper: # Resample if necessary if sample_rate != audio["sample_rate"]: + if not TORCH_AUDIO_AVAILABLE: + raise Exception("torchaudio is not available; cannot resample audio.") waveform = torchaudio.functional.resample(waveform, audio["sample_rate"], sample_rate) # Create output with specified format From bf2a1b5b1ef72b240454f3ac44f5209af45efe00 Mon Sep 17 00:00:00 2001 From: Alexander Piskun 
<13381981+bigcat88@users.noreply.github.com> Date: Fri, 8 Aug 2025 06:37:50 +0300 Subject: [PATCH 110/243] async API nodes (#9129) * converted API nodes to async * converted BFL API nodes to async * fixed client bug; converted gemini, ideogram, minimax * fixed client bug; converted openai nodes * fixed client bug; converted moonvalley, pika nodes * fixed client bug; converted kling, luma nodes * converted pixverse, rodin nodes * converted tripo, veo2 * converted recraft nodes * add lost log_request_response call --- comfy_api_nodes/apinode_utils.py | 152 ++--- comfy_api_nodes/apis/client.py | 901 +++++++++++----------------- comfy_api_nodes/nodes_bfl.py | 134 +++-- comfy_api_nodes/nodes_gemini.py | 4 +- comfy_api_nodes/nodes_ideogram.py | 22 +- comfy_api_nodes/nodes_kling.py | 130 ++-- comfy_api_nodes/nodes_luma.py | 70 ++- comfy_api_nodes/nodes_minimax.py | 14 +- comfy_api_nodes/nodes_moonvalley.py | 38 +- comfy_api_nodes/nodes_openai.py | 28 +- comfy_api_nodes/nodes_pika.py | 63 +- comfy_api_nodes/nodes_pixverse.py | 46 +- comfy_api_nodes/nodes_recraft.py | 44 +- comfy_api_nodes/nodes_rodin.py | 147 ++--- comfy_api_nodes/nodes_runway.py | 54 +- comfy_api_nodes/nodes_stability.py | 23 +- comfy_api_nodes/nodes_tripo.py | 69 ++- comfy_api_nodes/nodes_veo2.py | 15 +- 18 files changed, 878 insertions(+), 1076 deletions(-) diff --git a/comfy_api_nodes/apinode_utils.py b/comfy_api_nodes/apinode_utils.py index 788e2803f..f953f86df 100644 --- a/comfy_api_nodes/apinode_utils.py +++ b/comfy_api_nodes/apinode_utils.py @@ -1,4 +1,5 @@ from __future__ import annotations +import aiohttp import io import logging import mimetypes @@ -21,7 +22,6 @@ from server import PromptServer import numpy as np from PIL import Image -import requests import torch import math import base64 @@ -30,7 +30,7 @@ from io import BytesIO import av -def download_url_to_video_output(video_url: str, timeout: int = None) -> VideoFromFile: +async def download_url_to_video_output(video_url: str, timeout: int = None) -> VideoFromFile: """Downloads a video from a URL and returns a `VIDEO` output. Args: @@ -39,7 +39,7 @@ def download_url_to_video_output(video_url: str, timeout: int = None) -> VideoFr Returns: A Comfy node `VIDEO` output. """ - video_io = download_url_to_bytesio(video_url, timeout) + video_io = await download_url_to_bytesio(video_url, timeout) if video_io is None: error_msg = f"Failed to download video from {video_url}" logging.error(error_msg) @@ -62,7 +62,7 @@ def downscale_image_tensor(image, total_pixels=1536 * 1024) -> torch.Tensor: return s -def validate_and_cast_response( +async def validate_and_cast_response( response, timeout: int = None, node_id: Union[str, None] = None ) -> torch.Tensor: """Validates and casts a response to a torch.Tensor. 
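
The practical effect of this conversion on node code: anything that builds a SynchronousOperation or PollingOperation, or calls the download/upload helpers above, must itself become a coroutine and await the call. A minimal sketch of the calling pattern this commit establishes — illustrative only, with a placeholder node class, endpoint path, and request/response models that are not part of the patch:

from typing import Optional

from pydantic import BaseModel

from comfy_api_nodes.apis.client import ApiEndpoint, HttpMethod, SynchronousOperation


class ExampleRequest(BaseModel):
    # Placeholder request model; real nodes use the generated API models.
    prompt: str


class ExampleResponse(BaseModel):
    # Placeholder response model.
    url: Optional[str] = None


class ExampleApiNode:
    async def generate(self, prompt: str, **kwargs):
        operation = SynchronousOperation(
            endpoint=ApiEndpoint(
                path="/proxy/example/generate",  # placeholder endpoint path
                method=HttpMethod.POST,
                request_model=ExampleRequest,
                response_model=ExampleResponse,
            ),
            request=ExampleRequest(prompt=prompt),
            auth_kwargs=kwargs,
        )
        # Before this commit: response = operation.execute()
        response = await operation.execute()
        return response

PollingOperation follows the same pattern: construct it as before and await its execute() call, as the updated client.py docstring below shows.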
@@ -86,35 +86,24 @@ def validate_and_cast_response( image_tensors: list[torch.Tensor] = [] # Process each image in the data array - for image_data in data: - image_url = image_data.url - b64_data = image_data.b64_json + async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=timeout)) as session: + for img_data in data: + img_bytes: bytes + if img_data.b64_json: + img_bytes = base64.b64decode(img_data.b64_json) + elif img_data.url: + if node_id: + PromptServer.instance.send_progress_text(f"Result URL: {img_data.url}", node_id) + async with session.get(img_data.url) as resp: + if resp.status != 200: + raise ValueError("Failed to download generated image") + img_bytes = await resp.read() + else: + raise ValueError("Invalid image payload – neither URL nor base64 data present.") - if not image_url and not b64_data: - raise ValueError("No image was generated in the response") - - if b64_data: - img_data = base64.b64decode(b64_data) - img = Image.open(io.BytesIO(img_data)) - - elif image_url: - if node_id: - PromptServer.instance.send_progress_text( - f"Result URL: {image_url}", node_id - ) - img_response = requests.get(image_url, timeout=timeout) - if img_response.status_code != 200: - raise ValueError("Failed to download the image") - img = Image.open(io.BytesIO(img_response.content)) - - img = img.convert("RGBA") - - # Convert to numpy array, normalize to float32 between 0 and 1 - img_array = np.array(img).astype(np.float32) / 255.0 - img_tensor = torch.from_numpy(img_array) - - # Add to list of tensors - image_tensors.append(img_tensor) + pil_img = Image.open(BytesIO(img_bytes)).convert("RGBA") + arr = np.asarray(pil_img).astype(np.float32) / 255.0 + image_tensors.append(torch.from_numpy(arr)) return torch.stack(image_tensors, dim=0) @@ -175,7 +164,7 @@ def mimetype_to_extension(mime_type: str) -> str: return mime_type.split("/")[-1].lower() -def download_url_to_bytesio(url: str, timeout: int = None) -> BytesIO: +async def download_url_to_bytesio(url: str, timeout: int = None) -> BytesIO: """Downloads content from a URL using requests and returns it as BytesIO. Args: @@ -185,9 +174,11 @@ def download_url_to_bytesio(url: str, timeout: int = None) -> BytesIO: Returns: BytesIO object containing the downloaded content. 
""" - response = requests.get(url, stream=True, timeout=timeout) - response.raise_for_status() # Raises HTTPError for bad responses (4XX or 5XX) - return BytesIO(response.content) + timeout_cfg = aiohttp.ClientTimeout(total=timeout) if timeout else None + async with aiohttp.ClientSession(timeout=timeout_cfg) as session: + async with session.get(url) as resp: + resp.raise_for_status() # Raises HTTPError for bad responses (4XX or 5XX) + return BytesIO(await resp.read()) def bytesio_to_image_tensor(image_bytesio: BytesIO, mode: str = "RGBA") -> torch.Tensor: @@ -210,15 +201,15 @@ def bytesio_to_image_tensor(image_bytesio: BytesIO, mode: str = "RGBA") -> torch return torch.from_numpy(image_array).unsqueeze(0) -def download_url_to_image_tensor(url: str, timeout: int = None) -> torch.Tensor: +async def download_url_to_image_tensor(url: str, timeout: int = None) -> torch.Tensor: """Downloads an image from a URL and returns a [B, H, W, C] tensor.""" - image_bytesio = download_url_to_bytesio(url, timeout) + image_bytesio = await download_url_to_bytesio(url, timeout) return bytesio_to_image_tensor(image_bytesio) -def process_image_response(response: requests.Response) -> torch.Tensor: +def process_image_response(response_content: bytes | str) -> torch.Tensor: """Uses content from a Response object and converts it to a torch.Tensor""" - return bytesio_to_image_tensor(BytesIO(response.content)) + return bytesio_to_image_tensor(BytesIO(response_content)) def _tensor_to_pil(image: torch.Tensor, total_pixels: int = 2048 * 2048) -> Image.Image: @@ -336,10 +327,10 @@ def text_filepath_to_data_uri(filepath: str) -> str: return f"data:{mime_type};base64,{base64_string}" -def upload_file_to_comfyapi( +async def upload_file_to_comfyapi( file_bytes_io: BytesIO, filename: str, - upload_mime_type: str, + upload_mime_type: Optional[str], auth_kwargs: Optional[dict[str, str]] = None, ) -> str: """ @@ -354,7 +345,10 @@ def upload_file_to_comfyapi( Returns: The download URL for the uploaded file. 
""" - request_object = UploadRequest(file_name=filename, content_type=upload_mime_type) + if upload_mime_type is None: + request_object = UploadRequest(file_name=filename) + else: + request_object = UploadRequest(file_name=filename, content_type=upload_mime_type) operation = SynchronousOperation( endpoint=ApiEndpoint( path="/customers/storage", @@ -366,12 +360,8 @@ def upload_file_to_comfyapi( auth_kwargs=auth_kwargs, ) - response: UploadResponse = operation.execute() - upload_response = ApiClient.upload_file( - response.upload_url, file_bytes_io, content_type=upload_mime_type - ) - upload_response.raise_for_status() - + response: UploadResponse = await operation.execute() + await ApiClient.upload_file(response.upload_url, file_bytes_io, content_type=upload_mime_type) return response.download_url @@ -399,7 +389,7 @@ def video_to_base64_string( return base64.b64encode(video_bytes_io.getvalue()).decode("utf-8") -def upload_video_to_comfyapi( +async def upload_video_to_comfyapi( video: VideoInput, auth_kwargs: Optional[dict[str, str]] = None, container: VideoContainer = VideoContainer.MP4, @@ -439,9 +429,7 @@ def upload_video_to_comfyapi( video.save_to(video_bytes_io, format=container, codec=codec) video_bytes_io.seek(0) - return upload_file_to_comfyapi( - video_bytes_io, filename, upload_mime_type, auth_kwargs - ) + return await upload_file_to_comfyapi(video_bytes_io, filename, upload_mime_type, auth_kwargs) def audio_tensor_to_contiguous_ndarray(waveform: torch.Tensor) -> np.ndarray: @@ -501,7 +489,7 @@ def audio_ndarray_to_bytesio( return audio_bytes_io -def upload_audio_to_comfyapi( +async def upload_audio_to_comfyapi( audio: AudioInput, auth_kwargs: Optional[dict[str, str]] = None, container_format: str = "mp4", @@ -527,7 +515,7 @@ def upload_audio_to_comfyapi( audio_data_np, sample_rate, container_format, codec_name ) - return upload_file_to_comfyapi(audio_bytes_io, filename, mime_type, auth_kwargs) + return await upload_file_to_comfyapi(audio_bytes_io, filename, mime_type, auth_kwargs) def audio_to_base64_string( @@ -544,7 +532,7 @@ def audio_to_base64_string( return base64.b64encode(audio_bytes).decode("utf-8") -def upload_images_to_comfyapi( +async def upload_images_to_comfyapi( image: torch.Tensor, max_images=8, auth_kwargs: Optional[dict[str, str]] = None, @@ -561,55 +549,15 @@ def upload_images_to_comfyapi( mime_type: Optional MIME type for the image. 
""" # if batch, try to upload each file if max_images is greater than 0 - idx_image = 0 download_urls: list[str] = [] is_batch = len(image.shape) > 3 - batch_length = 1 - if is_batch: - batch_length = image.shape[0] - while True: - curr_image = image - if len(image.shape) > 3: - curr_image = image[idx_image] - # get BytesIO version of image - img_binary = tensor_to_bytesio(curr_image, mime_type=mime_type) - # first, request upload/download urls from comfy API - if not mime_type: - request_object = UploadRequest(file_name=img_binary.name) - else: - request_object = UploadRequest( - file_name=img_binary.name, content_type=mime_type - ) - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/customers/storage", - method=HttpMethod.POST, - request_model=UploadRequest, - response_model=UploadResponse, - ), - request=request_object, - auth_kwargs=auth_kwargs, - ) - response = operation.execute() + batch_len = image.shape[0] if is_batch else 1 - upload_response = ApiClient.upload_file( - response.upload_url, img_binary, content_type=mime_type - ) - # verify success - try: - upload_response.raise_for_status() - except requests.exceptions.HTTPError as e: - raise ValueError(f"Could not upload one or more images: {e}") from e - # add download_url to list - download_urls.append(response.download_url) - - idx_image += 1 - # stop uploading additional files if done - if is_batch and max_images > 0: - if idx_image >= max_images: - break - if idx_image >= batch_length: - break + for idx in range(min(batch_len, max_images)): + tensor = image[idx] if is_batch else image + img_io = tensor_to_bytesio(tensor, mime_type=mime_type) + url = await upload_file_to_comfyapi(img_io, img_io.name, mime_type, auth_kwargs) + download_urls.append(url) return download_urls diff --git a/comfy_api_nodes/apis/client.py b/comfy_api_nodes/apis/client.py index 2a4bac88b..4ad0b783b 100644 --- a/comfy_api_nodes/apis/client.py +++ b/comfy_api_nodes/apis/client.py @@ -43,7 +43,7 @@ operation = ApiOperation( endpoint=user_info_endpoint, request=request ) -user_profile = operation.execute(client=api_client) # Returns immediately with the result +user_profile = await operation.execute(client=api_client) # Returns immediately with the result # Example 2: Asynchronous API Operation with Polling @@ -87,18 +87,19 @@ operation = PollingOperation( ) # This will make the initial request and then poll until completion -result = operation.execute(client=api_client) # Returns the final ImageGenerationResult when done +result = await operation.execute(client=api_client) # Returns the final ImageGenerationResult when done """ from __future__ import annotations +import aiohttp +import asyncio import logging -import time import io import socket +from aiohttp.client_exceptions import ClientError, ClientResponseError from typing import Dict, Type, Optional, Any, TypeVar, Generic, Callable, Tuple from enum import Enum import json -import requests from urllib.parse import urljoin, urlparse from pydantic import BaseModel, Field import uuid # For generating unique operation IDs @@ -174,6 +175,7 @@ class ApiClient: retry_delay: float = 1.0, retry_backoff_factor: float = 2.0, retry_status_codes: Optional[Tuple[int, ...]] = None, + session: Optional[aiohttp.ClientSession] = None, ): self.base_url = base_url self.auth_token = auth_token @@ -186,13 +188,16 @@ class ApiClient: # Default retry status codes: 408 (Request Timeout), 429 (Too Many Requests), # 500, 502, 503, 504 (Server Errors) self.retry_status_codes = retry_status_codes or (408, 429, 
500, 502, 503, 504) + self._session: Optional[aiohttp.ClientSession] = session + self._owns_session = session is None # Track if we have to close it - def _generate_operation_id(self, path: str) -> str: + @staticmethod + def _generate_operation_id(path: str) -> str: """Generates a unique operation ID for logging.""" return f"{path.strip('/').replace('/', '_')}_{uuid.uuid4().hex[:8]}" + @staticmethod def _create_json_payload_args( - self, data: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None, ) -> Dict[str, Any]: @@ -203,31 +208,53 @@ class ApiClient: def _create_form_data_args( self, - data: Dict[str, Any], - files: Dict[str, Any], + data: Dict[str, Any] | None, + files: Dict[str, Any] | None, headers: Optional[Dict[str, str]] = None, - multipart_parser = None, + multipart_parser: Callable | None = None, ) -> Dict[str, Any]: if headers and "Content-Type" in headers: del headers["Content-Type"] - if multipart_parser: + if multipart_parser and data: data = multipart_parser(data) - return { - "data": data, - "files": files, - "headers": headers, - } + form = aiohttp.FormData(default_to_multipart=True) + if data: # regular text fields + for k, v in data.items(): + if v is None: + continue # aiohttp fails to serialize "None" values + # aiohttp expects strings or bytes; convert enums etc. + form.add_field(k, str(v) if not isinstance(v, (bytes, bytearray)) else v) + if files: + file_iter = files if isinstance(files, list) else files.items() + for field_name, file_obj in file_iter: + if file_obj is None: + continue # aiohttp fails to serialize "None" values + # file_obj can be (filename, bytes/io.BytesIO, content_type) tuple + if isinstance(file_obj, tuple): + filename, file_value, content_type = self._unpack_tuple(file_obj) + else: + file_value = file_obj + filename = getattr(file_obj, "name", field_name) + content_type = "application/octet-stream" + + form.add_field( + name=field_name, + value=file_value, + filename=filename, + content_type=content_type, + ) + return {"data": form, "headers": headers or {}} + + @staticmethod def _create_urlencoded_form_data_args( - self, data: Dict[str, Any], headers: Optional[Dict[str, str]] = None, ) -> Dict[str, Any]: headers = headers or {} headers["Content-Type"] = "application/x-www-form-urlencoded" - return { "data": data, "headers": headers, @@ -244,7 +271,7 @@ class ApiClient: return headers - def _check_connectivity(self, target_url: str) -> Dict[str, bool]: + async def _check_connectivity(self, target_url: str) -> Dict[str, bool]: """ Check connectivity to determine if network issues are local or server-related. 
@@ -258,52 +285,39 @@ class ApiClient: "internet_accessible": False, "api_accessible": False, "is_local_issue": False, - "is_api_issue": False + "is_api_issue": False, } + timeout = aiohttp.ClientTimeout(total=5.0) + async with aiohttp.ClientSession(timeout=timeout) as session: + try: + async with session.get("https://www.google.com", ssl=self.verify_ssl) as resp: + results["internet_accessible"] = resp.status < 500 + except (ClientError, asyncio.TimeoutError, socket.gaierror): + results["is_local_issue"] = True + return results # cannot reach the internet – early exit - # First check basic internet connectivity using a reliable external site - try: - # Use a reliable external domain for checking basic connectivity - check_response = requests.get("https://www.google.com", - timeout=5.0, - verify=self.verify_ssl) - if check_response.status_code < 500: - results["internet_accessible"] = True - except (requests.RequestException, socket.error): - results["internet_accessible"] = False - results["is_local_issue"] = True - return results - - # Now check API server connectivity - try: - # Extract domain from the target URL to do a simpler health check - parsed_url = urlparse(target_url) - api_base = f"{parsed_url.scheme}://{parsed_url.netloc}" - - # Try to reach the API domain - api_response = requests.get(f"{api_base}/health", timeout=5.0, verify=self.verify_ssl) - if api_response.status_code < 500: - results["api_accessible"] = True - else: - results["api_accessible"] = False - results["is_api_issue"] = True - except requests.RequestException: - results["api_accessible"] = False - # If we can reach the internet but not the API, it's an API issue - results["is_api_issue"] = True + # Now check API health endpoint + parsed = urlparse(target_url) + health_url = f"{parsed.scheme}://{parsed.netloc}/health" + try: + async with session.get(health_url, ssl=self.verify_ssl) as resp: + results["api_accessible"] = resp.status < 500 + except ClientError: + pass # leave as False + results["is_api_issue"] = results["internet_accessible"] and not results["api_accessible"] return results - def request( + async def request( self, method: str, path: str, params: Optional[Dict[str, Any]] = None, data: Optional[Dict[str, Any]] = None, - files: Optional[Dict[str, Any]] = None, + files: Optional[Dict[str, Any] | list[tuple[str, Any]]] = None, headers: Optional[Dict[str, str]] = None, content_type: str = "application/json", - multipart_parser: Callable = None, + multipart_parser: Callable | None = None, retry_count: int = 0, # Used internally for tracking retries ) -> Dict[str, Any]: """ @@ -327,18 +341,19 @@ class ApiClient: ApiServerError: If the API server is unreachable but internet is working Exception: For other request failures """ - # Use urljoin but ensure path is relative to avoid absolute path behavior - relative_path = path.lstrip('/') + + # Build full URL and merge headers + relative_path = path.lstrip("/") url = urljoin(self.base_url, relative_path) - self.check_auth(self.auth_token, self.comfy_api_key) - # Combine default headers with any provided headers + self._check_auth(self.auth_token, self.comfy_api_key) + request_headers = self.get_headers() if headers: request_headers.update(headers) - - # Let requests handle the content type when files are present. 
if files: - del request_headers["Content-Type"] + request_headers.pop("Content-Type", None) + if params: + params = {k: v for k, v in params.items() if v is not None} # aiohttp fails to serialize None values logging.debug(f"[DEBUG] Request Headers: {request_headers}") logging.debug(f"[DEBUG] Files: {files}") @@ -346,11 +361,9 @@ class ApiClient: logging.debug(f"[DEBUG] Data: {data}") if content_type == "application/x-www-form-urlencoded": - payload_args = self._create_urlencoded_form_data_args(data, request_headers) + payload_args = self._create_urlencoded_form_data_args(data or {}, request_headers) elif content_type == "multipart/form-data": - payload_args = self._create_form_data_args( - data, files, request_headers, multipart_parser - ) + payload_args = self._create_form_data_args(data, files, request_headers, multipart_parser) else: payload_args = self._create_json_payload_args(data, request_headers) @@ -361,220 +374,67 @@ class ApiClient: request_url=url, request_headers=request_headers, request_params=params, - request_data=data if content_type == "application/json" else "[form-data or other]" + request_data=data if content_type == "application/json" else "[form-data or other]", ) + session = await self._get_session() try: - response = requests.request( - method=method, - url=url, + async with session.request( + method, + url, params=params, - timeout=self.timeout, - verify=self.verify_ssl, + ssl=self.verify_ssl, **payload_args, - ) + ) as resp: + if resp.status >= 400: + try: + error_data = await resp.json() + except (aiohttp.ContentTypeError, json.JSONDecodeError): + error_data = await resp.text() - # Check if we should retry based on status code - if (response.status_code in self.retry_status_codes and - retry_count < self.max_retries): + return await self._handle_http_error( + ClientResponseError(resp.request_info, resp.history, status=resp.status, message=error_data), + operation_id, + method, + url, + params, + data, + files, + headers, + content_type, + multipart_parser, + retry_count=retry_count, + response_content=error_data, + ) - # Calculate delay with exponential backoff - delay = self.retry_delay * (self.retry_backoff_factor ** retry_count) - - logging.warning( - f"Request failed with status {response.status_code}. 
" - f"Retrying in {delay:.2f}s ({retry_count + 1}/{self.max_retries})" - ) - - time.sleep(delay) - return self.request( - method=method, - path=path, - params=params, - data=data, - files=files, - headers=headers, - content_type=content_type, - multipart_parser=multipart_parser, - retry_count=retry_count + 1, - ) - - # Raise exception for error status codes - response.raise_for_status() - - # Log successful response - response_content_to_log = response.content - try: - # Attempt to parse JSON for prettier logging, fallback to raw content - response_content_to_log = response.json() - except json.JSONDecodeError: - pass # Keep as bytes/str if not JSON - - request_logger.log_request_response( - operation_id=operation_id, - request_method=method, # Pass request details again for context in log - request_url=url, - response_status_code=response.status_code, - response_headers=dict(response.headers), - response_content=response_content_to_log - ) - - except requests.ConnectionError as e: - error_message = f"ConnectionError: {str(e)}" - request_logger.log_request_response( - operation_id=operation_id, - request_method=method, - request_url=url, - error_message=error_message - ) - # Only perform connectivity check if we've exhausted all retries - if retry_count >= self.max_retries: - # Check connectivity to determine if it's a local or API issue - connectivity = self._check_connectivity(self.base_url) - - if connectivity["is_local_issue"]: - raise LocalNetworkError( - "Unable to connect to the API server due to local network issues. " - "Please check your internet connection and try again." - ) from e - elif connectivity["is_api_issue"]: - raise ApiServerError( - f"The API server at {self.base_url} is currently unreachable. " - f"The service may be experiencing issues. Please try again later." - ) from e - - # If we haven't exhausted retries yet, retry the request - if retry_count < self.max_retries: - delay = self.retry_delay * (self.retry_backoff_factor ** retry_count) - logging.warning( - f"Connection error: {str(e)}. " - f"Retrying in {delay:.2f}s ({retry_count + 1}/{self.max_retries})" - ) - time.sleep(delay) - return self.request( - method=method, - path=path, - params=params, - data=data, - files=files, - headers=headers, - content_type=content_type, - multipart_parser=multipart_parser, - retry_count=retry_count + 1, - ) - - # If we've exhausted retries and didn't identify the specific issue, - # raise a generic exception - final_error_message = ( - f"Unable to connect to the API server after {self.max_retries} attempts. " - f"Please check your internet connection or try again later." - ) - request_logger.log_request_response( # Log final failure - operation_id=operation_id, - request_method=method, request_url=url, - error_message=final_error_message - ) - raise Exception(final_error_message) from e - - except requests.Timeout as e: - error_message = f"Timeout: {str(e)}" - request_logger.log_request_response( - operation_id=operation_id, - request_method=method, request_url=url, - error_message=error_message - ) - # Retry timeouts if we haven't exhausted retries - if retry_count < self.max_retries: - delay = self.retry_delay * (self.retry_backoff_factor ** retry_count) - logging.warning( - f"Request timed out. 
" - f"Retrying in {delay:.2f}s ({retry_count + 1}/{self.max_retries})" - ) - time.sleep(delay) - return self.request( - method=method, - path=path, - params=params, - data=data, - files=files, - headers=headers, - content_type=content_type, - multipart_parser=multipart_parser, - retry_count=retry_count + 1, - ) - final_error_message = ( - f"Request timed out after {self.timeout} seconds and {self.max_retries} retry attempts. " - f"The server might be experiencing high load or the operation is taking longer than expected." - ) - request_logger.log_request_response( # Log final failure - operation_id=operation_id, - request_method=method, request_url=url, - error_message=final_error_message - ) - raise Exception(final_error_message) from e - - except requests.HTTPError as e: - status_code = e.response.status_code if hasattr(e, "response") else None - original_error_message = f"HTTP Error: {str(e)}" - error_content_for_log = None - if hasattr(e, "response") and e.response is not None: - error_content_for_log = e.response.content + # Success – parse JSON (safely) and log try: - error_content_for_log = e.response.json() - except json.JSONDecodeError: - pass + payload = await resp.json() + response_content_to_log = payload + except (aiohttp.ContentTypeError, json.JSONDecodeError): + payload = {} + response_content_to_log = await resp.text() - - # Try to extract detailed error message from JSON response for user display - # but log the full error content. - user_display_error_message = original_error_message - - try: - if hasattr(e, "response") and e.response is not None and e.response.content: - error_json = e.response.json() - if "error" in error_json and "message" in error_json["error"]: - user_display_error_message = f"API Error: {error_json['error']['message']}" - if "type" in error_json["error"]: - user_display_error_message += f" (Type: {error_json['error']['type']})" - elif isinstance(error_json, dict): # Handle cases where error is just a JSON dict - user_display_error_message = f"API Error: {json.dumps(error_json)}" - else: # Non-dict JSON error - user_display_error_message = f"API Error: {str(error_json)}" - except json.JSONDecodeError: - # If not JSON, use the raw content if it's not too long, or a summary - if hasattr(e, "response") and e.response is not None and e.response.content: - raw_content = e.response.content.decode(errors='ignore') - if len(raw_content) < 200: # Arbitrary limit for display - user_display_error_message = f"API Error (raw): {raw_content}" - else: - user_display_error_message = f"API Error (raw, status {status_code})" - - request_logger.log_request_response( - operation_id=operation_id, - request_method=method, request_url=url, - response_status_code=status_code, - response_headers=dict(e.response.headers) if hasattr(e, "response") and e.response is not None else None, - response_content=error_content_for_log, - error_message=original_error_message # Log the original exception string as error - ) - - logging.debug(f"[DEBUG] API Error: {user_display_error_message} (Status: {status_code})") - if hasattr(e, "response") and e.response is not None and e.response.content: - logging.debug(f"[DEBUG] Response content: {e.response.content}") - - # Retry if the status code is in our retry list and we haven't exhausted retries - if (status_code in self.retry_status_codes and - retry_count < self.max_retries): - - delay = self.retry_delay * (self.retry_backoff_factor ** retry_count) - logging.warning( - f"HTTP error {status_code}. 
" - f"Retrying in {delay:.2f}s ({retry_count + 1}/{self.max_retries})" + request_logger.log_request_response( + operation_id=operation_id, + request_method=method, + request_url=url, + response_status_code=resp.status, + response_headers=dict(resp.headers), + response_content=response_content_to_log, ) - time.sleep(delay) - return self.request( - method=method, - path=path, + return payload + + except (ClientError, asyncio.TimeoutError, socket.gaierror) as e: + # Treat as *connection* problem – optionally retry, else escalate + if retry_count < self.max_retries: + delay = self.retry_delay * (self.retry_backoff_factor ** retry_count) + logging.warning("Connection error. Retrying in %.2fs (%s/%s): %s", delay, retry_count + 1, + self.max_retries, str(e)) + await asyncio.sleep(delay) + return await self.request( + method, + path, params=params, data=data, files=files, @@ -583,40 +443,34 @@ class ApiClient: multipart_parser=multipart_parser, retry_count=retry_count + 1, ) + # One final connectivity check for diagnostics + connectivity = await self._check_connectivity(self.base_url) + if connectivity["is_local_issue"]: + raise LocalNetworkError( + "Unable to connect to the API server due to local network issues. " + "Please check your internet connection and try again." + ) from e + raise ApiServerError( + f"The API server at {self.base_url} is currently unreachable. " + f"The service may be experiencing issues. Please try again later." + ) from e - # Specific error messages for common status codes for user display - if status_code == 401: - user_display_error_message = "Unauthorized: Please login first to use this node." - elif status_code == 402: - user_display_error_message = "Payment Required: Please add credits to your account to use this node." - elif status_code == 409: - user_display_error_message = "There is a problem with your account. Please contact support@comfy.org." - elif status_code == 429: - user_display_error_message = "Rate Limit Exceeded: Please try again later." - # else, user_display_error_message remains as parsed from response or original HTTPError string - - raise Exception(user_display_error_message) # Raise with the user-friendly message - - # Parse and return JSON response - if response.content: - return response.json() - return {} - - def check_auth(self, auth_token, comfy_api_key): + @staticmethod + def _check_auth(auth_token, comfy_api_key): """Verify that an auth token is present or comfy_api_key is present""" if auth_token is None and comfy_api_key is None: raise Exception("Unauthorized: Please login first to use this node.") return auth_token or comfy_api_key @staticmethod - def upload_file( + async def upload_file( upload_url: str, file: io.BytesIO | str, content_type: str | None = None, max_retries: int = 3, retry_delay: float = 1.0, retry_backoff_factor: float = 2.0, - ): + ) -> aiohttp.ClientResponse: """Upload a file to the API with retry logic. Args: @@ -627,112 +481,167 @@ class ApiClient: retry_delay: Initial delay between retries in seconds retry_backoff_factor: Multiplier for the delay after each retry """ - headers = {} + headers: Dict[str, str] = {} + skip_auto_headers: set[str] = set() if content_type: headers["Content-Type"] = content_type + else: + # tell aiohttp not to add Content-Type that will break the request signature and result in a 403 status. 
+ skip_auto_headers.add("Content-Type") - # Prepare the file data + # Extract file bytes if isinstance(file, io.BytesIO): - file.seek(0) # Ensure we're at the start of the file + file.seek(0) data = file.read() elif isinstance(file, str): with open(file, "rb") as f: data = f.read() else: - raise ValueError("File must be either a BytesIO object or a file path string") + raise ValueError("File must be BytesIO or str path") - # Try the upload with retries - last_exception = None - operation_id = f"upload_{upload_url.split('/')[-1]}_{uuid.uuid4().hex[:8]}" # Simplified ID for uploads - - # Log initial attempt (without full file data for brevity) + operation_id = f"upload_{upload_url.split('/')[-1]}_{uuid.uuid4().hex[:8]}" request_logger.log_request_response( operation_id=operation_id, request_method="PUT", request_url=upload_url, request_headers=headers, - request_data=f"[File data of type {content_type or 'unknown'}, size {len(data)} bytes]" + request_data=f"[File data {len(data)} bytes]", ) - for retry_attempt in range(max_retries + 1): + delay = retry_delay + for attempt in range(max_retries + 1): try: - response = requests.put(upload_url, data=data, headers=headers) - response.raise_for_status() + timeout = aiohttp.ClientTimeout(total=None) # honour server side timeouts + async with aiohttp.ClientSession(timeout=timeout) as session: + async with session.put( + upload_url, data=data, headers=headers, skip_auto_headers=skip_auto_headers, + ) as resp: + resp.raise_for_status() + request_logger.log_request_response( + operation_id=operation_id, + request_method="PUT", + request_url=upload_url, + response_status_code=resp.status, + response_headers=dict(resp.headers), + response_content="File uploaded successfully.", + ) + return resp + except (ClientError, asyncio.TimeoutError) as e: request_logger.log_request_response( operation_id=operation_id, - request_method="PUT", request_url=upload_url, # For context - response_status_code=response.status_code, - response_headers=dict(response.headers), - response_content="File uploaded successfully." # Or response.text if available + request_method="PUT", + request_url=upload_url, + response_status_code=e.status if hasattr(e, "status") else None, + response_headers=dict(e.headers) if getattr(e, "headers") else None, + response_content=None, + error_message=f"{type(e).__name__}: {str(e)}", ) - return response - - except (requests.ConnectionError, requests.Timeout, requests.HTTPError) as e: - last_exception = e - error_message_for_log = f"{type(e).__name__}: {str(e)}" - response_content_for_log = None - status_code_for_log = None - headers_for_log = None - - if hasattr(e, 'response') and e.response is not None: - status_code_for_log = e.response.status_code - headers_for_log = dict(e.response.headers) - try: - response_content_for_log = e.response.json() - except json.JSONDecodeError: - response_content_for_log = e.response.content - - - request_logger.log_request_response( - operation_id=operation_id, - request_method="PUT", request_url=upload_url, - response_status_code=status_code_for_log, - response_headers=headers_for_log, - response_content=response_content_for_log, - error_message=error_message_for_log - ) - - if retry_attempt < max_retries: - delay = retry_delay * (retry_backoff_factor ** retry_attempt) + if attempt < max_retries: logging.warning( - f"File upload failed: {str(e)}. " - f"Retrying in {delay:.2f}s ({retry_attempt + 1}/{max_retries})" + "Upload failed (%s/%s). Retrying in %.2fs. 
%s", attempt + 1, max_retries, delay, str(e) ) - time.sleep(delay) + await asyncio.sleep(delay) + delay *= retry_backoff_factor else: - break # Max retries reached + raise NetworkError(f"Failed to upload file after {max_retries + 1} attempts: {e}") from e - # If we've exhausted all retries, determine the final error type and raise - final_error_message = f"Failed to upload file after {max_retries + 1} attempts. Error: {str(last_exception)}" - try: - # Check basic internet connectivity - check_response = requests.get("https://www.google.com", timeout=5.0, verify=True) # Assuming verify=True is desired - if check_response.status_code >= 500: # Google itself has an issue (rare) - final_error_message = (f"Failed to upload file. Internet connectivity check to Google failed " - f"(status {check_response.status_code}). Original error: {str(last_exception)}") - # Not raising LocalNetworkError here as Google itself might be down. - # If Google is reachable, the issue is likely with the upload server or a more specific local problem - # not caught by a simple Google ping (e.g., DNS for the specific upload URL, firewall). - # The original last_exception is probably most relevant. + async def _handle_http_error( + self, + exc: ClientResponseError, + operation_id: str, + *req_meta, + retry_count: int, + response_content: dict | str = "", + ) -> Dict[str, Any]: + status_code = exc.status + if status_code == 401: + user_friendly = "Unauthorized: Please login first to use this node." + elif status_code == 402: + user_friendly = "Payment Required: Please add credits to your account to use this node." + elif status_code == 409: + user_friendly = "There is a problem with your account. Please contact support@comfy.org." + elif status_code == 429: + user_friendly = "Rate Limit Exceeded: Please try again later." + else: + if isinstance(response_content, dict): + if "error" in response_content and "message" in response_content["error"]: + user_friendly = f"API Error: {response_content['error']['message']}" + if "type" in response_content["error"]: + user_friendly += f" (Type: {response_content['error']['type']})" + else: # Handle cases where error is just a JSON dict with unknown format + user_friendly = f"API Error: {json.dumps(response_content)}" + else: + if len(response_content) < 200: # Arbitrary limit for display + user_friendly = f"API Error (raw): {response_content}" + else: + user_friendly = f"API Error (raw, status {response_content})" - except (requests.RequestException, socket.error) as conn_check_exc: - # Could not reach Google, likely a local network issue - final_error_message = (f"Failed to upload file due to network connectivity issues " - f"(cannot reach Google: {str(conn_check_exc)}). 
" - f"Original upload error: {str(last_exception)}") - request_logger.log_request_response( # Log final failure reason - operation_id=operation_id, - request_method="PUT", request_url=upload_url, - error_message=final_error_message - ) - raise LocalNetworkError(final_error_message) from last_exception - - request_logger.log_request_response( # Log final failure reason if not LocalNetworkError + request_logger.log_request_response( operation_id=operation_id, - request_method="PUT", request_url=upload_url, - error_message=final_error_message + request_method=req_meta[0], + request_url=req_meta[1], + response_status_code=exc.status, + response_headers=dict(req_meta[5]) if req_meta[5] else None, + response_content=response_content, + error_message=f"HTTP Error {exc.status}", ) - raise Exception(final_error_message) from last_exception + + logging.debug(f"[DEBUG] API Error: {user_friendly} (Status: {status_code})") + if response_content: + logging.debug(f"[DEBUG] Response content: {response_content}") + + # Retry if eligible + if status_code in self.retry_status_codes and retry_count < self.max_retries: + delay = self.retry_delay * (self.retry_backoff_factor ** retry_count) + logging.warning( + "HTTP error %s. Retrying in %.2fs (%s/%s)", + status_code, + delay, + retry_count + 1, + self.max_retries, + ) + await asyncio.sleep(delay) + return await self.request( + req_meta[0], # method + req_meta[1].replace(self.base_url, ""), # path + params=req_meta[2], + data=req_meta[3], + files=req_meta[4], + headers=req_meta[5], + content_type=req_meta[6], + multipart_parser=req_meta[7], + retry_count=retry_count + 1, + ) + + raise Exception(user_friendly) from exc + + @staticmethod + def _unpack_tuple(t): + """Helper to normalise (filename, file, content_type) tuples.""" + if len(t) == 3: + return t + elif len(t) == 2: + return t[0], t[1], "application/octet-stream" + else: + raise ValueError("files tuple must be (filename, file[, content_type])") + + async def _get_session(self) -> aiohttp.ClientSession: + if self._session is None or self._session.closed: + timeout = aiohttp.ClientTimeout(total=self.timeout) + self._session = aiohttp.ClientSession(timeout=timeout) + self._owns_session = True + return self._session + + async def close(self) -> None: + if self._owns_session and self._session and not self._session.closed: + await self._session.close() + + async def __aenter__(self) -> "ApiClient": + """Allow usage as async‑context‑manager – ensures clean teardown""" + return self + + async def __aexit__(self, exc_type, exc, tb): + await self.close() class ApiEndpoint(Generic[T, R]): @@ -763,31 +672,28 @@ class ApiEndpoint(Generic[T, R]): class SynchronousOperation(Generic[T, R]): - """ - Represents a single synchronous API operation. 
- """ + """Represents a single synchronous API operation.""" def __init__( self, endpoint: ApiEndpoint[T, R], request: T, - files: Optional[Dict[str, Any]] = None, + files: Optional[Dict[str, Any] | list[tuple[str, Any]]] = None, api_base: str | None = None, auth_token: Optional[str] = None, comfy_api_key: Optional[str] = None, - auth_kwargs: Optional[Dict[str,str]] = None, + auth_kwargs: Optional[Dict[str, str]] = None, timeout: float = 604800.0, verify_ssl: bool = True, content_type: str = "application/json", - multipart_parser: Callable = None, + multipart_parser: Callable | None = None, max_retries: int = 3, retry_delay: float = 1.0, retry_backoff_factor: float = 2.0, - ): + ) -> None: self.endpoint = endpoint self.request = request - self.response = None - self.error = None + self.files = files self.api_base: str = api_base or args.comfy_api_base self.auth_token = auth_token self.comfy_api_key = comfy_api_key @@ -796,91 +702,64 @@ class SynchronousOperation(Generic[T, R]): self.comfy_api_key = auth_kwargs.get("comfy_api_key", self.comfy_api_key) self.timeout = timeout self.verify_ssl = verify_ssl - self.files = files self.content_type = content_type self.multipart_parser = multipart_parser self.max_retries = max_retries self.retry_delay = retry_delay self.retry_backoff_factor = retry_backoff_factor - def execute(self, client: Optional[ApiClient] = None) -> R: - """Execute the API operation using the provided client or create one with retry support""" - try: - # Create client if not provided - if client is None: - client = ApiClient( - base_url=self.api_base, - auth_token=self.auth_token, - comfy_api_key=self.comfy_api_key, - timeout=self.timeout, - verify_ssl=self.verify_ssl, - max_retries=self.max_retries, - retry_delay=self.retry_delay, - retry_backoff_factor=self.retry_backoff_factor, - ) - - # Convert request model to dict, but use None for EmptyRequest - request_dict = ( - None - if isinstance(self.request, EmptyRequest) - else self.request.model_dump(exclude_none=True) + async def execute(self, client: Optional[ApiClient] = None) -> R: + owns_client = client is None + if owns_client: + client = ApiClient( + base_url=self.api_base, + auth_token=self.auth_token, + comfy_api_key=self.comfy_api_key, + timeout=self.timeout, + verify_ssl=self.verify_ssl, + max_retries=self.max_retries, + retry_delay=self.retry_delay, + retry_backoff_factor=self.retry_backoff_factor, ) - if request_dict: - for key, value in request_dict.items(): - if isinstance(value, Enum): - request_dict[key] = value.value - # Debug log for request + try: + request_dict: Optional[Dict[str, Any]] + if isinstance(self.request, EmptyRequest): + request_dict = None + else: + request_dict = self.request.model_dump(exclude_none=True) + for k, v in list(request_dict.items()): + if isinstance(v, Enum): + request_dict[k] = v.value + logging.debug( f"[DEBUG] API Request: {self.endpoint.method.value} {self.endpoint.path}" ) logging.debug(f"[DEBUG] Request Data: {json.dumps(request_dict, indent=2)}") logging.debug(f"[DEBUG] Query Params: {self.endpoint.query_params}") - # Make the request with built-in retry - resp = client.request( - method=self.endpoint.method.value, - path=self.endpoint.path, - data=request_dict, + response_json = await client.request( + self.endpoint.method.value, + self.endpoint.path, params=self.endpoint.query_params, + data=request_dict, files=self.files, content_type=self.content_type, - multipart_parser=self.multipart_parser + multipart_parser=self.multipart_parser, ) - # Debug log for response 
logging.debug("=" * 50) logging.debug("[DEBUG] RESPONSE DETAILS:") logging.debug("[DEBUG] Status Code: 200 (Success)") - logging.debug(f"[DEBUG] Response Body: {json.dumps(resp, indent=2)}") + logging.debug(f"[DEBUG] Response Body: {json.dumps(response_json, indent=2)}") logging.debug("=" * 50) - # Parse and return the response - return self._parse_response(resp) - - except LocalNetworkError as e: - # Propagate specific network error types - logging.error(f"[ERROR] Local network error: {str(e)}") - raise - - except ApiServerError as e: - # Propagate API server errors - logging.error(f"[ERROR] API server error: {str(e)}") - raise - - except Exception as e: - logging.error(f"[ERROR] API Exception: {str(e)}") - raise Exception(str(e)) - - def _parse_response(self, resp): - """Parse response data - can be overridden by subclasses""" - # The response is already the complete object, don't extract just the "data" field - # as that would lose the outer structure (created timestamp, etc.) - - # Parse response using the provided model - self.response = self.endpoint.response_model.model_validate(resp) - logging.debug(f"[DEBUG] Parsed Response: {self.response}") - return self.response + parsed_response = self.endpoint.response_model.model_validate(response_json) + logging.debug(f"[DEBUG] Parsed Response: {parsed_response}") + return parsed_response + finally: + if owns_client: + await client.close() class TaskStatus(str, Enum): @@ -892,23 +771,21 @@ class TaskStatus(str, Enum): class PollingOperation(Generic[T, R]): - """ - Represents an asynchronous API operation that requires polling for completion. - """ + """Represents an asynchronous API operation that requires polling for completion.""" def __init__( self, poll_endpoint: ApiEndpoint[EmptyRequest, R], - completed_statuses: list, - failed_statuses: list, + completed_statuses: list[str], + failed_statuses: list[str], status_extractor: Callable[[R], str], - progress_extractor: Callable[[R], float] = None, - result_url_extractor: Callable[[R], str] = None, + progress_extractor: Callable[[R], float] | None = None, + result_url_extractor: Callable[[R], str] | None = None, request: Optional[T] = None, api_base: str | None = None, auth_token: Optional[str] = None, comfy_api_key: Optional[str] = None, - auth_kwargs: Optional[Dict[str,str]] = None, + auth_kwargs: Optional[Dict[str, str]] = None, poll_interval: float = 5.0, max_poll_attempts: int = 120, # Default max polling attempts (10 minutes with 5s interval) max_retries: int = 3, # Max retries per individual API call @@ -916,7 +793,7 @@ class PollingOperation(Generic[T, R]): retry_backoff_factor: float = 2.0, estimated_duration: Optional[float] = None, node_id: Optional[str] = None, - ): + ) -> None: self.poll_endpoint = poll_endpoint self.request = request self.api_base: str = api_base or args.comfy_api_base @@ -931,100 +808,73 @@ class PollingOperation(Generic[T, R]): self.retry_delay = retry_delay self.retry_backoff_factor = retry_backoff_factor self.estimated_duration = estimated_duration - - # Polling configuration - self.status_extractor = status_extractor or ( - lambda x: getattr(x, "status", None) - ) + self.status_extractor = status_extractor or (lambda x: getattr(x, "status", None)) self.progress_extractor = progress_extractor self.result_url_extractor = result_url_extractor self.node_id = node_id self.completed_statuses = completed_statuses self.failed_statuses = failed_statuses + self.final_response: Optional[R] = None - # For storing response data - self.final_response = None - self.error 
= None - - def execute(self, client: Optional[ApiClient] = None) -> R: - """Execute the polling operation using the provided client. If failed, raise an exception.""" + async def execute(self, client: Optional[ApiClient] = None) -> R: + owns_client = client is None + if owns_client: + client = ApiClient( + base_url=self.api_base, + auth_token=self.auth_token, + comfy_api_key=self.comfy_api_key, + max_retries=self.max_retries, + retry_delay=self.retry_delay, + retry_backoff_factor=self.retry_backoff_factor, + ) try: - if client is None: - client = ApiClient( - base_url=self.api_base, - auth_token=self.auth_token, - comfy_api_key=self.comfy_api_key, - max_retries=self.max_retries, - retry_delay=self.retry_delay, - retry_backoff_factor=self.retry_backoff_factor, - ) - return self._poll_until_complete(client) - except LocalNetworkError as e: - # Provide clear message for local network issues - raise Exception( - f"Polling failed due to local network issues. Please check your internet connection. " - f"Details: {str(e)}" - ) from e - except ApiServerError as e: - # Provide clear message for API server issues - raise Exception( - f"Polling failed due to API server issues. The service may be experiencing problems. " - f"Please try again later. Details: {str(e)}" - ) from e - except Exception as e: - raise Exception(f"Error during polling: {str(e)}") + return await self._poll_until_complete(client) + finally: + if owns_client: + await client.close() def _display_text_on_node(self, text: str): - """Sends text to the client which will be displayed on the node in the UI""" if not self.node_id: return - PromptServer.instance.send_progress_text(text, self.node_id) - def _display_time_progress_on_node(self, time_completed: int): + def _display_time_progress_on_node(self, time_completed: int | float): if not self.node_id: return - if self.estimated_duration is not None: - estimated_time_remaining = max( - 0, int(self.estimated_duration) - int(time_completed) - ) - message = f"Task in progress: {time_completed:.0f}s (~{estimated_time_remaining:.0f}s remaining)" + remaining = max(0, int(self.estimated_duration) - time_completed) + message = f"Task in progress: {time_completed}s (~{remaining}s remaining)" else: - message = f"Task in progress: {time_completed:.0f}s" + message = f"Task in progress: {time_completed}s" self._display_text_on_node(message) def _check_task_status(self, response: R) -> TaskStatus: - """Check task status using the status extractor function""" try: status = self.status_extractor(response) if status in self.completed_statuses: return TaskStatus.COMPLETED - elif status in self.failed_statuses: + if status in self.failed_statuses: return TaskStatus.FAILED return TaskStatus.PENDING except Exception as e: - logging.error(f"Error extracting status: {e}") + logging.error("Error extracting status: %s", e) return TaskStatus.PENDING - def _poll_until_complete(self, client: ApiClient) -> R: + async def _poll_until_complete(self, client: ApiClient) -> R: """Poll until the task is complete""" - poll_count = 0 consecutive_errors = 0 max_consecutive_errors = min(5, self.max_retries * 2) # Limit consecutive errors if self.progress_extractor: progress = utils.ProgressBar(PROGRESS_BAR_MAX) - while poll_count < self.max_poll_attempts: + status = TaskStatus.PENDING + for poll_count in range(1, self.max_poll_attempts + 1): try: - poll_count += 1 logging.debug(f"[DEBUG] Polling attempt #{poll_count}") request_dict = ( - self.request.model_dump(exclude_none=True) - if self.request is not None - else None + 
None if self.request is None else self.request.model_dump(exclude_none=True) ) if poll_count == 1: @@ -1036,18 +886,14 @@ class PollingOperation(Generic[T, R]): ) # Query task status - resp = client.request( - method=self.poll_endpoint.method.value, - path=self.poll_endpoint.path, + resp = await client.request( + self.poll_endpoint.method.value, + self.poll_endpoint.path, params=self.poll_endpoint.query_params, data=request_dict, ) - - # Successfully got a response, reset consecutive error count - consecutive_errors = 0 - - # Parse response - response_obj = self.poll_endpoint.response_model.model_validate(resp) + consecutive_errors = 0 # reset on success + response_obj: R = self.poll_endpoint.response_model.model_validate(resp) # Check if task is complete status = self._check_task_status(response_obj) @@ -1065,45 +911,30 @@ class PollingOperation(Generic[T, R]): result_url = self.result_url_extractor(response_obj) if result_url: message = f"Result URL: {result_url}" - else: - message = "Task completed successfully!" logging.debug(f"[DEBUG] {message}") self._display_text_on_node(message) self.final_response = response_obj if self.progress_extractor: progress.update(100) return self.final_response - elif status == TaskStatus.FAILED: + if status == TaskStatus.FAILED: message = f"Task failed: {json.dumps(resp)}" logging.error(f"[DEBUG] {message}") raise Exception(message) - else: - logging.debug("[DEBUG] Task still pending, continuing to poll...") - - # Wait before polling again - logging.debug( - f"[DEBUG] Waiting {self.poll_interval} seconds before next poll" - ) + logging.debug("[DEBUG] Task still pending, continuing to poll...") + # Task pending – wait for i in range(int(self.poll_interval)): - time_completed = (poll_count * self.poll_interval) + i - self._display_time_progress_on_node(time_completed) - time.sleep(1) + self._display_time_progress_on_node((poll_count - 1) * self.poll_interval + i) + await asyncio.sleep(1) - except (LocalNetworkError, ApiServerError) as e: - # For network-related errors, increment error count and potentially abort + except (LocalNetworkError, ApiServerError, NetworkError) as e: consecutive_errors += 1 if consecutive_errors >= max_consecutive_errors: raise Exception( - f"Polling aborted after {consecutive_errors} consecutive network errors: {str(e)}" + f"Polling aborted after {consecutive_errors} network errors: {str(e)}" ) from e - - # Log the error but continue polling - logging.warning( - f"Network error during polling (attempt {poll_count}/{self.max_poll_attempts}): {str(e)}. " - f"Will retry in {self.poll_interval} seconds." - ) - time.sleep(self.poll_interval) - + logging.warning("Network error (%s/%s): %s", consecutive_errors, max_consecutive_errors, str(e)) + await asyncio.sleep(self.poll_interval) except Exception as e: # For other errors, increment count and potentially abort consecutive_errors += 1 @@ -1117,10 +948,10 @@ class PollingOperation(Generic[T, R]): f"Error during polling (attempt {poll_count}/{self.max_poll_attempts}): {str(e)}. " f"Will retry in {self.poll_interval} seconds." ) - time.sleep(self.poll_interval) + await asyncio.sleep(self.poll_interval) # If we've exhausted all polling attempts raise Exception( - f"Polling timed out after {poll_count} attempts ({poll_count * self.poll_interval} seconds). " - f"The operation may still be running on the server but is taking longer than expected." + f"Polling timed out after {self.max_poll_attempts} attempts (" f"{self.max_poll_attempts * self.poll_interval} seconds). 
" + "The operation may still be running on the server but is taking longer than expected." ) diff --git a/comfy_api_nodes/nodes_bfl.py b/comfy_api_nodes/nodes_bfl.py index d93fbd778..c09be8d5b 100644 --- a/comfy_api_nodes/nodes_bfl.py +++ b/comfy_api_nodes/nodes_bfl.py @@ -1,3 +1,4 @@ +import asyncio import io from inspect import cleandoc from typing import Union, Optional @@ -28,7 +29,7 @@ from comfy_api_nodes.apinode_utils import ( import numpy as np from PIL import Image -import requests +import aiohttp import torch import base64 import time @@ -44,18 +45,18 @@ def convert_mask_to_image(mask: torch.Tensor): return mask -def handle_bfl_synchronous_operation( +async def handle_bfl_synchronous_operation( operation: SynchronousOperation, timeout_bfl_calls=360, node_id: Union[str, None] = None, ): - response_api: BFLFluxProGenerateResponse = operation.execute() - return _poll_until_generated( + response_api: BFLFluxProGenerateResponse = await operation.execute() + return await _poll_until_generated( response_api.polling_url, timeout=timeout_bfl_calls, node_id=node_id ) -def _poll_until_generated( +async def _poll_until_generated( polling_url: str, timeout=360, node_id: Union[str, None] = None ): # used bfl-comfy-nodes to verify code implementation: @@ -66,55 +67,56 @@ def _poll_until_generated( retry_404_seconds = 2 retry_202_seconds = 2 retry_pending_seconds = 1 - request = requests.Request(method=HttpMethod.GET, url=polling_url) - # NOTE: should True loop be replaced with checking if workflow has been interrupted? - while True: - if node_id: - time_elapsed = time.time() - start_time - PromptServer.instance.send_progress_text( - f"Generating ({time_elapsed:.0f}s)", node_id - ) - response = requests.Session().send(request.prepare()) - if response.status_code == 200: - result = response.json() - if result["status"] == BFLStatus.ready: - img_url = result["result"]["sample"] - if node_id: - PromptServer.instance.send_progress_text( - f"Result URL: {img_url}", node_id - ) - img_response = requests.get(img_url) - return process_image_response(img_response) - elif result["status"] in [ - BFLStatus.request_moderated, - BFLStatus.content_moderated, - ]: - status = result["status"] - raise Exception( - f"BFL API did not return an image due to: {status}." + async with aiohttp.ClientSession() as session: + # NOTE: should True loop be replaced with checking if workflow has been interrupted? + while True: + if node_id: + time_elapsed = time.time() - start_time + PromptServer.instance.send_progress_text( + f"Generating ({time_elapsed:.0f}s)", node_id ) - elif result["status"] == BFLStatus.error: - raise Exception(f"BFL API encountered an error: {result}.") - elif result["status"] == BFLStatus.pending: - time.sleep(retry_pending_seconds) - continue - elif response.status_code == 404: - if retries_404 < max_retries_404: - retries_404 += 1 - time.sleep(retry_404_seconds) - continue - raise Exception( - f"BFL API could not find task after {max_retries_404} tries." - ) - elif response.status_code == 202: - time.sleep(retry_202_seconds) - elif time.time() - start_time > timeout: - raise Exception( - f"BFL API experienced a timeout; could not return request under {timeout} seconds." 
- ) - else: - raise Exception(f"BFL API encountered an error: {response.json()}") + + async with session.get(polling_url) as response: + if response.status == 200: + result = await response.json() + if result["status"] == BFLStatus.ready: + img_url = result["result"]["sample"] + if node_id: + PromptServer.instance.send_progress_text( + f"Result URL: {img_url}", node_id + ) + async with session.get(img_url) as img_resp: + return process_image_response(await img_resp.content.read()) + elif result["status"] in [ + BFLStatus.request_moderated, + BFLStatus.content_moderated, + ]: + status = result["status"] + raise Exception( + f"BFL API did not return an image due to: {status}." + ) + elif result["status"] == BFLStatus.error: + raise Exception(f"BFL API encountered an error: {result}.") + elif result["status"] == BFLStatus.pending: + await asyncio.sleep(retry_pending_seconds) + continue + elif response.status == 404: + if retries_404 < max_retries_404: + retries_404 += 1 + await asyncio.sleep(retry_404_seconds) + continue + raise Exception( + f"BFL API could not find task after {max_retries_404} tries." + ) + elif response.status == 202: + await asyncio.sleep(retry_202_seconds) + elif time.time() - start_time > timeout: + raise Exception( + f"BFL API experienced a timeout; could not return request under {timeout} seconds." + ) + else: + raise Exception(f"BFL API encountered an error: {response.json()}") def convert_image_to_base64(image: torch.Tensor): scaled_image = downscale_image_tensor(image, total_pixels=2048 * 2048) @@ -222,7 +224,7 @@ class FluxProUltraImageNode(ComfyNodeABC): API_NODE = True CATEGORY = "api node/image/BFL" - def api_call( + async def api_call( self, prompt: str, aspect_ratio: str, @@ -266,7 +268,7 @@ class FluxProUltraImageNode(ComfyNodeABC): ), auth_kwargs=kwargs, ) - output_image = handle_bfl_synchronous_operation(operation, node_id=unique_id) + output_image = await handle_bfl_synchronous_operation(operation, node_id=unique_id) return (output_image,) @@ -354,7 +356,7 @@ class FluxKontextProImageNode(ComfyNodeABC): BFL_PATH = "/proxy/bfl/flux-kontext-pro/generate" - def api_call( + async def api_call( self, prompt: str, aspect_ratio: str, @@ -397,7 +399,7 @@ class FluxKontextProImageNode(ComfyNodeABC): ), auth_kwargs=kwargs, ) - output_image = handle_bfl_synchronous_operation(operation, node_id=unique_id) + output_image = await handle_bfl_synchronous_operation(operation, node_id=unique_id) return (output_image,) @@ -489,7 +491,7 @@ class FluxProImageNode(ComfyNodeABC): API_NODE = True CATEGORY = "api node/image/BFL" - def api_call( + async def api_call( self, prompt: str, prompt_upsampling, @@ -524,7 +526,7 @@ class FluxProImageNode(ComfyNodeABC): ), auth_kwargs=kwargs, ) - output_image = handle_bfl_synchronous_operation(operation, node_id=unique_id) + output_image = await handle_bfl_synchronous_operation(operation, node_id=unique_id) return (output_image,) @@ -632,7 +634,7 @@ class FluxProExpandNode(ComfyNodeABC): API_NODE = True CATEGORY = "api node/image/BFL" - def api_call( + async def api_call( self, image: torch.Tensor, prompt: str, @@ -670,7 +672,7 @@ class FluxProExpandNode(ComfyNodeABC): ), auth_kwargs=kwargs, ) - output_image = handle_bfl_synchronous_operation(operation, node_id=unique_id) + output_image = await handle_bfl_synchronous_operation(operation, node_id=unique_id) return (output_image,) @@ -744,7 +746,7 @@ class FluxProFillNode(ComfyNodeABC): API_NODE = True CATEGORY = "api node/image/BFL" - def api_call( + async def api_call( self, image: 
torch.Tensor, mask: torch.Tensor, @@ -780,7 +782,7 @@ class FluxProFillNode(ComfyNodeABC): ), auth_kwargs=kwargs, ) - output_image = handle_bfl_synchronous_operation(operation, node_id=unique_id) + output_image = await handle_bfl_synchronous_operation(operation, node_id=unique_id) return (output_image,) @@ -879,7 +881,7 @@ class FluxProCannyNode(ComfyNodeABC): API_NODE = True CATEGORY = "api node/image/BFL" - def api_call( + async def api_call( self, control_image: torch.Tensor, prompt: str, @@ -929,7 +931,7 @@ class FluxProCannyNode(ComfyNodeABC): ), auth_kwargs=kwargs, ) - output_image = handle_bfl_synchronous_operation(operation, node_id=unique_id) + output_image = await handle_bfl_synchronous_operation(operation, node_id=unique_id) return (output_image,) @@ -1008,7 +1010,7 @@ class FluxProDepthNode(ComfyNodeABC): API_NODE = True CATEGORY = "api node/image/BFL" - def api_call( + async def api_call( self, control_image: torch.Tensor, prompt: str, @@ -1045,7 +1047,7 @@ class FluxProDepthNode(ComfyNodeABC): ), auth_kwargs=kwargs, ) - output_image = handle_bfl_synchronous_operation(operation, node_id=unique_id) + output_image = await handle_bfl_synchronous_operation(operation, node_id=unique_id) return (output_image,) diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index af33279d5..3751fb2a1 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -303,7 +303,7 @@ class GeminiNode(ComfyNodeABC): """ return GeminiPart(text=text) - def api_call( + async def api_call( self, prompt: str, model: GeminiModel, @@ -332,7 +332,7 @@ class GeminiNode(ComfyNodeABC): parts.extend(files) # Create response - response = SynchronousOperation( + response = await SynchronousOperation( endpoint=get_gemini_endpoint(model), request=GeminiGenerateContentRequest( contents=[ diff --git a/comfy_api_nodes/nodes_ideogram.py b/comfy_api_nodes/nodes_ideogram.py index b8487355f..db24e6da4 100644 --- a/comfy_api_nodes/nodes_ideogram.py +++ b/comfy_api_nodes/nodes_ideogram.py @@ -212,7 +212,7 @@ V3_RESOLUTIONS= [ "1536x640" ] -def download_and_process_images(image_urls): +async def download_and_process_images(image_urls): """Helper function to download and process multiple images from URLs""" # Initialize list to store image tensors @@ -220,7 +220,7 @@ def download_and_process_images(image_urls): for image_url in image_urls: # Using functions from apinode_utils.py to handle downloading and processing - image_bytesio = download_url_to_bytesio(image_url) # Download image content to BytesIO + image_bytesio = await download_url_to_bytesio(image_url) # Download image content to BytesIO img_tensor = bytesio_to_image_tensor(image_bytesio, mode="RGB") # Convert to torch.Tensor with RGB mode image_tensors.append(img_tensor) @@ -328,7 +328,7 @@ class IdeogramV1(ComfyNodeABC): DESCRIPTION = cleandoc(__doc__ or "") API_NODE = True - def api_call( + async def api_call( self, prompt, turbo=False, @@ -367,7 +367,7 @@ class IdeogramV1(ComfyNodeABC): auth_kwargs=kwargs, ) - response = operation.execute() + response = await operation.execute() if not response.data or len(response.data) == 0: raise Exception("No images were generated in the response") @@ -378,7 +378,7 @@ class IdeogramV1(ComfyNodeABC): raise Exception("No image URLs were generated in the response") display_image_urls_on_node(image_urls, unique_id) - return (download_and_process_images(image_urls),) + return (await download_and_process_images(image_urls),) class IdeogramV2(ComfyNodeABC): @@ -487,7 +487,7 @@ class 
IdeogramV2(ComfyNodeABC): DESCRIPTION = cleandoc(__doc__ or "") API_NODE = True - def api_call( + async def api_call( self, prompt, turbo=False, @@ -543,7 +543,7 @@ class IdeogramV2(ComfyNodeABC): auth_kwargs=kwargs, ) - response = operation.execute() + response = await operation.execute() if not response.data or len(response.data) == 0: raise Exception("No images were generated in the response") @@ -554,7 +554,7 @@ class IdeogramV2(ComfyNodeABC): raise Exception("No image URLs were generated in the response") display_image_urls_on_node(image_urls, unique_id) - return (download_and_process_images(image_urls),) + return (await download_and_process_images(image_urls),) class IdeogramV3(ComfyNodeABC): """ @@ -653,7 +653,7 @@ class IdeogramV3(ComfyNodeABC): DESCRIPTION = cleandoc(__doc__ or "") API_NODE = True - def api_call( + async def api_call( self, prompt, image=None, @@ -774,7 +774,7 @@ class IdeogramV3(ComfyNodeABC): ) # Execute the operation and process response - response = operation.execute() + response = await operation.execute() if not response.data or len(response.data) == 0: raise Exception("No images were generated in the response") @@ -785,7 +785,7 @@ class IdeogramV3(ComfyNodeABC): raise Exception("No image URLs were generated in the response") display_image_urls_on_node(image_urls, unique_id) - return (download_and_process_images(image_urls),) + return (await download_and_process_images(image_urls),) NODE_CLASS_MAPPINGS = { diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index 69e9e5cf0..9d9eb5628 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -109,7 +109,7 @@ class KlingApiError(Exception): pass -def poll_until_finished( +async def poll_until_finished( auth_kwargs: dict[str, str], api_endpoint: ApiEndpoint[Any, R], result_url_extractor: Optional[Callable[[R], str]] = None, @@ -117,7 +117,7 @@ def poll_until_finished( node_id: Optional[str] = None, ) -> R: """Polls the Kling API endpoint until the task reaches a terminal state, then returns the response.""" - return PollingOperation( + return await PollingOperation( poll_endpoint=api_endpoint, completed_statuses=[ KlingTaskStatus.succeed.value, @@ -278,18 +278,18 @@ def get_images_urls_from_response(response) -> Optional[str]: return None -def video_result_to_node_output( +async def video_result_to_node_output( video: KlingVideoResult, ) -> tuple[VideoFromFile, str, str]: """Converts a KlingVideoResult to a tuple of (VideoFromFile, str, str) to be used as a ComfyUI node output.""" return ( - download_url_to_video_output(video.url), + await download_url_to_video_output(str(video.url)), str(video.id), str(video.duration), ) -def image_result_to_node_output( +async def image_result_to_node_output( images: list[KlingImageResult], ) -> torch.Tensor: """ @@ -297,9 +297,9 @@ def image_result_to_node_output( If multiple images are returned, they will be stacked along the batch dimension. 
""" if len(images) == 1: - return download_url_to_image_tensor(images[0].url) + return await download_url_to_image_tensor(str(images[0].url)) else: - return torch.cat([download_url_to_image_tensor(image.url) for image in images]) + return torch.cat([await download_url_to_image_tensor(str(image.url)) for image in images]) class KlingNodeBase(ComfyNodeABC): @@ -467,10 +467,10 @@ class KlingTextToVideoNode(KlingNodeBase): RETURN_NAMES = ("VIDEO", "video_id", "duration") DESCRIPTION = "Kling Text to Video Node" - def get_response( + async def get_response( self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None ) -> KlingText2VideoResponse: - return poll_until_finished( + return await poll_until_finished( auth_kwargs, ApiEndpoint( path=f"{PATH_TEXT_TO_VIDEO}/{task_id}", @@ -483,7 +483,7 @@ class KlingTextToVideoNode(KlingNodeBase): node_id=node_id, ) - def api_call( + async def api_call( self, prompt: str, negative_prompt: str, @@ -519,17 +519,17 @@ class KlingTextToVideoNode(KlingNodeBase): auth_kwargs=kwargs, ) - task_creation_response = initial_operation.execute() + task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.data.task_id - final_response = self.get_response( + final_response = await self.get_response( task_id, auth_kwargs=kwargs, node_id=unique_id ) validate_video_result_response(final_response) video = get_video_from_response(final_response) - return video_result_to_node_output(video) + return await video_result_to_node_output(video) class KlingCameraControlT2VNode(KlingTextToVideoNode): @@ -581,7 +581,7 @@ class KlingCameraControlT2VNode(KlingTextToVideoNode): DESCRIPTION = "Transform text into cinematic videos with professional camera movements that simulate real-world cinematography. Control virtual camera actions including zoom, rotation, pan, tilt, and first-person view, while maintaining focus on your original text." 
- def api_call( + async def api_call( self, prompt: str, negative_prompt: str, @@ -591,7 +591,7 @@ class KlingCameraControlT2VNode(KlingTextToVideoNode): unique_id: Optional[str] = None, **kwargs, ): - return super().api_call( + return await super().api_call( model_name=KlingVideoGenModelName.kling_v1, cfg_scale=cfg_scale, mode=KlingVideoGenMode.std, @@ -670,10 +670,10 @@ class KlingImage2VideoNode(KlingNodeBase): RETURN_NAMES = ("VIDEO", "video_id", "duration") DESCRIPTION = "Kling Image to Video Node" - def get_response( + async def get_response( self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None ) -> KlingImage2VideoResponse: - return poll_until_finished( + return await poll_until_finished( auth_kwargs, ApiEndpoint( path=f"{PATH_IMAGE_TO_VIDEO}/{task_id}", @@ -686,7 +686,7 @@ class KlingImage2VideoNode(KlingNodeBase): node_id=node_id, ) - def api_call( + async def api_call( self, start_frame: torch.Tensor, prompt: str, @@ -733,17 +733,17 @@ class KlingImage2VideoNode(KlingNodeBase): auth_kwargs=kwargs, ) - task_creation_response = initial_operation.execute() + task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.data.task_id - final_response = self.get_response( + final_response = await self.get_response( task_id, auth_kwargs=kwargs, node_id=unique_id ) validate_video_result_response(final_response) video = get_video_from_response(final_response) - return video_result_to_node_output(video) + return await video_result_to_node_output(video) class KlingCameraControlI2VNode(KlingImage2VideoNode): @@ -798,7 +798,7 @@ class KlingCameraControlI2VNode(KlingImage2VideoNode): DESCRIPTION = "Transform still images into cinematic videos with professional camera movements that simulate real-world cinematography. Control virtual camera actions including zoom, rotation, pan, tilt, and first-person view, while maintaining focus on your original image." - def api_call( + async def api_call( self, start_frame: torch.Tensor, prompt: str, @@ -809,7 +809,7 @@ class KlingCameraControlI2VNode(KlingImage2VideoNode): unique_id: Optional[str] = None, **kwargs, ): - return super().api_call( + return await super().api_call( model_name=KlingVideoGenModelName.kling_v1_5, start_frame=start_frame, cfg_scale=cfg_scale, @@ -897,7 +897,7 @@ class KlingStartEndFrameNode(KlingImage2VideoNode): DESCRIPTION = "Generate a video sequence that transitions between your provided start and end images. The node creates all frames in between, producing a smooth transformation from the first frame to the last." - def api_call( + async def api_call( self, start_frame: torch.Tensor, end_frame: torch.Tensor, @@ -912,7 +912,7 @@ class KlingStartEndFrameNode(KlingImage2VideoNode): mode, duration, model_name = KlingStartEndFrameNode.get_mode_string_mapping()[ mode ] - return super().api_call( + return await super().api_call( prompt=prompt, negative_prompt=negative_prompt, model_name=model_name, @@ -964,10 +964,10 @@ class KlingVideoExtendNode(KlingNodeBase): RETURN_NAMES = ("VIDEO", "video_id", "duration") DESCRIPTION = "Kling Video Extend Node. Extend videos made by other Kling nodes. The video_id is created by using other Kling Nodes." 
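The Kling node changes above and below all follow the same conversion: get_response and api_call become coroutines, every operation.execute() and poll_until_finished call gains an await, and blocking time.sleep calls give way to asyncio.sleep so polling no longer stalls the event loop. A minimal sketch of that polling shape, assuming a caller-supplied fetch_status coroutine and placeholder status strings (none of these names are ComfyUI APIs):

import asyncio

async def poll_task(fetch_status, task_id: str, interval: float = 5.0, max_attempts: int = 120):
    """Poll an async status endpoint until the task reaches a terminal state."""
    for attempt in range(1, max_attempts + 1):
        status, payload = await fetch_status(task_id)  # coroutine supplied by the caller
        if status == "succeed":
            return payload
        if status == "failed":
            raise RuntimeError(f"Task {task_id} failed: {payload}")
        # Yield to the event loop between attempts instead of blocking with time.sleep().
        await asyncio.sleep(interval)
    raise TimeoutError(f"Task {task_id} still pending after {max_attempts} attempts")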
- def get_response( + async def get_response( self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None ) -> KlingVideoExtendResponse: - return poll_until_finished( + return await poll_until_finished( auth_kwargs, ApiEndpoint( path=f"{PATH_VIDEO_EXTEND}/{task_id}", @@ -980,7 +980,7 @@ class KlingVideoExtendNode(KlingNodeBase): node_id=node_id, ) - def api_call( + async def api_call( self, prompt: str, negative_prompt: str, @@ -1006,17 +1006,17 @@ class KlingVideoExtendNode(KlingNodeBase): auth_kwargs=kwargs, ) - task_creation_response = initial_operation.execute() + task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.data.task_id - final_response = self.get_response( + final_response = await self.get_response( task_id, auth_kwargs=kwargs, node_id=unique_id ) validate_video_result_response(final_response) video = get_video_from_response(final_response) - return video_result_to_node_output(video) + return await video_result_to_node_output(video) class KlingVideoEffectsBase(KlingNodeBase): @@ -1025,10 +1025,10 @@ class KlingVideoEffectsBase(KlingNodeBase): RETURN_TYPES = ("VIDEO", "STRING", "STRING") RETURN_NAMES = ("VIDEO", "video_id", "duration") - def get_response( + async def get_response( self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None ) -> KlingVideoEffectsResponse: - return poll_until_finished( + return await poll_until_finished( auth_kwargs, ApiEndpoint( path=f"{PATH_VIDEO_EFFECTS}/{task_id}", @@ -1041,7 +1041,7 @@ class KlingVideoEffectsBase(KlingNodeBase): node_id=node_id, ) - def api_call( + async def api_call( self, dual_character: bool, effect_scene: KlingDualCharacterEffectsScene | KlingSingleImageEffectsScene, @@ -1084,17 +1084,17 @@ class KlingVideoEffectsBase(KlingNodeBase): auth_kwargs=kwargs, ) - task_creation_response = initial_operation.execute() + task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.data.task_id - final_response = self.get_response( + final_response = await self.get_response( task_id, auth_kwargs=kwargs, node_id=unique_id ) validate_video_result_response(final_response) video = get_video_from_response(final_response) - return video_result_to_node_output(video) + return await video_result_to_node_output(video) class KlingDualCharacterVideoEffectNode(KlingVideoEffectsBase): @@ -1142,7 +1142,7 @@ class KlingDualCharacterVideoEffectNode(KlingVideoEffectsBase): RETURN_TYPES = ("VIDEO", "STRING") RETURN_NAMES = ("VIDEO", "duration") - def api_call( + async def api_call( self, image_left: torch.Tensor, image_right: torch.Tensor, @@ -1153,7 +1153,7 @@ class KlingDualCharacterVideoEffectNode(KlingVideoEffectsBase): unique_id: Optional[str] = None, **kwargs, ): - video, _, duration = super().api_call( + video, _, duration = await super().api_call( dual_character=True, effect_scene=effect_scene, model_name=model_name, @@ -1208,7 +1208,7 @@ class KlingSingleImageVideoEffectNode(KlingVideoEffectsBase): DESCRIPTION = "Achieve different special effects when generating a video based on the effect_scene." 
- def api_call( + async def api_call( self, image: torch.Tensor, effect_scene: KlingSingleImageEffectsScene, @@ -1217,7 +1217,7 @@ class KlingSingleImageVideoEffectNode(KlingVideoEffectsBase): unique_id: Optional[str] = None, **kwargs, ): - return super().api_call( + return await super().api_call( dual_character=False, effect_scene=effect_scene, model_name=model_name, @@ -1253,11 +1253,11 @@ class KlingLipSyncBase(KlingNodeBase): f"Text is too long. Maximum length is {MAX_PROMPT_LENGTH_LIP_SYNC} characters." ) - def get_response( + async def get_response( self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None ) -> KlingLipSyncResponse: """Polls the Kling API endpoint until the task reaches a terminal state.""" - return poll_until_finished( + return await poll_until_finished( auth_kwargs, ApiEndpoint( path=f"{PATH_LIP_SYNC}/{task_id}", @@ -1270,7 +1270,7 @@ class KlingLipSyncBase(KlingNodeBase): node_id=node_id, ) - def api_call( + async def api_call( self, video: VideoInput, audio: Optional[AudioInput] = None, @@ -1287,12 +1287,12 @@ class KlingLipSyncBase(KlingNodeBase): self.validate_lip_sync_video(video) # Upload video to Comfy API and get download URL - video_url = upload_video_to_comfyapi(video, auth_kwargs=kwargs) + video_url = await upload_video_to_comfyapi(video, auth_kwargs=kwargs) logging.info("Uploaded video to Comfy API. URL: %s", video_url) # Upload the audio file to Comfy API and get download URL if audio: - audio_url = upload_audio_to_comfyapi(audio, auth_kwargs=kwargs) + audio_url = await upload_audio_to_comfyapi(audio, auth_kwargs=kwargs) logging.info("Uploaded audio to Comfy API. URL: %s", audio_url) else: audio_url = None @@ -1319,17 +1319,17 @@ class KlingLipSyncBase(KlingNodeBase): auth_kwargs=kwargs, ) - task_creation_response = initial_operation.execute() + task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.data.task_id - final_response = self.get_response( + final_response = await self.get_response( task_id, auth_kwargs=kwargs, node_id=unique_id ) validate_video_result_response(final_response) video = get_video_from_response(final_response) - return video_result_to_node_output(video) + return await video_result_to_node_output(video) class KlingLipSyncAudioToVideoNode(KlingLipSyncBase): @@ -1357,7 +1357,7 @@ class KlingLipSyncAudioToVideoNode(KlingLipSyncBase): DESCRIPTION = "Kling Lip Sync Audio to Video Node. Syncs mouth movements in a video file to the audio content of an audio file. When using, ensure that the audio contains clearly distinguishable vocals and that the video contains a distinct face. The audio file should not be larger than 5MB. The video file should not be larger than 100MB, should have height/width between 720px and 1920px, and should be between 2s and 10s in length." - def api_call( + async def api_call( self, video: VideoInput, audio: AudioInput, @@ -1365,7 +1365,7 @@ class KlingLipSyncAudioToVideoNode(KlingLipSyncBase): unique_id: Optional[str] = None, **kwargs, ): - return super().api_call( + return await super().api_call( video=video, audio=audio, voice_language=voice_language, @@ -1469,7 +1469,7 @@ class KlingLipSyncTextToVideoNode(KlingLipSyncBase): DESCRIPTION = "Kling Lip Sync Text to Video Node. Syncs mouth movements in a video file to a text prompt. The video file should not be larger than 100MB, should have height/width between 720px and 1920px, and should be between 2s and 10s in length." 
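The lip-sync nodes now await upload_video_to_comfyapi and upload_audio_to_comfyapi, which rest on the aiohttp-based upload path added to ApiClient.upload_file earlier in this series: a PUT of raw bytes retried with exponential backoff. A rough, self-contained sketch of that retry shape, assuming only aiohttp (put_with_retry is an illustrative name, not a ComfyUI helper):

import asyncio
import aiohttp

async def put_with_retry(url: str, data: bytes, max_retries: int = 3, delay: float = 1.0, backoff: float = 2.0) -> int:
    """PUT raw bytes, retrying transient failures with exponential backoff."""
    async with aiohttp.ClientSession() as session:
        for attempt in range(max_retries + 1):
            try:
                async with session.put(url, data=data) as resp:
                    resp.raise_for_status()
                    return resp.status
            except (aiohttp.ClientError, asyncio.TimeoutError):
                if attempt == max_retries:
                    raise
                # Back off before the next attempt.
                await asyncio.sleep(delay)
                delay *= backoff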
- def api_call( + async def api_call( self, video: VideoInput, text: str, @@ -1479,7 +1479,7 @@ class KlingLipSyncTextToVideoNode(KlingLipSyncBase): **kwargs, ): voice_id, voice_language = KlingLipSyncTextToVideoNode.get_voice_config()[voice] - return super().api_call( + return await super().api_call( video=video, text=text, voice_language=voice_language, @@ -1533,10 +1533,10 @@ class KlingVirtualTryOnNode(KlingImageGenerationBase): DESCRIPTION = "Kling Virtual Try On Node. Input a human image and a cloth image to try on the cloth on the human. You can merge multiple clothing item pictures into one image with a white background." - def get_response( + async def get_response( self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None ) -> KlingVirtualTryOnResponse: - return poll_until_finished( + return await poll_until_finished( auth_kwargs, ApiEndpoint( path=f"{PATH_VIRTUAL_TRY_ON}/{task_id}", @@ -1549,7 +1549,7 @@ class KlingVirtualTryOnNode(KlingImageGenerationBase): node_id=node_id, ) - def api_call( + async def api_call( self, human_image: torch.Tensor, cloth_image: torch.Tensor, @@ -1572,17 +1572,17 @@ class KlingVirtualTryOnNode(KlingImageGenerationBase): auth_kwargs=kwargs, ) - task_creation_response = initial_operation.execute() + task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.data.task_id - final_response = self.get_response( + final_response = await self.get_response( task_id, auth_kwargs=kwargs, node_id=unique_id ) validate_image_result_response(final_response) images = get_images_from_response(final_response) - return (image_result_to_node_output(images),) + return (await image_result_to_node_output(images),) class KlingImageGenerationNode(KlingImageGenerationBase): @@ -1655,13 +1655,13 @@ class KlingImageGenerationNode(KlingImageGenerationBase): DESCRIPTION = "Kling Image Generation Node. Generate an image from a text prompt with an optional reference image." 
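Result handling mirrors the upload side: helpers such as download_url_to_image_tensor and download_url_to_video_output are awaited, and the fetch underneath them is an aiohttp GET rather than requests.get. A hedged sketch of that fetch, assuming only aiohttp (fetch_bytes is an illustrative name, not part of the codebase):

import aiohttp

async def fetch_bytes(url: str, timeout_s: float = 60.0) -> bytes:
    """Download a result URL (image or video) without blocking the event loop."""
    timeout = aiohttp.ClientTimeout(total=timeout_s)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        async with session.get(url) as resp:
            resp.raise_for_status()   # surface 4xx/5xx as ClientResponseError
            return await resp.read()  # full response body as bytes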
- def get_response( + async def get_response( self, task_id: str, auth_kwargs: Optional[dict[str, str]], node_id: Optional[str] = None, ) -> KlingImageGenerationsResponse: - return poll_until_finished( + return await poll_until_finished( auth_kwargs, ApiEndpoint( path=f"{PATH_IMAGE_GENERATIONS}/{task_id}", @@ -1674,7 +1674,7 @@ class KlingImageGenerationNode(KlingImageGenerationBase): node_id=node_id, ) - def api_call( + async def api_call( self, model_name: KlingImageGenModelName, prompt: str, @@ -1714,17 +1714,17 @@ class KlingImageGenerationNode(KlingImageGenerationBase): auth_kwargs=kwargs, ) - task_creation_response = initial_operation.execute() + task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.data.task_id - final_response = self.get_response( + final_response = await self.get_response( task_id, auth_kwargs=kwargs, node_id=unique_id ) validate_image_result_response(final_response) images = get_images_from_response(final_response) - return (image_result_to_node_output(images),) + return (await image_result_to_node_output(images),) NODE_CLASS_MAPPINGS = { diff --git a/comfy_api_nodes/nodes_luma.py b/comfy_api_nodes/nodes_luma.py index 525dc38e6..b3c32bed5 100644 --- a/comfy_api_nodes/nodes_luma.py +++ b/comfy_api_nodes/nodes_luma.py @@ -38,7 +38,7 @@ from comfy_api_nodes.apinode_utils import ( ) from server import PromptServer -import requests +import aiohttp import torch from io import BytesIO @@ -217,7 +217,7 @@ class LumaImageGenerationNode(ComfyNodeABC): }, } - def api_call( + async def api_call( self, prompt: str, model: str, @@ -234,19 +234,19 @@ class LumaImageGenerationNode(ComfyNodeABC): # handle image_luma_ref api_image_ref = None if image_luma_ref is not None: - api_image_ref = self._convert_luma_refs( + api_image_ref = await self._convert_luma_refs( image_luma_ref, max_refs=4, auth_kwargs=kwargs, ) # handle style_luma_ref api_style_ref = None if style_image is not None: - api_style_ref = self._convert_style_image( + api_style_ref = await self._convert_style_image( style_image, weight=style_image_weight, auth_kwargs=kwargs, ) # handle character_ref images character_ref = None if character_image is not None: - download_urls = upload_images_to_comfyapi( + download_urls = await upload_images_to_comfyapi( character_image, max_images=4, auth_kwargs=kwargs, ) character_ref = LumaCharacterRef( @@ -270,7 +270,7 @@ class LumaImageGenerationNode(ComfyNodeABC): ), auth_kwargs=kwargs, ) - response_api: LumaGeneration = operation.execute() + response_api: LumaGeneration = await operation.execute() operation = PollingOperation( poll_endpoint=ApiEndpoint( @@ -286,19 +286,20 @@ class LumaImageGenerationNode(ComfyNodeABC): node_id=unique_id, auth_kwargs=kwargs, ) - response_poll = operation.execute() + response_poll = await operation.execute() - img_response = requests.get(response_poll.assets.image) - img = process_image_response(img_response) + async with aiohttp.ClientSession() as session: + async with session.get(response_poll.assets.image) as img_response: + img = process_image_response(await img_response.content.read()) return (img,) - def _convert_luma_refs( + async def _convert_luma_refs( self, luma_ref: LumaReferenceChain, max_refs: int, auth_kwargs: Optional[dict[str,str]] = None ): luma_urls = [] ref_count = 0 for ref in luma_ref.refs: - download_urls = upload_images_to_comfyapi( + download_urls = await upload_images_to_comfyapi( ref.image, max_images=1, auth_kwargs=auth_kwargs ) 
luma_urls.append(download_urls[0]) @@ -307,13 +308,13 @@ class LumaImageGenerationNode(ComfyNodeABC): break return luma_ref.create_api_model(download_urls=luma_urls, max_refs=max_refs) - def _convert_style_image( + async def _convert_style_image( self, style_image: torch.Tensor, weight: float, auth_kwargs: Optional[dict[str,str]] = None ): chain = LumaReferenceChain( first_ref=LumaReference(image=style_image, weight=weight) ) - return self._convert_luma_refs(chain, max_refs=1, auth_kwargs=auth_kwargs) + return await self._convert_luma_refs(chain, max_refs=1, auth_kwargs=auth_kwargs) class LumaImageModifyNode(ComfyNodeABC): @@ -370,7 +371,7 @@ class LumaImageModifyNode(ComfyNodeABC): }, } - def api_call( + async def api_call( self, prompt: str, model: str, @@ -381,7 +382,7 @@ class LumaImageModifyNode(ComfyNodeABC): **kwargs, ): # first, upload image - download_urls = upload_images_to_comfyapi( + download_urls = await upload_images_to_comfyapi( image, max_images=1, auth_kwargs=kwargs, ) image_url = download_urls[0] @@ -402,7 +403,7 @@ class LumaImageModifyNode(ComfyNodeABC): ), auth_kwargs=kwargs, ) - response_api: LumaGeneration = operation.execute() + response_api: LumaGeneration = await operation.execute() operation = PollingOperation( poll_endpoint=ApiEndpoint( @@ -418,10 +419,11 @@ class LumaImageModifyNode(ComfyNodeABC): node_id=unique_id, auth_kwargs=kwargs, ) - response_poll = operation.execute() + response_poll = await operation.execute() - img_response = requests.get(response_poll.assets.image) - img = process_image_response(img_response) + async with aiohttp.ClientSession() as session: + async with session.get(response_poll.assets.image) as img_response: + img = process_image_response(await img_response.content.read()) return (img,) @@ -494,7 +496,7 @@ class LumaTextToVideoGenerationNode(ComfyNodeABC): }, } - def api_call( + async def api_call( self, prompt: str, model: str, @@ -529,7 +531,7 @@ class LumaTextToVideoGenerationNode(ComfyNodeABC): ), auth_kwargs=kwargs, ) - response_api: LumaGeneration = operation.execute() + response_api: LumaGeneration = await operation.execute() if unique_id: PromptServer.instance.send_progress_text(f"Luma video generation started: {response_api.id}", unique_id) @@ -549,10 +551,11 @@ class LumaTextToVideoGenerationNode(ComfyNodeABC): estimated_duration=LUMA_T2V_AVERAGE_DURATION, auth_kwargs=kwargs, ) - response_poll = operation.execute() + response_poll = await operation.execute() - vid_response = requests.get(response_poll.assets.video) - return (VideoFromFile(BytesIO(vid_response.content)),) + async with aiohttp.ClientSession() as session: + async with session.get(response_poll.assets.video) as vid_response: + return (VideoFromFile(BytesIO(await vid_response.content.read())),) class LumaImageToVideoGenerationNode(ComfyNodeABC): @@ -626,7 +629,7 @@ class LumaImageToVideoGenerationNode(ComfyNodeABC): }, } - def api_call( + async def api_call( self, prompt: str, model: str, @@ -644,7 +647,7 @@ class LumaImageToVideoGenerationNode(ComfyNodeABC): raise Exception( "At least one of first_image and last_image requires an input." 
) - keyframes = self._convert_to_keyframes(first_image, last_image, auth_kwargs=kwargs) + keyframes = await self._convert_to_keyframes(first_image, last_image, auth_kwargs=kwargs) duration = duration if model != LumaVideoModel.ray_1_6 else None resolution = resolution if model != LumaVideoModel.ray_1_6 else None @@ -667,7 +670,7 @@ class LumaImageToVideoGenerationNode(ComfyNodeABC): ), auth_kwargs=kwargs, ) - response_api: LumaGeneration = operation.execute() + response_api: LumaGeneration = await operation.execute() if unique_id: PromptServer.instance.send_progress_text(f"Luma video generation started: {response_api.id}", unique_id) @@ -687,12 +690,13 @@ class LumaImageToVideoGenerationNode(ComfyNodeABC): estimated_duration=LUMA_I2V_AVERAGE_DURATION, auth_kwargs=kwargs, ) - response_poll = operation.execute() + response_poll = await operation.execute() - vid_response = requests.get(response_poll.assets.video) - return (VideoFromFile(BytesIO(vid_response.content)),) + async with aiohttp.ClientSession() as session: + async with session.get(response_poll.assets.video) as vid_response: + return (VideoFromFile(BytesIO(await vid_response.content.read())),) - def _convert_to_keyframes( + async def _convert_to_keyframes( self, first_image: torch.Tensor = None, last_image: torch.Tensor = None, @@ -703,12 +707,12 @@ class LumaImageToVideoGenerationNode(ComfyNodeABC): frame0 = None frame1 = None if first_image is not None: - download_urls = upload_images_to_comfyapi( + download_urls = await upload_images_to_comfyapi( first_image, max_images=1, auth_kwargs=auth_kwargs, ) frame0 = LumaImageReference(type="image", url=download_urls[0]) if last_image is not None: - download_urls = upload_images_to_comfyapi( + download_urls = await upload_images_to_comfyapi( last_image, max_images=1, auth_kwargs=auth_kwargs, ) frame1 = LumaImageReference(type="image", url=download_urls[0]) diff --git a/comfy_api_nodes/nodes_minimax.py b/comfy_api_nodes/nodes_minimax.py index 9b46636db..58d2ed90c 100644 --- a/comfy_api_nodes/nodes_minimax.py +++ b/comfy_api_nodes/nodes_minimax.py @@ -86,7 +86,7 @@ class MinimaxTextToVideoNode: API_NODE = True OUTPUT_NODE = True - def generate_video( + async def generate_video( self, prompt_text, seed=0, @@ -104,12 +104,12 @@ class MinimaxTextToVideoNode: # upload image, if passed in image_url = None if image is not None: - image_url = upload_images_to_comfyapi(image, max_images=1, auth_kwargs=kwargs)[0] + image_url = (await upload_images_to_comfyapi(image, max_images=1, auth_kwargs=kwargs))[0] # TODO: figure out how to deal with subject properly, API returns invalid params when using S2V-01 model subject_reference = None if subject is not None: - subject_url = upload_images_to_comfyapi(subject, max_images=1, auth_kwargs=kwargs)[0] + subject_url = (await upload_images_to_comfyapi(subject, max_images=1, auth_kwargs=kwargs))[0] subject_reference = [SubjectReferenceItem(image=subject_url)] @@ -130,7 +130,7 @@ class MinimaxTextToVideoNode: ), auth_kwargs=kwargs, ) - response = video_generate_operation.execute() + response = await video_generate_operation.execute() task_id = response.task_id if not task_id: @@ -151,7 +151,7 @@ class MinimaxTextToVideoNode: node_id=unique_id, auth_kwargs=kwargs, ) - task_result = video_generate_operation.execute() + task_result = await video_generate_operation.execute() file_id = task_result.file_id if file_id is None: @@ -167,7 +167,7 @@ class MinimaxTextToVideoNode: request=EmptyRequest(), auth_kwargs=kwargs, ) - file_result = file_retrieve_operation.execute() 
+ file_result = await file_retrieve_operation.execute() file_url = file_result.file.download_url if file_url is None: @@ -182,7 +182,7 @@ class MinimaxTextToVideoNode: message = f"Result URL: {file_url}" PromptServer.instance.send_progress_text(message, unique_id) - video_io = download_url_to_bytesio(file_url) + video_io = await download_url_to_bytesio(file_url) if video_io is None: error_msg = f"Failed to download video from {file_url}" logging.error(error_msg) diff --git a/comfy_api_nodes/nodes_moonvalley.py b/comfy_api_nodes/nodes_moonvalley.py index 789fcef02..164ca3ea5 100644 --- a/comfy_api_nodes/nodes_moonvalley.py +++ b/comfy_api_nodes/nodes_moonvalley.py @@ -95,14 +95,14 @@ def get_video_url_from_response(response) -> Optional[str]: return None -def poll_until_finished( +async def poll_until_finished( auth_kwargs: dict[str, str], api_endpoint: ApiEndpoint[Any, R], result_url_extractor: Optional[Callable[[R], str]] = None, node_id: Optional[str] = None, ) -> R: """Polls the Moonvalley API endpoint until the task reaches a terminal state, then returns the response.""" - return PollingOperation( + return await PollingOperation( poll_endpoint=api_endpoint, completed_statuses=[ "completed", @@ -394,10 +394,10 @@ class BaseMoonvalleyVideoNode: else: return control_map["Motion Transfer"] - def get_response( + async def get_response( self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None ) -> MoonvalleyPromptResponse: - return poll_until_finished( + return await poll_until_finished( auth_kwargs, ApiEndpoint( path=f"{API_PROMPTS_ENDPOINT}/{task_id}", @@ -507,7 +507,7 @@ class MoonvalleyImg2VideoNode(BaseMoonvalleyVideoNode): RETURN_NAMES = ("video",) DESCRIPTION = "Moonvalley Marey Image to Video Node" - def generate( + async def generate( self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs ): image = kwargs.get("image", None) @@ -532,9 +532,9 @@ class MoonvalleyImg2VideoNode(BaseMoonvalleyVideoNode): # Get MIME type from tensor - assuming PNG format for image tensors mime_type = "image/png" - image_url = upload_images_to_comfyapi( + image_url = (await upload_images_to_comfyapi( image, max_images=1, auth_kwargs=kwargs, mime_type=mime_type - )[0] + ))[0] request = MoonvalleyTextToVideoRequest( image_url=image_url, prompt_text=prompt, inference_params=inference_params @@ -549,14 +549,14 @@ class MoonvalleyImg2VideoNode(BaseMoonvalleyVideoNode): request=request, auth_kwargs=kwargs, ) - task_creation_response = initial_operation.execute() + task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.id - final_response = self.get_response( + final_response = await self.get_response( task_id, auth_kwargs=kwargs, node_id=unique_id ) - video = download_url_to_video_output(final_response.output_url) + video = await download_url_to_video_output(final_response.output_url) return (video,) @@ -609,7 +609,7 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): RETURN_TYPES = ("VIDEO",) RETURN_NAMES = ("video",) - def generate( + async def generate( self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs ): video = kwargs.get("video") @@ -620,7 +620,7 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): video_url = "" if video: validated_video = validate_video_to_video_input(video) - video_url = upload_video_to_comfyapi(validated_video, auth_kwargs=kwargs) + video_url = await upload_video_to_comfyapi(validated_video, auth_kwargs=kwargs) 
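A small note on the MiniMax and Moonvalley upload calls above: the parentheses in (await upload_images_to_comfyapi(...))[0] are required, since subscription binds more tightly than await and indexing the un-awaited call would fail. A self-contained sketch; upload_urls is a placeholder coroutine, not the real helper:

import asyncio


async def upload_urls() -> list[str]:
    # Stand-in for an async upload helper that returns a list of download URLs.
    return ["https://example.com/a.png", "https://example.com/b.png"]


async def main() -> None:
    first = (await upload_urls())[0]  # await first, then index the resulting list
    # first = await upload_urls()[0] would raise TypeError:
    # 'coroutine' object is not subscriptable, because [0] applies before await.
    print(first)


if __name__ == "__main__":
    asyncio.run(main())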
control_type = kwargs.get("control_type") motion_intensity = kwargs.get("motion_intensity") @@ -658,15 +658,15 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): request=request, auth_kwargs=kwargs, ) - task_creation_response = initial_operation.execute() + task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.id - final_response = self.get_response( + final_response = await self.get_response( task_id, auth_kwargs=kwargs, node_id=unique_id ) - video = download_url_to_video_output(final_response.output_url) + video = await download_url_to_video_output(final_response.output_url) return (video,) @@ -688,7 +688,7 @@ class MoonvalleyTxt2VideoNode(BaseMoonvalleyVideoNode): del input_types["optional"][param] return input_types - def generate( + async def generate( self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs ): validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) @@ -717,15 +717,15 @@ class MoonvalleyTxt2VideoNode(BaseMoonvalleyVideoNode): request=request, auth_kwargs=kwargs, ) - task_creation_response = initial_operation.execute() + task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.id - final_response = self.get_response( + final_response = await self.get_response( task_id, auth_kwargs=kwargs, node_id=unique_id ) - video = download_url_to_video_output(final_response.output_url) + video = await download_url_to_video_output(final_response.output_url) return (video,) diff --git a/comfy_api_nodes/nodes_openai.py b/comfy_api_nodes/nodes_openai.py index be1d2de4a..ab3c5363b 100644 --- a/comfy_api_nodes/nodes_openai.py +++ b/comfy_api_nodes/nodes_openai.py @@ -163,7 +163,7 @@ class OpenAIDalle2(ComfyNodeABC): DESCRIPTION = cleandoc(__doc__ or "") API_NODE = True - def api_call( + async def api_call( self, prompt, seed=0, @@ -233,9 +233,9 @@ class OpenAIDalle2(ComfyNodeABC): auth_kwargs=kwargs, ) - response = operation.execute() + response = await operation.execute() - img_tensor = validate_and_cast_response(response, node_id=unique_id) + img_tensor = await validate_and_cast_response(response, node_id=unique_id) return (img_tensor,) @@ -311,7 +311,7 @@ class OpenAIDalle3(ComfyNodeABC): DESCRIPTION = cleandoc(__doc__ or "") API_NODE = True - def api_call( + async def api_call( self, prompt, seed=0, @@ -343,9 +343,9 @@ class OpenAIDalle3(ComfyNodeABC): auth_kwargs=kwargs, ) - response = operation.execute() + response = await operation.execute() - img_tensor = validate_and_cast_response(response, node_id=unique_id) + img_tensor = await validate_and_cast_response(response, node_id=unique_id) return (img_tensor,) @@ -446,7 +446,7 @@ class OpenAIGPTImage1(ComfyNodeABC): DESCRIPTION = cleandoc(__doc__ or "") API_NODE = True - def api_call( + async def api_call( self, prompt, seed=0, @@ -537,9 +537,9 @@ class OpenAIGPTImage1(ComfyNodeABC): auth_kwargs=kwargs, ) - response = operation.execute() + response = await operation.execute() - img_tensor = validate_and_cast_response(response, node_id=unique_id) + img_tensor = await validate_and_cast_response(response, node_id=unique_id) return (img_tensor,) @@ -623,7 +623,7 @@ class OpenAIChatNode(OpenAITextNode): DESCRIPTION = "Generate text responses from an OpenAI model." 
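Across these patches the node entry points (api_call, generate_video, generate_mesh) become coroutines, which is why every call site in the hunks also gains an await: calling a coroutine function without awaiting it only creates a coroutine object and runs nothing. A minimal sketch under assumed names; DemoNode is illustrative, not a ComfyUI class:

import asyncio


class DemoNode:
    async def api_call(self, prompt: str) -> str:
        await asyncio.sleep(0)  # stand-in for awaited network work
        return f"result for {prompt!r}"


async def main() -> None:
    node = DemoNode()
    print(await node.api_call("a red fox"))  # the coroutine must be awaited
    # node.api_call("a red fox") on its own would just return an un-run coroutine object


if __name__ == "__main__":
    asyncio.run(main())  # a synchronous entry point drives the event loop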
- def get_result_response( + async def get_result_response( self, response_id: str, include: Optional[list[Includable]] = None, @@ -639,7 +639,7 @@ class OpenAIChatNode(OpenAITextNode): creation above for more information. """ - return PollingOperation( + return await PollingOperation( poll_endpoint=ApiEndpoint( path=f"{RESPONSES_ENDPOINT}/{response_id}", method=HttpMethod.GET, @@ -784,7 +784,7 @@ class OpenAIChatNode(OpenAITextNode): self.history[session_id] = new_history - def api_call( + async def api_call( self, prompt: str, persist_context: bool, @@ -815,7 +815,7 @@ class OpenAIChatNode(OpenAITextNode): previous_response_id = None # Create response - create_response = SynchronousOperation( + create_response = await SynchronousOperation( endpoint=ApiEndpoint( path=RESPONSES_ENDPOINT, method=HttpMethod.POST, @@ -848,7 +848,7 @@ class OpenAIChatNode(OpenAITextNode): response_id = create_response.id # Get result output - result_response = self.get_result_response(response_id, auth_kwargs=kwargs) + result_response = await self.get_result_response(response_id, auth_kwargs=kwargs) output_text = self.parse_output_text_from_response(result_response) # Update history diff --git a/comfy_api_nodes/nodes_pika.py b/comfy_api_nodes/nodes_pika.py index 1cc708564..a8dc43cb3 100644 --- a/comfy_api_nodes/nodes_pika.py +++ b/comfy_api_nodes/nodes_pika.py @@ -122,7 +122,7 @@ class PikaNodeBase(ComfyNodeABC): FUNCTION = "api_call" RETURN_TYPES = ("VIDEO",) - def poll_for_task_status( + async def poll_for_task_status( self, task_id: str, auth_kwargs: Optional[dict[str, str]] = None, @@ -152,9 +152,9 @@ class PikaNodeBase(ComfyNodeABC): node_id=node_id, estimated_duration=60 ) - return polling_operation.execute() + return await polling_operation.execute() - def execute_task( + async def execute_task( self, initial_operation: SynchronousOperation[R, PikaGenerateResponse], auth_kwargs: Optional[dict[str, str]] = None, @@ -169,14 +169,14 @@ class PikaNodeBase(ComfyNodeABC): Returns: A tuple containing the video file as a VIDEO output. """ - initial_response = initial_operation.execute() + initial_response = await initial_operation.execute() if not is_valid_initial_response(initial_response): error_msg = f"Pika initial request failed. Code: {initial_response.code}, Message: {initial_response.message}, Data: {initial_response.data}" logging.error(error_msg) raise PikaApiError(error_msg) task_id = initial_response.video_id - final_response = self.poll_for_task_status(task_id, auth_kwargs) + final_response = await self.poll_for_task_status(task_id, auth_kwargs) if not is_valid_video_response(final_response): error_msg = ( f"Pika task {task_id} succeeded but no video data found in response." @@ -187,7 +187,7 @@ class PikaNodeBase(ComfyNodeABC): video_url = str(final_response.url) logging.info("Pika task %s succeeded. Video URL: %s", task_id, video_url) - return (download_url_to_video_output(video_url),) + return (await download_url_to_video_output(video_url),) class PikaImageToVideoV2_2(PikaNodeBase): @@ -212,7 +212,7 @@ class PikaImageToVideoV2_2(PikaNodeBase): DESCRIPTION = "Sends an image and prompt to the Pika API v2.2 to generate a video." 
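The Moonvalley poll_until_finished and Pika poll_for_task_status/execute_task helpers above all reduce to the same shape once async: await the status endpoint, sleep without blocking the event loop, repeat until a terminal state. A generic sketch under assumed names; fetch_status and TERMINAL_STATES are placeholders, not the PollingOperation API:

import asyncio
from typing import Any, Awaitable, Callable

TERMINAL_STATES = {"completed", "succeeded", "failed", "cancelled"}


async def poll_until_done(
    fetch_status: Callable[[], Awaitable[dict[str, Any]]],
    interval_s: float = 5.0,
    max_attempts: int = 120,
) -> dict[str, Any]:
    # Query the task until it reports a terminal state, yielding to the loop between attempts.
    for _ in range(max_attempts):
        status = await fetch_status()
        if status.get("status") in TERMINAL_STATES:
            return status
        await asyncio.sleep(interval_s)  # non-blocking, unlike time.sleep()
    raise TimeoutError("task did not finish within the polling budget")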
- def api_call( + async def api_call( self, image: torch.Tensor, prompt_text: str, @@ -251,7 +251,7 @@ class PikaImageToVideoV2_2(PikaNodeBase): auth_kwargs=kwargs, ) - return self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) + return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) class PikaTextToVideoNodeV2_2(PikaNodeBase): @@ -281,7 +281,7 @@ class PikaTextToVideoNodeV2_2(PikaNodeBase): DESCRIPTION = "Sends a text prompt to the Pika API v2.2 to generate a video." - def api_call( + async def api_call( self, prompt_text: str, negative_prompt: str, @@ -311,7 +311,7 @@ class PikaTextToVideoNodeV2_2(PikaNodeBase): content_type="application/x-www-form-urlencoded", ) - return self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) + return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) class PikaScenesV2_2(PikaNodeBase): @@ -361,7 +361,7 @@ class PikaScenesV2_2(PikaNodeBase): DESCRIPTION = "Combine your images to create a video with the objects in them. Upload multiple images as ingredients and generate a high-quality video that incorporates all of them." - def api_call( + async def api_call( self, prompt_text: str, negative_prompt: str, @@ -420,7 +420,7 @@ class PikaScenesV2_2(PikaNodeBase): auth_kwargs=kwargs, ) - return self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) + return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) class PikAdditionsNode(PikaNodeBase): @@ -462,7 +462,7 @@ class PikAdditionsNode(PikaNodeBase): DESCRIPTION = "Add any object or image into your video. Upload a video and specify what you'd like to add to create a seamlessly integrated result." - def api_call( + async def api_call( self, video: VideoInput, image: torch.Tensor, @@ -481,10 +481,10 @@ class PikAdditionsNode(PikaNodeBase): image_bytes_io = tensor_to_bytesio(image) image_bytes_io.seek(0) - pika_files = [ - ("video", ("video.mp4", video_bytes_io, "video/mp4")), - ("image", ("image.png", image_bytes_io, "image/png")), - ] + pika_files = { + "video": ("video.mp4", video_bytes_io, "video/mp4"), + "image": ("image.png", image_bytes_io, "image/png"), + } # Prepare non-file data pika_request_data = PikaBodyGeneratePikadditionsGeneratePikadditionsPost( @@ -506,7 +506,7 @@ class PikAdditionsNode(PikaNodeBase): auth_kwargs=kwargs, ) - return self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) + return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) class PikaSwapsNode(PikaNodeBase): @@ -558,7 +558,7 @@ class PikaSwapsNode(PikaNodeBase): DESCRIPTION = "Swap out any object or region of your video with a new image or object. Define areas to replace either with a mask or coordinates." 
RETURN_TYPES = ("VIDEO",) - def api_call( + async def api_call( self, video: VideoInput, image: torch.Tensor, @@ -587,11 +587,11 @@ class PikaSwapsNode(PikaNodeBase): image_bytes_io = tensor_to_bytesio(image) image_bytes_io.seek(0) - pika_files = [ - ("video", ("video.mp4", video_bytes_io, "video/mp4")), - ("image", ("image.png", image_bytes_io, "image/png")), - ("modifyRegionMask", ("mask.png", mask_bytes_io, "image/png")), - ] + pika_files = { + "video": ("video.mp4", video_bytes_io, "video/mp4"), + "image": ("image.png", image_bytes_io, "image/png"), + "modifyRegionMask": ("mask.png", mask_bytes_io, "image/png"), + } # Prepare non-file data pika_request_data = PikaBodyGeneratePikaswapsGeneratePikaswapsPost( @@ -613,7 +613,7 @@ class PikaSwapsNode(PikaNodeBase): auth_kwargs=kwargs, ) - return self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) + return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) class PikaffectsNode(PikaNodeBase): @@ -664,7 +664,7 @@ class PikaffectsNode(PikaNodeBase): DESCRIPTION = "Generate a video with a specific Pikaffect. Supported Pikaffects: Cake-ify, Crumble, Crush, Decapitate, Deflate, Dissolve, Explode, Eye-pop, Inflate, Levitate, Melt, Peel, Poke, Squish, Ta-da, Tear" - def api_call( + async def api_call( self, image: torch.Tensor, pikaffect: str, @@ -693,7 +693,7 @@ class PikaffectsNode(PikaNodeBase): auth_kwargs=kwargs, ) - return self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) + return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) class PikaStartEndFrameNode2_2(PikaNodeBase): @@ -718,7 +718,7 @@ class PikaStartEndFrameNode2_2(PikaNodeBase): DESCRIPTION = "Generate a video by combining your first and last frame. Upload two images to define the start and end points, and let the AI create a smooth transition between them." 
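The Pikadditions and Pikaswaps hunks above reshape pika_files from requests-style lists of (field, (filename, fileobj, mime)) tuples into plain dicts; with aiohttp such fields are typically packed into a FormData object. The start/end-frame node whose diff follows keeps the list form, presumably because it sends two parts under the same keyFrames field name, which a dict cannot express. A hedged sketch of the dict-to-FormData step; the names, bytes payloads and URL are placeholders:

import aiohttp


def to_form_data(files: dict[str, tuple[str, bytes, str]]) -> aiohttp.FormData:
    # Convert {field: (filename, payload, content_type)} into multipart form data.
    form = aiohttp.FormData()
    for field, (filename, payload, content_type) in files.items():
        form.add_field(field, payload, filename=filename, content_type=content_type)
    return form


async def post_files(url: str, files: dict[str, tuple[str, bytes, str]]) -> int:
    async with aiohttp.ClientSession() as session:
        async with session.post(url, data=to_form_data(files)) as resp:
            return resp.status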
- def api_call( + async def api_call( self, image_start: torch.Tensor, image_end: torch.Tensor, @@ -732,10 +732,7 @@ class PikaStartEndFrameNode2_2(PikaNodeBase): ) -> tuple[VideoFromFile]: pika_files = [ - ( - "keyFrames", - ("image_start.png", tensor_to_bytesio(image_start), "image/png"), - ), + ("keyFrames", ("image_start.png", tensor_to_bytesio(image_start), "image/png")), ("keyFrames", ("image_end.png", tensor_to_bytesio(image_end), "image/png")), ] @@ -758,7 +755,7 @@ class PikaStartEndFrameNode2_2(PikaNodeBase): auth_kwargs=kwargs, ) - return self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) + return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) NODE_CLASS_MAPPINGS = { diff --git a/comfy_api_nodes/nodes_pixverse.py b/comfy_api_nodes/nodes_pixverse.py index ef4a9a802..7c5a52feb 100644 --- a/comfy_api_nodes/nodes_pixverse.py +++ b/comfy_api_nodes/nodes_pixverse.py @@ -30,7 +30,7 @@ from comfy.comfy_types.node_typing import IO, ComfyNodeABC from comfy_api.input_impl import VideoFromFile import torch -import requests +import aiohttp from io import BytesIO @@ -47,7 +47,7 @@ def get_video_url_from_response( return str(response.Resp.url) -def upload_image_to_pixverse(image: torch.Tensor, auth_kwargs=None): +async def upload_image_to_pixverse(image: torch.Tensor, auth_kwargs=None): # first, upload image to Pixverse and get image id to use in actual generation call files = {"image": tensor_to_bytesio(image)} operation = SynchronousOperation( @@ -62,7 +62,7 @@ def upload_image_to_pixverse(image: torch.Tensor, auth_kwargs=None): content_type="multipart/form-data", auth_kwargs=auth_kwargs, ) - response_upload: PixverseImageUploadResponse = operation.execute() + response_upload: PixverseImageUploadResponse = await operation.execute() if response_upload.Resp is None: raise Exception( @@ -164,7 +164,7 @@ class PixverseTextToVideoNode(ComfyNodeABC): }, } - def api_call( + async def api_call( self, prompt: str, aspect_ratio: str, @@ -205,7 +205,7 @@ class PixverseTextToVideoNode(ComfyNodeABC): ), auth_kwargs=kwargs, ) - response_api = operation.execute() + response_api = await operation.execute() if response_api.Resp is None: raise Exception(f"PixVerse request failed: '{response_api.ErrMsg}'") @@ -229,11 +229,11 @@ class PixverseTextToVideoNode(ComfyNodeABC): result_url_extractor=get_video_url_from_response, estimated_duration=AVERAGE_DURATION_T2V, ) - response_poll = operation.execute() + response_poll = await operation.execute() - vid_response = requests.get(response_poll.Resp.url) - - return (VideoFromFile(BytesIO(vid_response.content)),) + async with aiohttp.ClientSession() as session: + async with session.get(response_poll.Resp.url) as vid_response: + return (VideoFromFile(BytesIO(await vid_response.content.read())),) class PixverseImageToVideoNode(ComfyNodeABC): @@ -302,7 +302,7 @@ class PixverseImageToVideoNode(ComfyNodeABC): }, } - def api_call( + async def api_call( self, image: torch.Tensor, prompt: str, @@ -316,7 +316,7 @@ class PixverseImageToVideoNode(ComfyNodeABC): **kwargs, ): validate_string(prompt, strip_whitespace=False) - img_id = upload_image_to_pixverse(image, auth_kwargs=kwargs) + img_id = await upload_image_to_pixverse(image, auth_kwargs=kwargs) # 1080p is limited to 5 seconds duration # only normal motion_mode supported for 1080p or for non-5 second duration @@ -345,7 +345,7 @@ class PixverseImageToVideoNode(ComfyNodeABC): ), auth_kwargs=kwargs, ) - response_api = operation.execute() + response_api = await 
operation.execute() if response_api.Resp is None: raise Exception(f"PixVerse request failed: '{response_api.ErrMsg}'") @@ -369,10 +369,11 @@ class PixverseImageToVideoNode(ComfyNodeABC): result_url_extractor=get_video_url_from_response, estimated_duration=AVERAGE_DURATION_I2V, ) - response_poll = operation.execute() + response_poll = await operation.execute() - vid_response = requests.get(response_poll.Resp.url) - return (VideoFromFile(BytesIO(vid_response.content)),) + async with aiohttp.ClientSession() as session: + async with session.get(response_poll.Resp.url) as vid_response: + return (VideoFromFile(BytesIO(await vid_response.content.read())),) class PixverseTransitionVideoNode(ComfyNodeABC): @@ -436,7 +437,7 @@ class PixverseTransitionVideoNode(ComfyNodeABC): }, } - def api_call( + async def api_call( self, first_frame: torch.Tensor, last_frame: torch.Tensor, @@ -450,8 +451,8 @@ class PixverseTransitionVideoNode(ComfyNodeABC): **kwargs, ): validate_string(prompt, strip_whitespace=False) - first_frame_id = upload_image_to_pixverse(first_frame, auth_kwargs=kwargs) - last_frame_id = upload_image_to_pixverse(last_frame, auth_kwargs=kwargs) + first_frame_id = await upload_image_to_pixverse(first_frame, auth_kwargs=kwargs) + last_frame_id = await upload_image_to_pixverse(last_frame, auth_kwargs=kwargs) # 1080p is limited to 5 seconds duration # only normal motion_mode supported for 1080p or for non-5 second duration @@ -480,7 +481,7 @@ class PixverseTransitionVideoNode(ComfyNodeABC): ), auth_kwargs=kwargs, ) - response_api = operation.execute() + response_api = await operation.execute() if response_api.Resp is None: raise Exception(f"PixVerse request failed: '{response_api.ErrMsg}'") @@ -504,10 +505,11 @@ class PixverseTransitionVideoNode(ComfyNodeABC): result_url_extractor=get_video_url_from_response, estimated_duration=AVERAGE_DURATION_T2V, ) - response_poll = operation.execute() + response_poll = await operation.execute() - vid_response = requests.get(response_poll.Resp.url) - return (VideoFromFile(BytesIO(vid_response.content)),) + async with aiohttp.ClientSession() as session: + async with session.get(response_poll.Resp.url) as vid_response: + return (VideoFromFile(BytesIO(await vid_response.content.read())),) NODE_CLASS_MAPPINGS = { diff --git a/comfy_api_nodes/nodes_recraft.py b/comfy_api_nodes/nodes_recraft.py index e369c4b7e..c8516b368 100644 --- a/comfy_api_nodes/nodes_recraft.py +++ b/comfy_api_nodes/nodes_recraft.py @@ -37,7 +37,7 @@ from io import BytesIO from PIL import UnidentifiedImageError -def handle_recraft_file_request( +async def handle_recraft_file_request( image: torch.Tensor, path: str, mask: torch.Tensor=None, @@ -71,13 +71,13 @@ def handle_recraft_file_request( auth_kwargs=auth_kwargs, multipart_parser=recraft_multipart_parser, ) - response: RecraftImageGenerationResponse = operation.execute() + response: RecraftImageGenerationResponse = await operation.execute() all_bytesio = [] if response.image is not None: - all_bytesio.append(download_url_to_bytesio(response.image.url, timeout=timeout)) + all_bytesio.append(await download_url_to_bytesio(response.image.url, timeout=timeout)) else: for data in response.data: - all_bytesio.append(download_url_to_bytesio(data.url, timeout=timeout)) + all_bytesio.append(await download_url_to_bytesio(data.url, timeout=timeout)) return all_bytesio @@ -395,7 +395,7 @@ class RecraftTextToImageNode: }, } - def api_call( + async def api_call( self, prompt: str, size: str, @@ -439,7 +439,7 @@ class RecraftTextToImageNode: ), 
auth_kwargs=kwargs, ) - response: RecraftImageGenerationResponse = operation.execute() + response: RecraftImageGenerationResponse = await operation.execute() images = [] urls = [] for data in response.data: @@ -451,7 +451,7 @@ class RecraftTextToImageNode: f"Result URL: {urls_string}", unique_id ) image = bytesio_to_image_tensor( - download_url_to_bytesio(data.url, timeout=1024) + await download_url_to_bytesio(data.url, timeout=1024) ) if len(image.shape) < 4: image = image.unsqueeze(0) @@ -538,7 +538,7 @@ class RecraftImageToImageNode: }, } - def api_call( + async def api_call( self, image: torch.Tensor, prompt: str, @@ -578,7 +578,7 @@ class RecraftImageToImageNode: total = image.shape[0] pbar = ProgressBar(total) for i in range(total): - sub_bytes = handle_recraft_file_request( + sub_bytes = await handle_recraft_file_request( image=image[i], path="/proxy/recraft/images/imageToImage", request=request, @@ -654,7 +654,7 @@ class RecraftImageInpaintingNode: }, } - def api_call( + async def api_call( self, image: torch.Tensor, mask: torch.Tensor, @@ -690,7 +690,7 @@ class RecraftImageInpaintingNode: total = image.shape[0] pbar = ProgressBar(total) for i in range(total): - sub_bytes = handle_recraft_file_request( + sub_bytes = await handle_recraft_file_request( image=image[i], mask=mask[i:i+1], path="/proxy/recraft/images/inpaint", @@ -779,7 +779,7 @@ class RecraftTextToVectorNode: }, } - def api_call( + async def api_call( self, prompt: str, substyle: str, @@ -821,7 +821,7 @@ class RecraftTextToVectorNode: ), auth_kwargs=kwargs, ) - response: RecraftImageGenerationResponse = operation.execute() + response: RecraftImageGenerationResponse = await operation.execute() svg_data = [] urls = [] for data in response.data: @@ -831,7 +831,7 @@ class RecraftTextToVectorNode: PromptServer.instance.send_progress_text( f"Result URL: {' '.join(urls)}", unique_id ) - svg_data.append(download_url_to_bytesio(data.url, timeout=1024)) + svg_data.append(await download_url_to_bytesio(data.url, timeout=1024)) return (SVG(svg_data),) @@ -861,7 +861,7 @@ class RecraftVectorizeImageNode: }, } - def api_call( + async def api_call( self, image: torch.Tensor, **kwargs, @@ -870,7 +870,7 @@ class RecraftVectorizeImageNode: total = image.shape[0] pbar = ProgressBar(total) for i in range(total): - sub_bytes = handle_recraft_file_request( + sub_bytes = await handle_recraft_file_request( image=image[i], path="/proxy/recraft/images/vectorize", auth_kwargs=kwargs, @@ -942,7 +942,7 @@ class RecraftReplaceBackgroundNode: }, } - def api_call( + async def api_call( self, image: torch.Tensor, prompt: str, @@ -973,7 +973,7 @@ class RecraftReplaceBackgroundNode: total = image.shape[0] pbar = ProgressBar(total) for i in range(total): - sub_bytes = handle_recraft_file_request( + sub_bytes = await handle_recraft_file_request( image=image[i], path="/proxy/recraft/images/replaceBackground", request=request, @@ -1011,7 +1011,7 @@ class RecraftRemoveBackgroundNode: }, } - def api_call( + async def api_call( self, image: torch.Tensor, **kwargs, @@ -1020,7 +1020,7 @@ class RecraftRemoveBackgroundNode: total = image.shape[0] pbar = ProgressBar(total) for i in range(total): - sub_bytes = handle_recraft_file_request( + sub_bytes = await handle_recraft_file_request( image=image[i], path="/proxy/recraft/images/removeBackground", auth_kwargs=kwargs, @@ -1062,7 +1062,7 @@ class RecraftCrispUpscaleNode: }, } - def api_call( + async def api_call( self, image: torch.Tensor, **kwargs, @@ -1071,7 +1071,7 @@ class RecraftCrispUpscaleNode: total = 
image.shape[0] pbar = ProgressBar(total) for i in range(total): - sub_bytes = handle_recraft_file_request( + sub_bytes = await handle_recraft_file_request( image=image[i], path=self.RECRAFT_PATH, auth_kwargs=kwargs, diff --git a/comfy_api_nodes/nodes_rodin.py b/comfy_api_nodes/nodes_rodin.py index 67f90478c..c89d087e5 100644 --- a/comfy_api_nodes/nodes_rodin.py +++ b/comfy_api_nodes/nodes_rodin.py @@ -9,11 +9,10 @@ from __future__ import annotations from inspect import cleandoc from comfy.comfy_types.node_typing import IO import folder_paths as comfy_paths -import requests +import aiohttp import os import datetime -import shutil -import time +import asyncio import io import logging import math @@ -66,7 +65,6 @@ def create_task_error(response: Rodin3DGenerateResponse): return hasattr(response, "error") - class Rodin3DAPI: """ Generate 3D Assets using Rodin API @@ -123,8 +121,8 @@ class Rodin3DAPI: else: return "Generating" - def CreateGenerateTask(self, images=None, seed=1, material="PBR", quality="medium", tier="Regular", mesh_mode="Quad", **kwargs): - if images == None: + async def create_generate_task(self, images=None, seed=1, material="PBR", quality="medium", tier="Regular", mesh_mode="Quad", **kwargs): + if images is None: raise Exception("Rodin 3D generate requires at least 1 image.") if len(images) >= 5: raise Exception("Rodin 3D generate requires up to 5 image.") @@ -155,7 +153,7 @@ class Rodin3DAPI: auth_kwargs=kwargs, ) - response = operation.execute() + response = await operation.execute() if create_task_error(response): error_message = f"Rodin3D Create 3D generate Task Failed. Message: {response.message}, error: {response.error}" @@ -168,7 +166,7 @@ class Rodin3DAPI: logging.info(f"[ Rodin3D API - Submit Jobs ] UUID: {task_uuid}") return task_uuid, subscription_key - def poll_for_task_status(self, subscription_key, **kwargs) -> Rodin3DCheckStatusResponse: + async def poll_for_task_status(self, subscription_key, **kwargs) -> Rodin3DCheckStatusResponse: path = "/proxy/rodin/api/v2/status" @@ -191,11 +189,9 @@ class Rodin3DAPI: logging.info("[ Rodin3D API - CheckStatus ] Generate Start!") - return poll_operation.execute() + return await poll_operation.execute() - - - def GetRodinDownloadList(self, uuid, **kwargs) -> Rodin3DDownloadResponse: + async def get_rodin_download_list(self, uuid, **kwargs) -> Rodin3DDownloadResponse: logging.info("[ Rodin3D API - Downloading ] Generate Successfully!") path = "/proxy/rodin/api/v2/download" @@ -212,53 +208,59 @@ class Rodin3DAPI: auth_kwargs=kwargs ) - return operation.execute() + return await operation.execute() - def GetQualityAndMode(self, PolyCount): - if PolyCount == "200K-Triangle": + def get_quality_mode(self, poly_count): + if poly_count == "200K-Triangle": mesh_mode = "Raw" quality = "medium" else: mesh_mode = "Quad" - if PolyCount == "4K-Quad": + if poly_count == "4K-Quad": quality = "extra-low" - elif PolyCount == "8K-Quad": + elif poly_count == "8K-Quad": quality = "low" - elif PolyCount == "18K-Quad": + elif poly_count == "18K-Quad": quality = "medium" - elif PolyCount == "50K-Quad": + elif poly_count == "50K-Quad": quality = "high" else: quality = "medium" return mesh_mode, quality - def DownLoadFiles(self, Url_List): - Save_path = os.path.join(comfy_paths.get_output_directory(), "Rodin3D", datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) - os.makedirs(Save_path, exist_ok=True) + async def download_files(self, url_list): + save_path = os.path.join(comfy_paths.get_output_directory(), "Rodin3D", 
datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) + os.makedirs(save_path, exist_ok=True) model_file_path = None - for Item in Url_List.list: - url = Item.url - file_name = Item.name - file_path = os.path.join(Save_path, file_name) - if file_path.endswith(".glb"): - model_file_path = file_path - logging.info(f"[ Rodin3D API - download_files ] Downloading file: {file_path}") - max_retries = 5 - for attempt in range(max_retries): - try: - with requests.get(url, stream=True) as r: - r.raise_for_status() - with open(file_path, "wb") as f: - shutil.copyfileobj(r.raw, f) - break - except Exception as e: - logging.info(f"[ Rodin3D API - download_files ] Error downloading {file_path}:{e}") - if attempt < max_retries - 1: - logging.info("Retrying...") - time.sleep(2) - else: - logging.info(f"[ Rodin3D API - download_files ] Failed to download {file_path} after {max_retries} attempts.") + async with aiohttp.ClientSession() as session: + for i in url_list.list: + url = i.url + file_name = i.name + file_path = os.path.join(save_path, file_name) + if file_path.endswith(".glb"): + model_file_path = file_path + logging.info(f"[ Rodin3D API - download_files ] Downloading file: {file_path}") + max_retries = 5 + for attempt in range(max_retries): + try: + async with session.get(url) as resp: + resp.raise_for_status() + with open(file_path, "wb") as f: + async for chunk in resp.content.iter_chunked(32 * 1024): + f.write(chunk) + break + except Exception as e: + logging.info(f"[ Rodin3D API - download_files ] Error downloading {file_path}:{e}") + if attempt < max_retries - 1: + logging.info("Retrying...") + await asyncio.sleep(2) + else: + logging.info( + "[ Rodin3D API - download_files ] Failed to download %s after %s attempts.", + file_path, + max_retries, + ) return model_file_path @@ -285,7 +287,7 @@ class Rodin3D_Regular(Rodin3DAPI): }, } - def api_call( + async def api_call( self, Images, Seed, @@ -298,14 +300,17 @@ class Rodin3D_Regular(Rodin3DAPI): m_images = [] for i in range(num_images): m_images.append(Images[i]) - mesh_mode, quality = self.GetQualityAndMode(Polygon_count) - task_uuid, subscription_key = self.CreateGenerateTask(images=m_images, seed=Seed, material=Material_Type, quality=quality, tier=tier, mesh_mode=mesh_mode, **kwargs) - self.poll_for_task_status(subscription_key, **kwargs) - Download_List = self.GetRodinDownloadList(task_uuid, **kwargs) - model = self.DownLoadFiles(Download_List) + mesh_mode, quality = self.get_quality_mode(Polygon_count) + task_uuid, subscription_key = await self.create_generate_task(images=m_images, seed=Seed, material=Material_Type, + quality=quality, tier=tier, mesh_mode=mesh_mode, + **kwargs) + await self.poll_for_task_status(subscription_key, **kwargs) + download_list = await self.get_rodin_download_list(task_uuid, **kwargs) + model = await self.download_files(download_list) return (model,) + class Rodin3D_Detail(Rodin3DAPI): @classmethod def INPUT_TYPES(s): @@ -328,7 +333,7 @@ class Rodin3D_Detail(Rodin3DAPI): }, } - def api_call( + async def api_call( self, Images, Seed, @@ -341,14 +346,17 @@ class Rodin3D_Detail(Rodin3DAPI): m_images = [] for i in range(num_images): m_images.append(Images[i]) - mesh_mode, quality = self.GetQualityAndMode(Polygon_count) - task_uuid, subscription_key = self.CreateGenerateTask(images=m_images, seed=Seed, material=Material_Type, quality=quality, tier=tier, mesh_mode=mesh_mode, **kwargs) - self.poll_for_task_status(subscription_key, **kwargs) - Download_List = self.GetRodinDownloadList(task_uuid, **kwargs) - model = 
self.DownLoadFiles(Download_List) + mesh_mode, quality = self.get_quality_mode(Polygon_count) + task_uuid, subscription_key = await self.create_generate_task(images=m_images, seed=Seed, material=Material_Type, + quality=quality, tier=tier, mesh_mode=mesh_mode, + **kwargs) + await self.poll_for_task_status(subscription_key, **kwargs) + download_list = await self.get_rodin_download_list(task_uuid, **kwargs) + model = await self.download_files(download_list) return (model,) + class Rodin3D_Smooth(Rodin3DAPI): @classmethod def INPUT_TYPES(s): @@ -371,7 +379,7 @@ class Rodin3D_Smooth(Rodin3DAPI): }, } - def api_call( + async def api_call( self, Images, Seed, @@ -384,14 +392,17 @@ class Rodin3D_Smooth(Rodin3DAPI): m_images = [] for i in range(num_images): m_images.append(Images[i]) - mesh_mode, quality = self.GetQualityAndMode(Polygon_count) - task_uuid, subscription_key = self.CreateGenerateTask(images=m_images, seed=Seed, material=Material_Type, quality=quality, tier=tier, mesh_mode=mesh_mode, **kwargs) - self.poll_for_task_status(subscription_key, **kwargs) - Download_List = self.GetRodinDownloadList(task_uuid, **kwargs) - model = self.DownLoadFiles(Download_List) + mesh_mode, quality = self.get_quality_mode(Polygon_count) + task_uuid, subscription_key = await self.create_generate_task(images=m_images, seed=Seed, material=Material_Type, + quality=quality, tier=tier, mesh_mode=mesh_mode, + **kwargs) + await self.poll_for_task_status(subscription_key, **kwargs) + download_list = await self.get_rodin_download_list(task_uuid, **kwargs) + model = await self.download_files(download_list) return (model,) + class Rodin3D_Sketch(Rodin3DAPI): @classmethod def INPUT_TYPES(s): @@ -423,7 +434,7 @@ class Rodin3D_Sketch(Rodin3DAPI): }, } - def api_call( + async def api_call( self, Images, Seed, @@ -437,10 +448,12 @@ class Rodin3D_Sketch(Rodin3DAPI): material_type = "PBR" quality = "medium" mesh_mode = "Quad" - task_uuid, subscription_key = self.CreateGenerateTask(images=m_images, seed=Seed, material=material_type, quality=quality, tier=tier, mesh_mode=mesh_mode, **kwargs) - self.poll_for_task_status(subscription_key, **kwargs) - Download_List = self.GetRodinDownloadList(task_uuid, **kwargs) - model = self.DownLoadFiles(Download_List) + task_uuid, subscription_key = await self.create_generate_task( + images=m_images, seed=Seed, material=material_type, quality=quality, tier=tier, mesh_mode=mesh_mode, **kwargs + ) + await self.poll_for_task_status(subscription_key, **kwargs) + download_list = await self.get_rodin_download_list(task_uuid, **kwargs) + model = await self.download_files(download_list) return (model,) diff --git a/comfy_api_nodes/nodes_runway.py b/comfy_api_nodes/nodes_runway.py index af4b321f9..98024a9fa 100644 --- a/comfy_api_nodes/nodes_runway.py +++ b/comfy_api_nodes/nodes_runway.py @@ -99,14 +99,14 @@ def validate_input_image(image: torch.Tensor) -> bool: return image.shape[2] < 8000 and image.shape[1] < 8000 -def poll_until_finished( +async def poll_until_finished( auth_kwargs: dict[str, str], api_endpoint: ApiEndpoint[Any, TaskStatusResponse], estimated_duration: Optional[int] = None, node_id: Optional[str] = None, ) -> TaskStatusResponse: """Polls the Runway API endpoint until the task reaches a terminal state, then returns the response.""" - return PollingOperation( + return await PollingOperation( poll_endpoint=api_endpoint, completed_statuses=[ TaskStatus.SUCCEEDED.value, @@ -115,7 +115,7 @@ def poll_until_finished( TaskStatus.FAILED.value, TaskStatus.CANCELLED.value, ], - 
status_extractor=lambda response: (response.status.value), + status_extractor=lambda response: response.status.value, auth_kwargs=auth_kwargs, result_url_extractor=get_video_url_from_task_status, estimated_duration=estimated_duration, @@ -167,11 +167,11 @@ class RunwayVideoGenNode(ComfyNodeABC): ) return True - def get_response( + async def get_response( self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None ) -> RunwayImageToVideoResponse: """Poll the task status until it is finished then get the response.""" - return poll_until_finished( + return await poll_until_finished( auth_kwargs, ApiEndpoint( path=f"{PATH_GET_TASK_STATUS}/{task_id}", @@ -183,7 +183,7 @@ class RunwayVideoGenNode(ComfyNodeABC): node_id=node_id, ) - def generate_video( + async def generate_video( self, request: RunwayImageToVideoRequest, auth_kwargs: dict[str, str], @@ -200,15 +200,15 @@ class RunwayVideoGenNode(ComfyNodeABC): auth_kwargs=auth_kwargs, ) - initial_response = initial_operation.execute() + initial_response = await initial_operation.execute() self.validate_task_created(initial_response) task_id = initial_response.id - final_response = self.get_response(task_id, auth_kwargs, node_id) + final_response = await self.get_response(task_id, auth_kwargs, node_id) self.validate_response(final_response) video_url = get_video_url_from_task_status(final_response) - return (download_url_to_video_output(video_url),) + return (await download_url_to_video_output(video_url),) class RunwayImageToVideoNodeGen3a(RunwayVideoGenNode): @@ -250,7 +250,7 @@ class RunwayImageToVideoNodeGen3a(RunwayVideoGenNode): }, } - def api_call( + async def api_call( self, prompt: str, start_frame: torch.Tensor, @@ -265,7 +265,7 @@ class RunwayImageToVideoNodeGen3a(RunwayVideoGenNode): validate_input_image(start_frame) # Upload image - download_urls = upload_images_to_comfyapi( + download_urls = await upload_images_to_comfyapi( start_frame, max_images=1, mime_type="image/png", @@ -274,7 +274,7 @@ class RunwayImageToVideoNodeGen3a(RunwayVideoGenNode): if len(download_urls) != 1: raise RunwayApiError("Failed to upload one or more images to comfy api.") - return self.generate_video( + return await self.generate_video( RunwayImageToVideoRequest( promptText=prompt, seed=seed, @@ -333,7 +333,7 @@ class RunwayImageToVideoNodeGen4(RunwayVideoGenNode): }, } - def api_call( + async def api_call( self, prompt: str, start_frame: torch.Tensor, @@ -348,7 +348,7 @@ class RunwayImageToVideoNodeGen4(RunwayVideoGenNode): validate_input_image(start_frame) # Upload image - download_urls = upload_images_to_comfyapi( + download_urls = await upload_images_to_comfyapi( start_frame, max_images=1, mime_type="image/png", @@ -357,7 +357,7 @@ class RunwayImageToVideoNodeGen4(RunwayVideoGenNode): if len(download_urls) != 1: raise RunwayApiError("Failed to upload one or more images to comfy api.") - return self.generate_video( + return await self.generate_video( RunwayImageToVideoRequest( promptText=prompt, seed=seed, @@ -382,10 +382,10 @@ class RunwayFirstLastFrameNode(RunwayVideoGenNode): DESCRIPTION = "Upload first and last keyframes, draft a prompt, and generate a video. More complex transitions, such as cases where the Last frame is completely different from the First frame, may benefit from the longer 10s duration. This would give the generation more time to smoothly transition between the two inputs. 
Before diving in, review these best practices to ensure that your input selections will set your generation up for success: https://help.runwayml.com/hc/en-us/articles/34170748696595-Creating-with-Keyframes-on-Gen-3." - def get_response( + async def get_response( self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None ) -> RunwayImageToVideoResponse: - return poll_until_finished( + return await poll_until_finished( auth_kwargs, ApiEndpoint( path=f"{PATH_GET_TASK_STATUS}/{task_id}", @@ -437,7 +437,7 @@ class RunwayFirstLastFrameNode(RunwayVideoGenNode): }, } - def api_call( + async def api_call( self, prompt: str, start_frame: torch.Tensor, @@ -455,7 +455,7 @@ class RunwayFirstLastFrameNode(RunwayVideoGenNode): # Upload images stacked_input_images = image_tensor_pair_to_batch(start_frame, end_frame) - download_urls = upload_images_to_comfyapi( + download_urls = await upload_images_to_comfyapi( stacked_input_images, max_images=2, mime_type="image/png", @@ -464,7 +464,7 @@ class RunwayFirstLastFrameNode(RunwayVideoGenNode): if len(download_urls) != 2: raise RunwayApiError("Failed to upload one or more images to comfy api.") - return self.generate_video( + return await self.generate_video( RunwayImageToVideoRequest( promptText=prompt, seed=seed, @@ -543,11 +543,11 @@ class RunwayTextToImageNode(ComfyNodeABC): ) return True - def get_response( + async def get_response( self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None ) -> TaskStatusResponse: """Poll the task status until it is finished then get the response.""" - return poll_until_finished( + return await poll_until_finished( auth_kwargs, ApiEndpoint( path=f"{PATH_GET_TASK_STATUS}/{task_id}", @@ -559,7 +559,7 @@ class RunwayTextToImageNode(ComfyNodeABC): node_id=node_id, ) - def api_call( + async def api_call( self, prompt: str, ratio: str, @@ -574,7 +574,7 @@ class RunwayTextToImageNode(ComfyNodeABC): reference_images = None if reference_image is not None: validate_input_image(reference_image) - download_urls = upload_images_to_comfyapi( + download_urls = await upload_images_to_comfyapi( reference_image, max_images=1, mime_type="image/png", @@ -605,19 +605,19 @@ class RunwayTextToImageNode(ComfyNodeABC): auth_kwargs=kwargs, ) - initial_response = initial_operation.execute() + initial_response = await initial_operation.execute() self.validate_task_created(initial_response) task_id = initial_response.id # Poll for completion - final_response = self.get_response( + final_response = await self.get_response( task_id, auth_kwargs=kwargs, node_id=unique_id ) self.validate_response(final_response) # Download and return image image_url = get_image_url_from_task_status(final_response) - return (download_url_to_image_tensor(image_url),) + return (await download_url_to_image_tensor(image_url),) NODE_CLASS_MAPPINGS = { diff --git a/comfy_api_nodes/nodes_stability.py b/comfy_api_nodes/nodes_stability.py index 02e421678..31309d831 100644 --- a/comfy_api_nodes/nodes_stability.py +++ b/comfy_api_nodes/nodes_stability.py @@ -124,7 +124,7 @@ class StabilityStableImageUltraNode: }, } - def api_call(self, prompt: str, aspect_ratio: str, style_preset: str, seed: int, + async def api_call(self, prompt: str, aspect_ratio: str, style_preset: str, seed: int, negative_prompt: str=None, image: torch.Tensor = None, image_denoise: float=None, **kwargs): validate_string(prompt, strip_whitespace=False) @@ -163,7 +163,7 @@ class StabilityStableImageUltraNode: content_type="multipart/form-data", auth_kwargs=kwargs, ) - 
response_api = operation.execute() + response_api = await operation.execute() if response_api.finish_reason != "SUCCESS": raise Exception(f"Stable Image Ultra generation failed: {response_api.finish_reason}.") @@ -257,7 +257,7 @@ class StabilityStableImageSD_3_5Node: }, } - def api_call(self, model: str, prompt: str, aspect_ratio: str, style_preset: str, seed: int, cfg_scale: float, + async def api_call(self, model: str, prompt: str, aspect_ratio: str, style_preset: str, seed: int, cfg_scale: float, negative_prompt: str=None, image: torch.Tensor = None, image_denoise: float=None, **kwargs): validate_string(prompt, strip_whitespace=False) @@ -302,7 +302,7 @@ class StabilityStableImageSD_3_5Node: content_type="multipart/form-data", auth_kwargs=kwargs, ) - response_api = operation.execute() + response_api = await operation.execute() if response_api.finish_reason != "SUCCESS": raise Exception(f"Stable Diffusion 3.5 Image generation failed: {response_api.finish_reason}.") @@ -374,7 +374,7 @@ class StabilityUpscaleConservativeNode: }, } - def api_call(self, image: torch.Tensor, prompt: str, creativity: float, seed: int, negative_prompt: str=None, + async def api_call(self, image: torch.Tensor, prompt: str, creativity: float, seed: int, negative_prompt: str=None, **kwargs): validate_string(prompt, strip_whitespace=False) image_binary = tensor_to_bytesio(image, total_pixels=1024*1024).read() @@ -403,7 +403,7 @@ class StabilityUpscaleConservativeNode: content_type="multipart/form-data", auth_kwargs=kwargs, ) - response_api = operation.execute() + response_api = await operation.execute() if response_api.finish_reason != "SUCCESS": raise Exception(f"Stability Upscale Conservative generation failed: {response_api.finish_reason}.") @@ -480,7 +480,7 @@ class StabilityUpscaleCreativeNode: }, } - def api_call(self, image: torch.Tensor, prompt: str, creativity: float, style_preset: str, seed: int, negative_prompt: str=None, + async def api_call(self, image: torch.Tensor, prompt: str, creativity: float, style_preset: str, seed: int, negative_prompt: str=None, **kwargs): validate_string(prompt, strip_whitespace=False) image_binary = tensor_to_bytesio(image, total_pixels=1024*1024).read() @@ -512,7 +512,7 @@ class StabilityUpscaleCreativeNode: content_type="multipart/form-data", auth_kwargs=kwargs, ) - response_api = operation.execute() + response_api = await operation.execute() operation = PollingOperation( poll_endpoint=ApiEndpoint( @@ -527,7 +527,7 @@ class StabilityUpscaleCreativeNode: status_extractor=lambda x: get_async_dummy_status(x), auth_kwargs=kwargs, ) - response_poll: StabilityResultsGetResponse = operation.execute() + response_poll: StabilityResultsGetResponse = await operation.execute() if response_poll.finish_reason != "SUCCESS": raise Exception(f"Stability Upscale Creative generation failed: {response_poll.finish_reason}.") @@ -563,8 +563,7 @@ class StabilityUpscaleFastNode: }, } - def api_call(self, image: torch.Tensor, - **kwargs): + async def api_call(self, image: torch.Tensor, **kwargs): image_binary = tensor_to_bytesio(image, total_pixels=4096*4096).read() files = { @@ -583,7 +582,7 @@ class StabilityUpscaleFastNode: content_type="multipart/form-data", auth_kwargs=kwargs, ) - response_api = operation.execute() + response_api = await operation.execute() if response_api.finish_reason != "SUCCESS": raise Exception(f"Stability Upscale Fast failed: {response_api.finish_reason}.") diff --git a/comfy_api_nodes/nodes_tripo.py b/comfy_api_nodes/nodes_tripo.py index 65f3b21f5..d08cf9007 100644 --- 
a/comfy_api_nodes/nodes_tripo.py +++ b/comfy_api_nodes/nodes_tripo.py @@ -37,8 +37,8 @@ from comfy_api_nodes.apinode_utils import ( ) -def upload_image_to_tripo(image, **kwargs): - urls = upload_images_to_comfyapi(image, max_images=1, auth_kwargs=kwargs) +async def upload_image_to_tripo(image, **kwargs): + urls = await upload_images_to_comfyapi(image, max_images=1, auth_kwargs=kwargs) return TripoFileReference(TripoUrlReference(url=urls[0], type="jpeg")) def get_model_url_from_response(response: TripoTaskResponse) -> str: @@ -49,7 +49,7 @@ def get_model_url_from_response(response: TripoTaskResponse) -> str: raise RuntimeError(f"Failed to get model url from response: {response}") -def poll_until_finished( +async def poll_until_finished( kwargs: dict[str, str], response: TripoTaskResponse, ) -> tuple[str, str]: @@ -57,7 +57,7 @@ def poll_until_finished( if response.code != 0: raise RuntimeError(f"Failed to generate mesh: {response.error}") task_id = response.data.task_id - response_poll = PollingOperation( + response_poll = await PollingOperation( poll_endpoint=ApiEndpoint( path=f"/proxy/tripo/v2/openapi/task/{task_id}", method=HttpMethod.GET, @@ -80,7 +80,7 @@ def poll_until_finished( ).execute() if response_poll.data.status == TripoTaskStatus.SUCCESS: url = get_model_url_from_response(response_poll) - bytesio = download_url_to_bytesio(url) + bytesio = await download_url_to_bytesio(url) # Save the downloaded model file model_file = f"tripo_model_{task_id}.glb" with open(os.path.join(get_output_directory(), model_file), "wb") as f: @@ -88,6 +88,7 @@ def poll_until_finished( return model_file, task_id raise RuntimeError(f"Failed to generate mesh: {response_poll}") + class TripoTextToModelNode: """ Generates 3D models synchronously based on a text prompt using Tripo's API. 
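The Rodin download_files rewrite further up and the Tripo poll_until_finished above both fetch generated model files; Rodin now streams them to disk in 32 KiB chunks via resp.content.iter_chunked (with asyncio.sleep between retries) instead of buffering a whole .glb in memory. A standalone sketch of that streaming shape; the URL, path and chunk size are placeholders:

import asyncio

import aiohttp


async def download_to_file(url: str, path: str, chunk_size: int = 32 * 1024) -> str:
    # Stream the response body to disk chunk by chunk instead of loading it all at once.
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            resp.raise_for_status()
            with open(path, "wb") as f:
                async for chunk in resp.content.iter_chunked(chunk_size):
                    f.write(chunk)
    return path


if __name__ == "__main__":
    asyncio.run(download_to_file("https://example.com/model.glb", "model.glb"))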
@@ -126,11 +127,11 @@ class TripoTextToModelNode: API_NODE = True OUTPUT_NODE = True - def generate_mesh(self, prompt, negative_prompt=None, model_version=None, style=None, texture=None, pbr=None, image_seed=None, model_seed=None, texture_seed=None, texture_quality=None, face_limit=None, quad=None, **kwargs): + async def generate_mesh(self, prompt, negative_prompt=None, model_version=None, style=None, texture=None, pbr=None, image_seed=None, model_seed=None, texture_seed=None, texture_quality=None, face_limit=None, quad=None, **kwargs): style_enum = None if style == "None" else style if not prompt: raise RuntimeError("Prompt is required") - response = SynchronousOperation( + response = await SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/tripo/v2/openapi/task", method=HttpMethod.POST, @@ -155,7 +156,8 @@ class TripoTextToModelNode: ), auth_kwargs=kwargs, ).execute() - return poll_until_finished(kwargs, response) + return await poll_until_finished(kwargs, response) + class TripoImageToModelNode: """ @@ -195,12 +197,12 @@ class TripoImageToModelNode: API_NODE = True OUTPUT_NODE = True - def generate_mesh(self, image, model_version=None, style=None, texture=None, pbr=None, model_seed=None, orientation=None, texture_alignment=None, texture_seed=None, texture_quality=None, face_limit=None, quad=None, **kwargs): + async def generate_mesh(self, image, model_version=None, style=None, texture=None, pbr=None, model_seed=None, orientation=None, texture_alignment=None, texture_seed=None, texture_quality=None, face_limit=None, quad=None, **kwargs): style_enum = None if style == "None" else style if image is None: raise RuntimeError("Image is required") - tripo_file = upload_image_to_tripo(image, **kwargs) - response = SynchronousOperation( + tripo_file = await upload_image_to_tripo(image, **kwargs) + response = await SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/tripo/v2/openapi/task", method=HttpMethod.POST, @@ -225,7 +227,8 @@ class TripoImageToModelNode: ), auth_kwargs=kwargs, ).execute() - return poll_until_finished(kwargs, response) + return await poll_until_finished(kwargs, response) + class TripoMultiviewToModelNode: """ @@ -267,7 +270,7 @@ class TripoMultiviewToModelNode: API_NODE = True OUTPUT_NODE = True - def generate_mesh(self, image, image_left=None, image_back=None, image_right=None, model_version=None, orientation=None, texture=None, pbr=None, model_seed=None, texture_seed=None, texture_quality=None, texture_alignment=None, face_limit=None, quad=None, **kwargs): + async def generate_mesh(self, image, image_left=None, image_back=None, image_right=None, model_version=None, orientation=None, texture=None, pbr=None, model_seed=None, texture_seed=None, texture_quality=None, texture_alignment=None, face_limit=None, quad=None, **kwargs): if image is None: raise RuntimeError("front image for multiview is required") images = [] @@ -282,11 +285,11 @@ class TripoMultiviewToModelNode: for image_name in ["image", "image_left", "image_back", "image_right"]: image_ = image_dict[image_name] if image_ is not None: - tripo_file = upload_image_to_tripo(image_, **kwargs) + tripo_file = await upload_image_to_tripo(image_, **kwargs) images.append(tripo_file) else: images.append(TripoFileEmptyReference()) - response = SynchronousOperation( + response = await SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/tripo/v2/openapi/task", method=HttpMethod.POST, @@ -309,7 +312,8 @@ class TripoMultiviewToModelNode: ), auth_kwargs=kwargs, ).execute() - return poll_until_finished(kwargs, 
response) + return await poll_until_finished(kwargs, response) + class TripoTextureNode: @classmethod @@ -340,8 +344,8 @@ class TripoTextureNode: OUTPUT_NODE = True AVERAGE_DURATION = 80 - def generate_mesh(self, model_task_id, texture=None, pbr=None, texture_seed=None, texture_quality=None, texture_alignment=None, **kwargs): - response = SynchronousOperation( + async def generate_mesh(self, model_task_id, texture=None, pbr=None, texture_seed=None, texture_quality=None, texture_alignment=None, **kwargs): + response = await SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/tripo/v2/openapi/task", method=HttpMethod.POST, @@ -358,7 +362,7 @@ class TripoTextureNode: ), auth_kwargs=kwargs, ).execute() - return poll_until_finished(kwargs, response) + return await poll_until_finished(kwargs, response) class TripoRefineNode: @@ -387,8 +391,8 @@ class TripoRefineNode: OUTPUT_NODE = True AVERAGE_DURATION = 240 - def generate_mesh(self, model_task_id, **kwargs): - response = SynchronousOperation( + async def generate_mesh(self, model_task_id, **kwargs): + response = await SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/tripo/v2/openapi/task", method=HttpMethod.POST, @@ -400,7 +404,7 @@ class TripoRefineNode: ), auth_kwargs=kwargs, ).execute() - return poll_until_finished(kwargs, response) + return await poll_until_finished(kwargs, response) class TripoRigNode: @@ -425,8 +429,8 @@ class TripoRigNode: OUTPUT_NODE = True AVERAGE_DURATION = 180 - def generate_mesh(self, original_model_task_id, **kwargs): - response = SynchronousOperation( + async def generate_mesh(self, original_model_task_id, **kwargs): + response = await SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/tripo/v2/openapi/task", method=HttpMethod.POST, @@ -440,7 +444,8 @@ class TripoRigNode: ), auth_kwargs=kwargs, ).execute() - return poll_until_finished(kwargs, response) + return await poll_until_finished(kwargs, response) + class TripoRetargetNode: @classmethod @@ -475,8 +480,8 @@ class TripoRetargetNode: OUTPUT_NODE = True AVERAGE_DURATION = 30 - def generate_mesh(self, animation, original_model_task_id, **kwargs): - response = SynchronousOperation( + async def generate_mesh(self, animation, original_model_task_id, **kwargs): + response = await SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/tripo/v2/openapi/task", method=HttpMethod.POST, @@ -491,7 +496,8 @@ class TripoRetargetNode: ), auth_kwargs=kwargs, ).execute() - return poll_until_finished(kwargs, response) + return await poll_until_finished(kwargs, response) + class TripoConversionNode: @classmethod @@ -529,10 +535,10 @@ class TripoConversionNode: OUTPUT_NODE = True AVERAGE_DURATION = 30 - def generate_mesh(self, original_model_task_id, format, quad, face_limit, texture_size, texture_format, **kwargs): + async def generate_mesh(self, original_model_task_id, format, quad, face_limit, texture_size, texture_format, **kwargs): if not original_model_task_id: raise RuntimeError("original_model_task_id is required") - response = SynchronousOperation( + response = await SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/tripo/v2/openapi/task", method=HttpMethod.POST, @@ -549,7 +555,8 @@ class TripoConversionNode: ), auth_kwargs=kwargs, ).execute() - return poll_until_finished(kwargs, response) + return await poll_until_finished(kwargs, response) + NODE_CLASS_MAPPINGS = { "TripoTextToModelNode": TripoTextToModelNode, diff --git a/comfy_api_nodes/nodes_veo2.py b/comfy_api_nodes/nodes_veo2.py index 97bfe20e6..e25dab2f5 100644 --- 
a/comfy_api_nodes/nodes_veo2.py +++ b/comfy_api_nodes/nodes_veo2.py @@ -1,7 +1,7 @@ import io import logging import base64 -import requests +import aiohttp import torch from typing import Optional @@ -152,7 +152,7 @@ class VeoVideoGenerationNode(ComfyNodeABC): DESCRIPTION = "Generates videos from text prompts using Google's Veo 2 API" API_NODE = True - def generate_video( + async def generate_video( self, prompt, aspect_ratio="16:9", @@ -217,7 +217,7 @@ class VeoVideoGenerationNode(ComfyNodeABC): auth_kwargs=kwargs, ) - initial_response = initial_operation.execute() + initial_response = await initial_operation.execute() operation_name = initial_response.name logging.info(f"Veo generation started with operation name: {operation_name}") @@ -256,7 +256,7 @@ class VeoVideoGenerationNode(ComfyNodeABC): ) # Execute the polling operation - poll_response = poll_operation.execute() + poll_response = await poll_operation.execute() # Now check for errors in the final response # Check for error in poll response @@ -281,7 +281,6 @@ class VeoVideoGenerationNode(ComfyNodeABC): raise Exception(error_message) # Extract video data - video_data = None if poll_response.response and hasattr(poll_response.response, 'videos') and poll_response.response.videos and len(poll_response.response.videos) > 0: video = poll_response.response.videos[0] @@ -291,9 +290,9 @@ class VeoVideoGenerationNode(ComfyNodeABC): video_data = base64.b64decode(video.bytesBase64Encoded) elif hasattr(video, 'gcsUri') and video.gcsUri: # Download from URL - video_url = video.gcsUri - video_response = requests.get(video_url) - video_data = video_response.content + async with aiohttp.ClientSession() as session: + async with session.get(video.gcsUri) as video_response: + video_data = await video_response.content.read() else: raise Exception("Video returned but no data or URL was provided") else: From 735bb4bdb186bd4f39b9c924c24b8b39a7ef8b0d Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 8 Aug 2025 01:21:00 -0700 Subject: [PATCH 111/243] Users report gfx1201 is buggy on flux with pytorch attention. (#9244) --- comfy/model_management.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 9e6149d60..dc5b4711d 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -321,9 +321,9 @@ try: if torch_version_numeric >= (2, 7): # works on 2.6 but doesn't actually seem to improve much if any((a in arch) for a in ["gfx90a", "gfx942", "gfx1100", "gfx1101", "gfx1151"]): # TODO: more arches, TODO: gfx950 ENABLE_PYTORCH_ATTENTION = True - if torch_version_numeric >= (2, 8): - if any((a in arch) for a in ["gfx1201"]): - ENABLE_PYTORCH_ATTENTION = True +# if torch_version_numeric >= (2, 8): +# if any((a in arch) for a in ["gfx1201"]): +# ENABLE_PYTORCH_ATTENTION = True if torch_version_numeric >= (2, 7) and rocm_version >= (6, 4): if any((a in arch) for a in ["gfx1201", "gfx942", "gfx950"]): # TODO: more arches SUPPORT_FP8_OPS = True From 5828607ccfef82a82931d8b66f3fd1176e04588f Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 9 Aug 2025 09:49:25 -0700 Subject: [PATCH 112/243] Not sure if AMD actually support fp16 acc but it doesn't crash. 
(#9258) --- comfy/model_management.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index dc5b4711d..c08f759e5 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -340,7 +340,7 @@ if ENABLE_PYTORCH_ATTENTION: PRIORITIZE_FP16 = False # TODO: remove and replace with something that shows exactly which dtype is faster than the other try: - if is_nvidia() and PerformanceFeature.Fp16Accumulation in args.fast: + if (is_nvidia() or is_amd()) and PerformanceFeature.Fp16Accumulation in args.fast: torch.backends.cuda.matmul.allow_fp16_accumulation = True PRIORITIZE_FP16 = True # TODO: limit to cards where it actually boosts performance logging.info("Enabled fp16 accumulation.") From 0552de7c7d6bcdd515da115d6756fd30494c7ff4 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 10 Aug 2025 02:03:47 -0700 Subject: [PATCH 113/243] Bump pytorch cuda and rocm versions in readme instructions. (#9273) --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 119098f5c..e4cff01a9 100644 --- a/README.md +++ b/README.md @@ -203,7 +203,7 @@ Put your VAE in: models/vae ### AMD GPUs (Linux only) AMD users can install rocm and pytorch with pip if you don't have it already installed, this is the command to install the stable version: -```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.3``` +```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.4``` This is the command to install the nightly with ROCm 6.4 which might have some performance improvements: @@ -237,7 +237,7 @@ Additional discussion and help can be found [here](https://github.com/comfyanony Nvidia users should install stable pytorch using this command: -```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu128``` +```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu129``` This is the command to install pytorch nightly instead which might have performance improvements. From 966f3a52061b5e300f36c6de0d07c47d6ad12f76 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 11 Aug 2025 02:53:01 -0700 Subject: [PATCH 114/243] Only show feature flags log when verbose. 
(#9281) --- server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server.py b/server.py index 0553a0dd7..8f9c88ebf 100644 --- a/server.py +++ b/server.py @@ -235,7 +235,7 @@ class PromptServer(): sid, ) - logging.info( + logging.debug( f"Feature flags negotiated for client {sid}: {client_flags}" ) first_message = False From fa340add552497a264071fd7f6c407ff4aa10449 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Mon, 11 Aug 2025 23:48:17 +0300 Subject: [PATCH 115/243] remove creation of non-used asyncio_loop (#9284) --- execution.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/execution.py b/execution.py index 952f0cc5c..1dc35738b 100644 --- a/execution.py +++ b/execution.py @@ -646,8 +646,6 @@ class PromptExecutor: self.add_message("execution_error", mes, broadcast=False) def execute(self, prompt, prompt_id, extra_data={}, execute_outputs=[]): - asyncio_loop = asyncio.new_event_loop() - asyncio.set_event_loop(asyncio_loop) asyncio.run(self.execute_async(prompt, prompt_id, extra_data, execute_outputs)) async def execute_async(self, prompt, prompt_id, extra_data={}, execute_outputs=[]): From 629b17383718e1f46dbba101ea83ec897fbe3082 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Tue, 12 Aug 2025 04:52:12 +0800 Subject: [PATCH 116/243] Update template & embedded docs (#9283) * Update template & embedded docs * Update embedded docs to 0.2.6 --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 2f4692b03..2fb38ef27 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ comfyui-frontend-package==1.24.4 -comfyui-workflow-templates==0.1.52 -comfyui-embedded-docs==0.2.4 +comfyui-workflow-templates==0.1.53 +comfyui-embedded-docs==0.2.6 torch torchsde torchvision From 2208aa616d3ad193cd37ef57076d4f5243cecdd3 Mon Sep 17 00:00:00 2001 From: PsychoLogicAu Date: Tue, 12 Aug 2025 06:56:16 +1000 Subject: [PATCH 117/243] Support SimpleTuner lycoris lora for Qwen-Image (#9280) --- comfy/lora.py | 1 + 1 file changed, 1 insertion(+) diff --git a/comfy/lora.py b/comfy/lora.py index 6686b7229..00358884b 100644 --- a/comfy/lora.py +++ b/comfy/lora.py @@ -301,6 +301,7 @@ def model_lora_keys_unet(model, key_map={}): key_map["{}".format(key_lora)] = k # Support transformer prefix format key_map["transformer.{}".format(key_lora)] = k + key_map["lycoris_{}".format(key_lora.replace(".", "_"))] = k #SimpleTuner lycoris format return key_map From f4231a80b1b904b45ade0def9b37320c4adfe71b Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 12 Aug 2025 00:15:14 +0300 Subject: [PATCH 118/243] fix(Kling Image API Node): do not pass "image_type" when no image (#9271) * fix(Kling Image API Node): do not pass "image_type" when no image * fix(Kling Image API Node): raise client-side error when kling_v1 is used with reference image --- comfy_api_nodes/nodes_kling.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index 9d9eb5628..9d483bb0e 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -1690,7 +1690,11 @@ class KlingImageGenerationNode(KlingImageGenerationBase): ): self.validate_prompt(prompt, negative_prompt) - if image is not None: + if image is None: + image_type = None + elif model_name == KlingImageGenModelName.kling_v1: + raise ValueError(f"The model 
{KlingImageGenModelName.kling_v1.value} does not support reference images.") + else: image = tensor_to_base64_string(image) initial_operation = SynchronousOperation( From 1e3ae1eed8b925430e3b114ea6b7d08ea698e305 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Wed, 13 Aug 2025 05:14:27 +0800 Subject: [PATCH 119/243] Update template to 0.1.58 (#9302) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 2fb38ef27..82af5690b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.24.4 -comfyui-workflow-templates==0.1.53 +comfyui-workflow-templates==0.1.58 comfyui-embedded-docs==0.2.6 torch torchsde From e1d4f36d8df7446ebb1a5f2bf9c708c38a159f22 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 12 Aug 2025 17:13:04 -0700 Subject: [PATCH 120/243] Update test release package workflow with python 3.13 cu129. (#9306) --- .github/workflows/windows_release_package.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/windows_release_package.yml b/.github/workflows/windows_release_package.yml index 3926a65f3..b51746285 100644 --- a/.github/workflows/windows_release_package.yml +++ b/.github/workflows/windows_release_package.yml @@ -7,19 +7,19 @@ on: description: 'cuda version' required: true type: string - default: "128" + default: "129" python_minor: description: 'python minor version' required: true type: string - default: "12" + default: "13" python_patch: description: 'python patch version' required: true type: string - default: "10" + default: "6" # push: # branches: # - master @@ -64,6 +64,8 @@ jobs: ./python.exe get-pip.py ./python.exe -s -m pip install ../cu${{ inputs.cu }}_python_deps/* sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth + + rm ./Lib/site-packages/torch/lib/dnnl.lib #I don't think this is actually used and I need the space cd .. git clone --depth 1 https://github.com/comfyanonymous/taesd From 560d38f34c5bd532f89f2178f01ee819cf145820 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 12 Aug 2025 20:26:33 -0700 Subject: [PATCH 121/243] Wan2.2 fun control support. 
(#9292) --- comfy/ldm/wan/model.py | 19 +++++++++++++ comfy/model_base.py | 10 ++++++- comfy/model_detection.py | 5 ++++ comfy_extras/nodes_wan.py | 58 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 91 insertions(+), 1 deletion(-) diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index 86d0795e9..4e2d99566 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -391,6 +391,7 @@ class WanModel(torch.nn.Module): cross_attn_norm=True, eps=1e-6, flf_pos_embed_token_number=None, + in_dim_ref_conv=None, image_model=None, device=None, dtype=None, @@ -484,6 +485,11 @@ class WanModel(torch.nn.Module): else: self.img_emb = None + if in_dim_ref_conv is not None: + self.ref_conv = operations.Conv2d(in_dim_ref_conv, dim, kernel_size=patch_size[1:], stride=patch_size[1:], device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + else: + self.ref_conv = None + def forward_orig( self, x, @@ -526,6 +532,13 @@ class WanModel(torch.nn.Module): e = e.reshape(t.shape[0], -1, e.shape[-1]) e0 = self.time_projection(e).unflatten(2, (6, self.dim)) + full_ref = None + if self.ref_conv is not None: + full_ref = kwargs.get("reference_latent", None) + if full_ref is not None: + full_ref = self.ref_conv(full_ref).flatten(2).transpose(1, 2) + x = torch.concat((full_ref, x), dim=1) + # context context = self.text_embedding(context) @@ -552,6 +565,9 @@ class WanModel(torch.nn.Module): # head x = self.head(x, e) + if full_ref is not None: + x = x[:, full_ref.shape[1]:] + # unpatchify x = self.unpatchify(x, grid_sizes) return x @@ -570,6 +586,9 @@ class WanModel(torch.nn.Module): x = torch.cat([x, time_dim_concat], dim=2) t_len = ((x.shape[2] + (patch_size[0] // 2)) // patch_size[0]) + if self.ref_conv is not None and "reference_latent" in kwargs: + t_len += 1 + img_ids = torch.zeros((t_len, h_len, w_len, 3), device=x.device, dtype=x.dtype) img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(0, t_len - 1, steps=t_len, device=x.device, dtype=x.dtype).reshape(-1, 1, 1) img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).reshape(1, -1, 1) diff --git a/comfy/model_base.py b/comfy/model_base.py index 8a2d9cbe6..cde61df7c 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1124,7 +1124,11 @@ class WAN21(BaseModel): mask = mask.repeat(1, 4, 1, 1, 1) mask = utils.resize_to_batch_size(mask, noise.shape[0]) - return torch.cat((mask, image), dim=1) + concat_mask_index = kwargs.get("concat_mask_index", 0) + if concat_mask_index != 0: + return torch.cat((image[:, :concat_mask_index], mask, image[:, concat_mask_index:]), dim=1) + else: + return torch.cat((mask, image), dim=1) def extra_conds(self, **kwargs): out = super().extra_conds(**kwargs) @@ -1140,6 +1144,10 @@ class WAN21(BaseModel): if time_dim_concat is not None: out['time_dim_concat'] = comfy.conds.CONDRegular(self.process_latent_in(time_dim_concat)) + reference_latents = kwargs.get("reference_latents", None) + if reference_latents is not None: + out['reference_latent'] = comfy.conds.CONDRegular(self.process_latent_in(reference_latents[-1])[:, :, 0]) + return out diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 8b57ebd2f..8acc51e20 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -373,6 +373,11 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): flf_weight = state_dict.get('{}img_emb.emb_pos'.format(key_prefix)) if flf_weight is not None: dit_config["flf_pos_embed_token_number"] = 
flf_weight.shape[1] + + ref_conv_weight = state_dict.get('{}ref_conv.weight'.format(key_prefix)) + if ref_conv_weight is not None: + dit_config["in_dim_ref_conv"] = ref_conv_weight.shape[1] + return dit_config if '{}latent_in.weight'.format(key_prefix) in state_dict_keys: # Hunyuan 3D diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 0067d054d..f80c83ba6 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -103,6 +103,63 @@ class WanFunControlToVideo: out_latent["samples"] = latent return (positive, negative, out_latent) +class Wan22FunControlToVideo: + @classmethod + def INPUT_TYPES(s): + return {"required": {"positive": ("CONDITIONING", ), + "negative": ("CONDITIONING", ), + "vae": ("VAE", ), + "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), + "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), + "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), + "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), + }, + "optional": {"ref_image": ("IMAGE", ), + "control_video": ("IMAGE", ), + # "start_image": ("IMAGE", ), + }} + + RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") + RETURN_NAMES = ("positive", "negative", "latent") + FUNCTION = "encode" + + CATEGORY = "conditioning/video_models" + + def encode(self, positive, negative, vae, width, height, length, batch_size, ref_image=None, start_image=None, control_video=None): + latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) + concat_latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) + concat_latent = comfy.latent_formats.Wan21().process_out(concat_latent) + concat_latent = concat_latent.repeat(1, 2, 1, 1, 1) + mask = torch.ones((1, 1, latent.shape[2] * 4, latent.shape[-2], latent.shape[-1])) + + if start_image is not None: + start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + concat_latent_image = vae.encode(start_image[:, :, :, :3]) + concat_latent[:,16:,:concat_latent_image.shape[2]] = concat_latent_image[:,:,:concat_latent.shape[2]] + mask[:, :, :start_image.shape[0] + 3] = 0.0 + + ref_latent = None + if ref_image is not None: + ref_image = comfy.utils.common_upscale(ref_image[:1].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + ref_latent = vae.encode(ref_image[:, :, :, :3]) + + if control_video is not None: + control_video = comfy.utils.common_upscale(control_video[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + concat_latent_image = vae.encode(control_video[:, :, :, :3]) + concat_latent[:,:16,:concat_latent_image.shape[2]] = concat_latent_image[:,:,:concat_latent.shape[2]] + + mask = mask.view(1, mask.shape[2] // 4, 4, mask.shape[3], mask.shape[4]).transpose(1, 2) + positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent, "concat_mask": mask, "concat_mask_index": 16}) + negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent, "concat_mask": mask, "concat_mask_index": 16}) + + if ref_latent is not None: + positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [ref_latent]}, append=True) + negative = node_helpers.conditioning_set_values(negative, 
{"reference_latents": [ref_latent]}, append=True) + + out_latent = {} + out_latent["samples"] = latent + return (positive, negative, out_latent) + class WanFirstLastFrameToVideo: @classmethod def INPUT_TYPES(s): @@ -733,6 +790,7 @@ NODE_CLASS_MAPPINGS = { "WanTrackToVideo": WanTrackToVideo, "WanImageToVideo": WanImageToVideo, "WanFunControlToVideo": WanFunControlToVideo, + "Wan22FunControlToVideo": Wan22FunControlToVideo, "WanFunInpaintToVideo": WanFunInpaintToVideo, "WanFirstLastFrameToVideo": WanFirstLastFrameToVideo, "WanVaceToVideo": WanVaceToVideo, From 898d88e10e45f38500ca6044280bab4ca2f2d273 Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Tue, 12 Aug 2025 20:34:58 -0700 Subject: [PATCH 122/243] Make torchaudio exception catching less specific (#9309) --- comfy_api/latest/_ui.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_api/latest/_ui.py b/comfy_api/latest/_ui.py index 61597038f..26a55615f 100644 --- a/comfy_api/latest/_ui.py +++ b/comfy_api/latest/_ui.py @@ -12,7 +12,7 @@ import torch try: import torchaudio TORCH_AUDIO_AVAILABLE = True -except ImportError: +except: TORCH_AUDIO_AVAILABLE = False from PIL import Image as PILImage from PIL.PngImagePlugin import PngInfo From 3294782d19c3af0c6166aafe0465fe6b59571d17 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Wed, 13 Aug 2025 14:50:50 +0800 Subject: [PATCH 123/243] Update template to 0.1.59 (#9313) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 82af5690b..bfb31a73f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.24.4 -comfyui-workflow-templates==0.1.58 +comfyui-workflow-templates==0.1.59 comfyui-embedded-docs==0.2.6 torch torchsde From 5ca8e2fac3b6826261c5991b0663b69eff60b3a1 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 13 Aug 2025 00:01:12 -0700 Subject: [PATCH 124/243] Update release workflow to python3.13 pytorch cu129 (#9315) * Try to reduce size of portable even more. * Update stable release workflow to python 3.13 cu129 * Update dependencies workflow to python3.13 cu129 --- .github/workflows/stable-release.yml | 15 ++++++++++----- .../workflows/windows_release_dependencies.yml | 6 +++--- .github/workflows/windows_release_package.yml | 2 ++ 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/.github/workflows/stable-release.yml b/.github/workflows/stable-release.yml index 61105abe4..a5a1ed2d0 100644 --- a/.github/workflows/stable-release.yml +++ b/.github/workflows/stable-release.yml @@ -12,17 +12,17 @@ on: description: 'CUDA version' required: true type: string - default: "128" + default: "129" python_minor: description: 'Python minor version' required: true type: string - default: "12" + default: "13" python_patch: description: 'Python patch version' required: true type: string - default: "10" + default: "6" jobs: @@ -66,8 +66,13 @@ jobs: curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py ./python.exe get-pip.py ./python.exe -s -m pip install ../cu${{ inputs.cu }}_python_deps/* - sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth - cd .. + sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth + + rm ./Lib/site-packages/torch/lib/dnnl.lib #I don't think this is actually used and I need the space + rm ./Lib/site-packages/torch/lib/libprotoc.lib + rm ./Lib/site-packages/torch/lib/libprotobuf.lib + + cd .. 
git clone --depth 1 https://github.com/comfyanonymous/taesd cp taesd/*.safetensors ./ComfyUI_copy/models/vae_approx/ diff --git a/.github/workflows/windows_release_dependencies.yml b/.github/workflows/windows_release_dependencies.yml index dfdb96d50..7761cc1ed 100644 --- a/.github/workflows/windows_release_dependencies.yml +++ b/.github/workflows/windows_release_dependencies.yml @@ -17,19 +17,19 @@ on: description: 'cuda version' required: true type: string - default: "128" + default: "129" python_minor: description: 'python minor version' required: true type: string - default: "12" + default: "13" python_patch: description: 'python patch version' required: true type: string - default: "10" + default: "6" # push: # branches: # - master diff --git a/.github/workflows/windows_release_package.yml b/.github/workflows/windows_release_package.yml index b51746285..3334e6839 100644 --- a/.github/workflows/windows_release_package.yml +++ b/.github/workflows/windows_release_package.yml @@ -66,6 +66,8 @@ jobs: sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth rm ./Lib/site-packages/torch/lib/dnnl.lib #I don't think this is actually used and I need the space + rm ./Lib/site-packages/torch/lib/libprotoc.lib + rm ./Lib/site-packages/torch/lib/libprotobuf.lib cd .. git clone --depth 1 https://github.com/comfyanonymous/taesd From e400f26c8fc9867248394616a4b58ecc4c53fbfd Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 13 Aug 2025 00:44:54 -0700 Subject: [PATCH 125/243] Downgrade frontend for release. (#9316) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index bfb31a73f..56ed85e01 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.24.4 +comfyui-frontend-package==1.23.4 comfyui-workflow-templates==0.1.59 comfyui-embedded-docs==0.2.6 torch From d5c1954d5cd4a789bbf84d2b75a955a5a3f93de8 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Wed, 13 Aug 2025 03:46:38 -0400 Subject: [PATCH 126/243] ComfyUI version 0.3.50 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 5e2d09c81..29ec07ca6 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.49" +__version__ = "0.3.50" diff --git a/pyproject.toml b/pyproject.toml index 3c530ba85..659b5730a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.49" +version = "0.3.50" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From 615eb52049df98cebdd67bc672b66dc059171d7c Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 13 Aug 2025 00:48:06 -0700 Subject: [PATCH 127/243] Put back frontend version. 
(#9317) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 56ed85e01..bfb31a73f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.23.4 +comfyui-frontend-package==1.24.4 comfyui-workflow-templates==0.1.59 comfyui-embedded-docs==0.2.6 torch From afa0a45206832b0e64e38454b7841d1da7ca56e4 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 13 Aug 2025 11:42:08 -0700 Subject: [PATCH 128/243] Reduce portable size again. (#9323) * compress more * test * not needed --- .github/workflows/stable-release.yml | 2 +- .github/workflows/windows_release_package.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/stable-release.yml b/.github/workflows/stable-release.yml index a5a1ed2d0..2bc8e5905 100644 --- a/.github/workflows/stable-release.yml +++ b/.github/workflows/stable-release.yml @@ -90,7 +90,7 @@ jobs: cd .. - "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=9 -mfb=128 -md=512m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable + "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=9 -mfb=128 -md=768m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable mv ComfyUI_windows_portable.7z ComfyUI/ComfyUI_windows_portable_nvidia.7z cd ComfyUI_windows_portable diff --git a/.github/workflows/windows_release_package.yml b/.github/workflows/windows_release_package.yml index 3334e6839..46375698e 100644 --- a/.github/workflows/windows_release_package.yml +++ b/.github/workflows/windows_release_package.yml @@ -86,7 +86,7 @@ jobs: cd .. - "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=9 -mfb=128 -md=512m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable + "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=9 -mfb=128 -md=768m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable mv ComfyUI_windows_portable.7z ComfyUI/new_ComfyUI_windows_portable_nvidia_cu${{ inputs.cu }}_or_cpu.7z cd ComfyUI_windows_portable From 3da5a07510794c37d437cbea1d94065bb0aa8ebc Mon Sep 17 00:00:00 2001 From: contentis Date: Wed, 13 Aug 2025 20:53:27 +0200 Subject: [PATCH 129/243] SDPA backend priority (#9299) --- comfy/ldm/hunyuan3d/vae.py | 2 +- comfy/ldm/modules/attention.py | 4 ++-- comfy/ldm/modules/diffusionmodules/model.py | 2 +- comfy/ops.py | 13 +++++++++++++ 4 files changed, 17 insertions(+), 4 deletions(-) diff --git a/comfy/ldm/hunyuan3d/vae.py b/comfy/ldm/hunyuan3d/vae.py index 5eb2c6548..bea6090a2 100644 --- a/comfy/ldm/hunyuan3d/vae.py +++ b/comfy/ldm/hunyuan3d/vae.py @@ -178,7 +178,7 @@ class FourierEmbedder(nn.Module): class CrossAttentionProcessor: def __call__(self, attn, q, k, v): - out = F.scaled_dot_product_attention(q, k, v) + out = ops.scaled_dot_product_attention(q, k, v) return out diff --git a/comfy/ldm/modules/attention.py b/comfy/ldm/modules/attention.py index 35d2270ee..19c3c7af1 100644 --- a/comfy/ldm/modules/attention.py +++ b/comfy/ldm/modules/attention.py @@ -448,7 +448,7 @@ def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_resha mask = mask.unsqueeze(1) if SDP_BATCH_LIMIT >= b: - out = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False) + out = ops.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False) if not skip_output_reshape: out = ( out.transpose(1, 2).reshape(b, -1, heads * dim_head) @@ -461,7 
+461,7 @@ def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_resha if mask.shape[0] > 1: m = mask[i : i + SDP_BATCH_LIMIT] - out[i : i + SDP_BATCH_LIMIT] = torch.nn.functional.scaled_dot_product_attention( + out[i : i + SDP_BATCH_LIMIT] = ops.scaled_dot_product_attention( q[i : i + SDP_BATCH_LIMIT], k[i : i + SDP_BATCH_LIMIT], v[i : i + SDP_BATCH_LIMIT], diff --git a/comfy/ldm/modules/diffusionmodules/model.py b/comfy/ldm/modules/diffusionmodules/model.py index 5c0373b74..79160412f 100644 --- a/comfy/ldm/modules/diffusionmodules/model.py +++ b/comfy/ldm/modules/diffusionmodules/model.py @@ -285,7 +285,7 @@ def pytorch_attention(q, k, v): ) try: - out = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False) + out = ops.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False) out = out.transpose(2, 3).reshape(orig_shape) except model_management.OOM_EXCEPTION: logging.warning("scaled_dot_product_attention OOMed: switched to slice attention") diff --git a/comfy/ops.py b/comfy/ops.py index 2cc9bbc27..8b7b662b6 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -23,9 +23,18 @@ from comfy.cli_args import args, PerformanceFeature import comfy.float import comfy.rmsnorm import contextlib +from torch.nn.attention import SDPBackend, sdpa_kernel cast_to = comfy.model_management.cast_to #TODO: remove once no more references +SDPA_BACKEND_PRIORITY = [ + SDPBackend.FLASH_ATTENTION, + SDPBackend.EFFICIENT_ATTENTION, + SDPBackend.MATH, +] +if torch.cuda.is_available(): + SDPA_BACKEND_PRIORITY.insert(0, SDPBackend.CUDNN_ATTENTION) + def cast_to_input(weight, input, non_blocking=False, copy=True): return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy) @@ -249,6 +258,10 @@ class disable_weight_init: else: raise ValueError(f"unsupported dimensions: {dims}") + @staticmethod + @sdpa_kernel(backends=SDPA_BACKEND_PRIORITY, set_priority=True) + def scaled_dot_product_attention(q, k, v, *args, **kwargs): + return torch.nn.functional.scaled_dot_product_attention(q, k, v, *args, **kwargs) class manual_cast(disable_weight_init): class Linear(disable_weight_init.Linear): From 9df8792d4b894a8ea8034414ef63f70deee4b1af Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 13 Aug 2025 12:12:41 -0700 Subject: [PATCH 130/243] Make last PR not crash comfy on old pytorch. 
(#9324) --- comfy/ldm/hunyuan3d/vae.py | 2 +- comfy/ldm/modules/attention.py | 4 +-- comfy/ldm/modules/diffusionmodules/model.py | 2 +- comfy/ops.py | 36 +++++++++++++-------- 4 files changed, 27 insertions(+), 17 deletions(-) diff --git a/comfy/ldm/hunyuan3d/vae.py b/comfy/ldm/hunyuan3d/vae.py index bea6090a2..6e8cbf1d9 100644 --- a/comfy/ldm/hunyuan3d/vae.py +++ b/comfy/ldm/hunyuan3d/vae.py @@ -178,7 +178,7 @@ class FourierEmbedder(nn.Module): class CrossAttentionProcessor: def __call__(self, attn, q, k, v): - out = ops.scaled_dot_product_attention(q, k, v) + out = comfy.ops.scaled_dot_product_attention(q, k, v) return out diff --git a/comfy/ldm/modules/attention.py b/comfy/ldm/modules/attention.py index 19c3c7af1..043df28df 100644 --- a/comfy/ldm/modules/attention.py +++ b/comfy/ldm/modules/attention.py @@ -448,7 +448,7 @@ def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_resha mask = mask.unsqueeze(1) if SDP_BATCH_LIMIT >= b: - out = ops.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False) + out = comfy.ops.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False) if not skip_output_reshape: out = ( out.transpose(1, 2).reshape(b, -1, heads * dim_head) @@ -461,7 +461,7 @@ def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_resha if mask.shape[0] > 1: m = mask[i : i + SDP_BATCH_LIMIT] - out[i : i + SDP_BATCH_LIMIT] = ops.scaled_dot_product_attention( + out[i : i + SDP_BATCH_LIMIT] = comfy.ops.scaled_dot_product_attention( q[i : i + SDP_BATCH_LIMIT], k[i : i + SDP_BATCH_LIMIT], v[i : i + SDP_BATCH_LIMIT], diff --git a/comfy/ldm/modules/diffusionmodules/model.py b/comfy/ldm/modules/diffusionmodules/model.py index 79160412f..1fd12b35a 100644 --- a/comfy/ldm/modules/diffusionmodules/model.py +++ b/comfy/ldm/modules/diffusionmodules/model.py @@ -285,7 +285,7 @@ def pytorch_attention(q, k, v): ) try: - out = ops.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False) + out = comfy.ops.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False) out = out.transpose(2, 3).reshape(orig_shape) except model_management.OOM_EXCEPTION: logging.warning("scaled_dot_product_attention OOMed: switched to slice attention") diff --git a/comfy/ops.py b/comfy/ops.py index 8b7b662b6..be312d714 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -23,18 +23,32 @@ from comfy.cli_args import args, PerformanceFeature import comfy.float import comfy.rmsnorm import contextlib -from torch.nn.attention import SDPBackend, sdpa_kernel + + +def scaled_dot_product_attention(q, k, v, *args, **kwargs): + return torch.nn.functional.scaled_dot_product_attention(q, k, v, *args, **kwargs) + + +try: + if torch.cuda.is_available(): + from torch.nn.attention import SDPBackend, sdpa_kernel + + SDPA_BACKEND_PRIORITY = [ + SDPBackend.FLASH_ATTENTION, + SDPBackend.EFFICIENT_ATTENTION, + SDPBackend.MATH, + ] + + SDPA_BACKEND_PRIORITY.insert(0, SDPBackend.CUDNN_ATTENTION) + + @sdpa_kernel(backends=SDPA_BACKEND_PRIORITY, set_priority=True) + def scaled_dot_product_attention(q, k, v, *args, **kwargs): + return torch.nn.functional.scaled_dot_product_attention(q, k, v, *args, **kwargs) +except (ModuleNotFoundError, TypeError): + logging.warning("Could not set sdpa backend priority.") cast_to = comfy.model_management.cast_to #TODO: remove once no more references -SDPA_BACKEND_PRIORITY = [ - SDPBackend.FLASH_ATTENTION, - SDPBackend.EFFICIENT_ATTENTION, - SDPBackend.MATH, -] -if 
torch.cuda.is_available(): - SDPA_BACKEND_PRIORITY.insert(0, SDPBackend.CUDNN_ATTENTION) - def cast_to_input(weight, input, non_blocking=False, copy=True): return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy) @@ -258,10 +272,6 @@ class disable_weight_init: else: raise ValueError(f"unsupported dimensions: {dims}") - @staticmethod - @sdpa_kernel(backends=SDPA_BACKEND_PRIORITY, set_priority=True) - def scaled_dot_product_attention(q, k, v, *args, **kwargs): - return torch.nn.functional.scaled_dot_product_attention(q, k, v, *args, **kwargs) class manual_cast(disable_weight_init): class Linear(disable_weight_init.Linear): From c991a5da658667cf29f2916bef096fa7b18afd47 Mon Sep 17 00:00:00 2001 From: Simon Lui <502929+simonlui@users.noreply.github.com> Date: Wed, 13 Aug 2025 16:13:35 -0700 Subject: [PATCH 131/243] Fix XPU iGPU regressions (#9322) * Change bf16 check and switch non-blocking to off default with option to force to regain speed on certain classes of iGPUs and refactor xpu check. * Turn non_blocking off by default for xpu. * Update README.md for Intel GPUs. --- README.md | 28 ++++++++++------------------ comfy/cli_args.py | 2 ++ comfy/model_management.py | 21 +++++++++++++-------- 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index e4cff01a9..fa99a8cbe 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ ComfyUI lets you design and execute advanced stable diffusion pipelines using a ## Get Started #### [Desktop Application](https://www.comfy.org/download) -- The easiest way to get started. +- The easiest way to get started. - Available on Windows & macOS. #### [Windows Portable Package](#installing) @@ -211,27 +211,19 @@ This is the command to install the nightly with ROCm 6.4 which might have some p ### Intel GPUs (Windows and Linux) -(Option 1) Intel Arc GPU users can install native PyTorch with torch.xpu support using pip (currently available in PyTorch nightly builds). More information can be found [here](https://pytorch.org/docs/main/notes/get_start_xpu.html) - -1. To install PyTorch nightly, use the following command: +(Option 1) Intel Arc GPU users can install native PyTorch with torch.xpu support using pip. More information can be found [here](https://pytorch.org/docs/main/notes/get_start_xpu.html) + +1. To install PyTorch xpu, use the following command: + +```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu``` + +This is the command to install the Pytorch xpu nightly which might have some performance improvements: ```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu``` -2. Launch ComfyUI by running `python main.py` - - (Option 2) Alternatively, Intel GPUs supported by Intel Extension for PyTorch (IPEX) can leverage IPEX for improved performance. -1. For Intel® Arc™ A-Series Graphics utilizing IPEX, create a conda environment and use the commands below: - -``` -conda install libuv -pip install torch==2.3.1.post0+cxx11.abi torchvision==0.18.1.post0+cxx11.abi torchaudio==2.3.1.post0+cxx11.abi intel-extension-for-pytorch==2.3.110.post0+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ -``` - -For other supported Intel GPUs with IPEX, visit [Installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu) for more information. 
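A quick way to confirm that the xpu build described above is actually usable is a probe like the following minimal sketch. It is illustrative only, uses public torch.xpu calls, and mirrors the availability check ComfyUI performs in comfy/model_management.py later in this patch.

```
import torch

# Minimal sketch (not part of the patch): verify an Intel XPU PyTorch build is usable.
if hasattr(torch, "xpu") and torch.xpu.is_available():
    print("XPU devices found:", torch.xpu.device_count())
    x = torch.randn(4, 4, device="xpu")
    print("matmul on", x.device, "->", float((x @ x).sum()))
else:
    print("No usable XPU device; ComfyUI falls back to another backend or the CPU.")
```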
- -Additional discussion and help can be found [here](https://github.com/comfyanonymous/ComfyUI/discussions/476). +1. visit [Installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu) for more information. ### NVIDIA @@ -352,7 +344,7 @@ Generate a self-signed certificate (not appropriate for shared/production use) a Use `--tls-keyfile key.pem --tls-certfile cert.pem` to enable TLS/SSL, the app will now be accessible with `https://...` instead of `http://...`. -> Note: Windows users can use [alexisrolland/docker-openssl](https://github.com/alexisrolland/docker-openssl) or one of the [3rd party binary distributions](https://wiki.openssl.org/index.php/Binaries) to run the command example above. +> Note: Windows users can use [alexisrolland/docker-openssl](https://github.com/alexisrolland/docker-openssl) or one of the [3rd party binary distributions](https://wiki.openssl.org/index.php/Binaries) to run the command example above.
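Once the server is started with those flags, any HTTPS client that trusts cert.pem can reach it. A minimal sketch, assuming the certificate's CN matches the hostname and the default port 8188:

```
import ssl, urllib.request

# Minimal sketch (not part of the patch): query a TLS-enabled ComfyUI instance,
# trusting the self-signed cert.pem generated with the openssl command above.
ctx = ssl.create_default_context(cafile="cert.pem")
with urllib.request.urlopen("https://localhost:8188/system_stats", context=ctx) as resp:
    print(resp.status, resp.read()[:120])
```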
If you use a container, note that the volume mount `-v` can be a relative path so `... -v ".\:/openssl-certs" ...` would create the key & cert files in the current directory of your command prompt or powershell terminal. ## Support and dev channel diff --git a/comfy/cli_args.py b/comfy/cli_args.py index 0d760d524..de3e85c08 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -132,6 +132,8 @@ parser.add_argument("--reserve-vram", type=float, default=None, help="Set the am parser.add_argument("--async-offload", action="store_true", help="Use async weight offloading.") +parser.add_argument("--force-non-blocking", action="store_true", help="Force ComfyUI to use non-blocking operations for all applicable tensors. This may improve performance on some non-Nvidia systems but can cause issues with some workflows.") + parser.add_argument("--default-hashing-function", type=str, choices=['md5', 'sha1', 'sha256', 'sha512'], default='sha256', help="Allows you to choose the hash function to use for duplicate filename / contents comparison. Default is sha256.") parser.add_argument("--disable-smart-memory", action="store_true", help="Force ComfyUI to agressively offload to regular ram instead of keeping models in vram when it can.") diff --git a/comfy/model_management.py b/comfy/model_management.py index c08f759e5..2a9f18068 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -78,7 +78,6 @@ try: torch_version = torch.version.__version__ temp = torch_version.split(".") torch_version_numeric = (int(temp[0]), int(temp[1])) - xpu_available = (torch_version_numeric[0] < 2 or (torch_version_numeric[0] == 2 and torch_version_numeric[1] <= 4)) and torch.xpu.is_available() except: pass @@ -102,10 +101,14 @@ if args.directml is not None: try: import intel_extension_for_pytorch as ipex # noqa: F401 - _ = torch.xpu.device_count() - xpu_available = xpu_available or torch.xpu.is_available() except: - xpu_available = xpu_available or (hasattr(torch, "xpu") and torch.xpu.is_available()) + pass + +try: + _ = torch.xpu.device_count() + xpu_available = torch.xpu.is_available() +except: + xpu_available = False try: if torch.backends.mps.is_available(): @@ -946,10 +949,12 @@ def pick_weight_dtype(dtype, fallback_dtype, device=None): return dtype def device_supports_non_blocking(device): + if args.force_non_blocking: + return True if is_device_mps(device): return False #pytorch bug? 
mps doesn't support non blocking - if is_intel_xpu(): - return True + if is_intel_xpu(): #xpu does support non blocking but it is slower on iGPUs for some reason so disable by default until situation changes + return False if args.deterministic: #TODO: figure out why deterministic breaks non blocking from gpu to cpu (previews) return False if directml_enabled: @@ -1282,10 +1287,10 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma return False if is_intel_xpu(): - if torch_version_numeric < (2, 6): + if torch_version_numeric < (2, 3): return True else: - return torch.xpu.get_device_capability(device)['has_bfloat16_conversions'] + return torch.xpu.is_bf16_supported() if is_ascend_npu(): return True From e4f7ea105f4b3034593f316560d952b80453e344 Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Wed, 13 Aug 2025 18:33:05 -0700 Subject: [PATCH 132/243] Added context window support to core sampling code (#9238) * Added initial support for basic context windows - in progress * Add prepare_sampling wrapper for context window to more accurately estimate latent memory requirements, fixed merging wrappers/callbacks dicts in prepare_model_patcher * Made context windows compatible with different dimensions; works for WAN, but results are bad * Fix comfy.patcher_extension.merge_nested_dicts calls in prepare_model_patcher in sampler_helpers.py * Considering adding some callbacks to context window code to allow extensions of behavior without the need to rewrite code * Made dim slicing cleaner * Add Wan Context WIndows node for testing * Made context schedule and fuse method functions be stored on the handler instead of needing to be registered in core code to be found * Moved some code around between node_context_windows.py and context_windows.py * Change manual context window nodes names/ids * Added callbacks to IndexListContexHandler * Adjusted default values for context_length and context_overlap, made schema.inputs definition for WAN Context Windows less annoying * Make get_resized_cond more robust for various dim sizes * Fix typo * Another small fix --- comfy/context_windows.py | 537 ++++++++++++++++++++++++++ comfy/sampler_helpers.py | 6 +- comfy/samplers.py | 11 +- comfy_extras/nodes_context_windows.py | 89 +++++ nodes.py | 1 + 5 files changed, 639 insertions(+), 5 deletions(-) create mode 100644 comfy/context_windows.py create mode 100644 comfy_extras/nodes_context_windows.py diff --git a/comfy/context_windows.py b/comfy/context_windows.py new file mode 100644 index 000000000..928b111df --- /dev/null +++ b/comfy/context_windows.py @@ -0,0 +1,537 @@ +from __future__ import annotations +from typing import TYPE_CHECKING, Callable +import torch +import numpy as np +import collections +from dataclasses import dataclass +from abc import ABC, abstractmethod +import logging +import comfy.model_management +import comfy.patcher_extension +if TYPE_CHECKING: + from comfy.model_base import BaseModel + from comfy.model_patcher import ModelPatcher + from comfy.controlnet import ControlBase + + +class ContextWindowABC(ABC): + def __init__(self): + ... + + @abstractmethod + def get_tensor(self, full: torch.Tensor) -> torch.Tensor: + """ + Get torch.Tensor applicable to current window. + """ + raise NotImplementedError("Not implemented.") + + @abstractmethod + def add_window(self, full: torch.Tensor, to_add: torch.Tensor) -> torch.Tensor: + """ + Apply torch.Tensor of window to the full tensor, in place. Returns reference to updated full tensor, not a copy. 
+ """ + raise NotImplementedError("Not implemented.") + +class ContextHandlerABC(ABC): + def __init__(self): + ... + + @abstractmethod + def should_use_context(self, model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]) -> bool: + raise NotImplementedError("Not implemented.") + + @abstractmethod + def get_resized_cond(self, cond_in: list[dict], x_in: torch.Tensor, window: ContextWindowABC, device=None) -> list: + raise NotImplementedError("Not implemented.") + + @abstractmethod + def execute(self, calc_cond_batch: Callable, model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]): + raise NotImplementedError("Not implemented.") + + + +class IndexListContextWindow(ContextWindowABC): + def __init__(self, index_list: list[int], dim: int=0): + self.index_list = index_list + self.context_length = len(index_list) + self.dim = dim + + def get_tensor(self, full: torch.Tensor, device=None, dim=None) -> torch.Tensor: + if dim is None: + dim = self.dim + if dim == 0 and full.shape[dim] == 1: + return full + idx = [slice(None)] * dim + [self.index_list] + return full[idx].to(device) + + def add_window(self, full: torch.Tensor, to_add: torch.Tensor, dim=None) -> torch.Tensor: + if dim is None: + dim = self.dim + idx = [slice(None)] * dim + [self.index_list] + full[idx] += to_add + return full + + +class IndexListCallbacks: + EVALUATE_CONTEXT_WINDOWS = "evaluate_context_windows" + COMBINE_CONTEXT_WINDOW_RESULTS = "combine_context_window_results" + EXECUTE_START = "execute_start" + EXECUTE_CLEANUP = "execute_cleanup" + + def init_callbacks(self): + return {} + + +@dataclass +class ContextSchedule: + name: str + func: Callable + +@dataclass +class ContextFuseMethod: + name: str + func: Callable + +ContextResults = collections.namedtuple("ContextResults", ['window_idx', 'sub_conds_out', 'sub_conds', 'window']) +class IndexListContextHandler(ContextHandlerABC): + def __init__(self, context_schedule: ContextSchedule, fuse_method: ContextFuseMethod, context_length: int=1, context_overlap: int=0, context_stride: int=1, closed_loop=False, dim=0): + self.context_schedule = context_schedule + self.fuse_method = fuse_method + self.context_length = context_length + self.context_overlap = context_overlap + self.context_stride = context_stride + self.closed_loop = closed_loop + self.dim = dim + self._step = 0 + + self.callbacks = {} + + def should_use_context(self, model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]) -> bool: + # for now, assume first dim is batch - should have stored on BaseModel in actual implementation + if x_in.size(self.dim) > self.context_length: + logging.info(f"Using context windows {self.context_length} for {x_in.size(self.dim)} frames.") + return True + return False + + def prepare_control_objects(self, control: ControlBase, device=None) -> ControlBase: + if control.previous_controlnet is not None: + self.prepare_control_objects(control.previous_controlnet, device) + return control + + def get_resized_cond(self, cond_in: list[dict], x_in: torch.Tensor, window: IndexListContextWindow, device=None) -> list: + if cond_in is None: + return None + # reuse or resize cond items to match context requirements + resized_cond = [] + # cond object is a list containing a dict - outer list is irrelevant, so just loop through it + for actual_cond in cond_in: + resized_actual_cond = actual_cond.copy() + # now we are in the inner dict 
- "pooled_output" is a tensor, "control" is a ControlBase object, "model_conds" is dictionary + for key in actual_cond: + try: + cond_item = actual_cond[key] + if isinstance(cond_item, torch.Tensor): + # check that tensor is the expected length - x.size(0) + if self.dim < cond_item.ndim and cond_item.size(self.dim) == x_in.size(self.dim): + # if so, it's subsetting time - tell controls the expected indeces so they can handle them + actual_cond_item = window.get_tensor(cond_item) + resized_actual_cond[key] = actual_cond_item.to(device) + else: + resized_actual_cond[key] = cond_item.to(device) + # look for control + elif key == "control": + resized_actual_cond[key] = self.prepare_control_objects(cond_item, device) + elif isinstance(cond_item, dict): + new_cond_item = cond_item.copy() + # when in dictionary, look for tensors and CONDCrossAttn [comfy/conds.py] (has cond attr that is a tensor) + for cond_key, cond_value in new_cond_item.items(): + if isinstance(cond_value, torch.Tensor): + if cond_value.ndim < self.dim and cond_value.size(0) == x_in.size(self.dim): + new_cond_item[cond_key] = window.get_tensor(cond_value, device) + # if has cond that is a Tensor, check if needs to be subset + elif hasattr(cond_value, "cond") and isinstance(cond_value.cond, torch.Tensor): + if cond_value.cond.ndim < self.dim and cond_value.cond.size(0) == x_in.size(self.dim): + new_cond_item[cond_key] = cond_value._copy_with(window.get_tensor(cond_value.cond, device)) + elif cond_key == "num_video_frames": # for SVD + new_cond_item[cond_key] = cond_value._copy_with(cond_value.cond) + new_cond_item[cond_key].cond = window.context_length + resized_actual_cond[key] = new_cond_item + else: + resized_actual_cond[key] = cond_item + finally: + del cond_item # just in case to prevent VRAM issues + resized_cond.append(resized_actual_cond) + return resized_cond + + def set_step(self, timestep: torch.Tensor, model_options: dict[str]): + indexes = torch.where(model_options["transformer_options"]["sample_sigmas"] == timestep[0]) + self._step = int(indexes[0]) + + def get_context_windows(self, model: BaseModel, x_in: torch.Tensor, model_options: dict[str]) -> list[IndexListContextWindow]: + full_length = x_in.size(self.dim) # TODO: choose dim based on model + context_windows = self.context_schedule.func(full_length, self, model_options) + context_windows = [IndexListContextWindow(window, dim=self.dim) for window in context_windows] + return context_windows + + def execute(self, calc_cond_batch: Callable, model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]): + self.set_step(timestep, model_options) + context_windows = self.get_context_windows(model, x_in, model_options) + enumerated_context_windows = list(enumerate(context_windows)) + + conds_final = [torch.zeros_like(x_in) for _ in conds] + if self.fuse_method.name == ContextFuseMethods.RELATIVE: + counts_final = [torch.ones(get_shape_for_dim(x_in, self.dim), device=x_in.device) for _ in conds] + else: + counts_final = [torch.zeros(get_shape_for_dim(x_in, self.dim), device=x_in.device) for _ in conds] + biases_final = [([0.0] * x_in.shape[self.dim]) for _ in conds] + + for callback in comfy.patcher_extension.get_all_callbacks(IndexListCallbacks.EXECUTE_START, self.callbacks): + callback(self, model, x_in, conds, timestep, model_options) + + for enum_window in enumerated_context_windows: + results = self.evaluate_context_windows(calc_cond_batch, model, x_in, conds, timestep, [enum_window], model_options) + for result 
in results: + self.combine_context_window_results(x_in, result.sub_conds_out, result.sub_conds, result.window, result.window_idx, len(enumerated_context_windows), timestep, + conds_final, counts_final, biases_final) + try: + # finalize conds + if self.fuse_method.name == ContextFuseMethods.RELATIVE: + # relative is already normalized, so return as is + del counts_final + return conds_final + else: + # normalize conds via division by context usage counts + for i in range(len(conds_final)): + conds_final[i] /= counts_final[i] + del counts_final + return conds_final + finally: + for callback in comfy.patcher_extension.get_all_callbacks(IndexListCallbacks.EXECUTE_CLEANUP, self.callbacks): + callback(self, model, x_in, conds, timestep, model_options) + + def evaluate_context_windows(self, calc_cond_batch: Callable, model: BaseModel, x_in: torch.Tensor, conds, timestep: torch.Tensor, enumerated_context_windows: list[tuple[int, IndexListContextWindow]], + model_options, device=None, first_device=None): + results: list[ContextResults] = [] + for window_idx, window in enumerated_context_windows: + # allow processing to end between context window executions for faster Cancel + comfy.model_management.throw_exception_if_processing_interrupted() + + for callback in comfy.patcher_extension.get_all_callbacks(IndexListCallbacks.EVALUATE_CONTEXT_WINDOWS, self.callbacks): + callback(self, model, x_in, conds, timestep, model_options, window_idx, window, model_options, device, first_device) + + # update exposed params + model_options["transformer_options"]["context_window"] = window + # get subsections of x, timestep, conds + sub_x = window.get_tensor(x_in, device) + sub_timestep = window.get_tensor(timestep, device, dim=0) + sub_conds = [self.get_resized_cond(cond, x_in, window, device) for cond in conds] + + sub_conds_out = calc_cond_batch(model, sub_conds, sub_x, sub_timestep, model_options) + if device is not None: + for i in range(len(sub_conds_out)): + sub_conds_out[i] = sub_conds_out[i].to(x_in.device) + results.append(ContextResults(window_idx, sub_conds_out, sub_conds, window)) + return results + + + def combine_context_window_results(self, x_in: torch.Tensor, sub_conds_out, sub_conds, window: IndexListContextWindow, window_idx: int, total_windows: int, timestep: torch.Tensor, + conds_final: list[torch.Tensor], counts_final: list[torch.Tensor], biases_final: list[torch.Tensor]): + if self.fuse_method.name == ContextFuseMethods.RELATIVE: + for pos, idx in enumerate(window.index_list): + # bias is the influence of a specific index in relation to the whole context window + bias = 1 - abs(idx - (window.index_list[0] + window.index_list[-1]) / 2) / ((window.index_list[-1] - window.index_list[0] + 1e-2) / 2) + bias = max(1e-2, bias) + # take weighted average relative to total bias of current idx + for i in range(len(sub_conds_out)): + bias_total = biases_final[i][idx] + prev_weight = (bias_total / (bias_total + bias)) + new_weight = (bias / (bias_total + bias)) + # account for dims of tensors + idx_window = [slice(None)] * self.dim + [idx] + pos_window = [slice(None)] * self.dim + [pos] + # apply new values + conds_final[i][idx_window] = conds_final[i][idx_window] * prev_weight + sub_conds_out[i][pos_window] * new_weight + biases_final[i][idx] = bias_total + bias + else: + # add conds and counts based on weights of fuse method + weights = get_context_weights(window.context_length, x_in.shape[self.dim], window.index_list, self, sigma=timestep) + weights_tensor = match_weights_to_dim(weights, x_in, self.dim, 
device=x_in.device) + for i in range(len(sub_conds_out)): + window.add_window(conds_final[i], sub_conds_out[i] * weights_tensor) + window.add_window(counts_final[i], weights_tensor) + + for callback in comfy.patcher_extension.get_all_callbacks(IndexListCallbacks.COMBINE_CONTEXT_WINDOW_RESULTS, self.callbacks): + callback(self, x_in, sub_conds_out, sub_conds, window, window_idx, total_windows, timestep, conds_final, counts_final, biases_final) + + +def _prepare_sampling_wrapper(executor, model, noise_shape: torch.Tensor, *args, **kwargs): + # limit noise_shape length to context_length for more accurate vram use estimation + model_options = kwargs.get("model_options", None) + if model_options is None: + raise Exception("model_options not found in prepare_sampling_wrapper; this should never happen, something went wrong.") + handler: IndexListContextHandler = model_options.get("context_handler", None) + if handler is not None: + noise_shape = list(noise_shape) + noise_shape[handler.dim] = min(noise_shape[handler.dim], handler.context_length) + return executor(model, noise_shape, *args, **kwargs) + + +def create_prepare_sampling_wrapper(model: ModelPatcher): + model.add_wrapper_with_key( + comfy.patcher_extension.WrappersMP.PREPARE_SAMPLING, + "ContextWindows_prepare_sampling", + _prepare_sampling_wrapper + ) + + +def match_weights_to_dim(weights: list[float], x_in: torch.Tensor, dim: int, device=None) -> torch.Tensor: + total_dims = len(x_in.shape) + weights_tensor = torch.Tensor(weights).to(device=device) + for _ in range(dim): + weights_tensor = weights_tensor.unsqueeze(0) + for _ in range(total_dims - dim - 1): + weights_tensor = weights_tensor.unsqueeze(-1) + return weights_tensor + +def get_shape_for_dim(x_in: torch.Tensor, dim: int) -> list[int]: + total_dims = len(x_in.shape) + shape = [] + for _ in range(dim): + shape.append(1) + shape.append(x_in.shape[dim]) + for _ in range(total_dims - dim - 1): + shape.append(1) + return shape + +class ContextSchedules: + UNIFORM_LOOPED = "looped_uniform" + UNIFORM_STANDARD = "standard_uniform" + STATIC_STANDARD = "standard_static" + BATCHED = "batched" + + +# from https://github.com/neggles/animatediff-cli/blob/main/src/animatediff/pipelines/context.py +def create_windows_uniform_looped(num_frames: int, handler: IndexListContextHandler, model_options: dict[str]): + windows = [] + if num_frames < handler.context_length: + windows.append(list(range(num_frames))) + return windows + + context_stride = min(handler.context_stride, int(np.ceil(np.log2(num_frames / handler.context_length))) + 1) + # obtain uniform windows as normal, looping and all + for context_step in 1 << np.arange(context_stride): + pad = int(round(num_frames * ordered_halving(handler._step))) + for j in range( + int(ordered_halving(handler._step) * context_step) + pad, + num_frames + pad + (0 if handler.closed_loop else -handler.context_overlap), + (handler.context_length * context_step - handler.context_overlap), + ): + windows.append([e % num_frames for e in range(j, j + handler.context_length * context_step, context_step)]) + + return windows + +def create_windows_uniform_standard(num_frames: int, handler: IndexListContextHandler, model_options: dict[str]): + # unlike looped, uniform_straight does NOT allow windows that loop back to the beginning; + # instead, they get shifted to the corresponding end of the frames. + # in the case that a window (shifted or not) is identical to the previous one, it gets skipped. 
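The scheduling helpers in this file turn a long latent sequence into overlapping lists of frame indices that are evaluated separately by the model and then blended back together. As a concrete illustration, the standalone sketch below (not part of the patch) mirrors create_windows_static_standard defined further down, using hypothetical settings of 49 frames, context_length=16 and context_overlap=4:

```
# Minimal sketch (not part of the patch): what the static schedule produces.
def static_windows(num_frames: int, context_length: int, context_overlap: int) -> list[list[int]]:
    if num_frames <= context_length:
        return [list(range(num_frames))]
    windows = []
    delta = context_length - context_overlap
    for start_idx in range(0, num_frames, delta):
        if start_idx + context_length >= num_frames:
            # shift the final window back so every window keeps the same length
            windows.append(list(range(num_frames - context_length, num_frames)))
            break
        windows.append(list(range(start_idx, start_idx + context_length)))
    return windows

for w in static_windows(num_frames=49, context_length=16, context_overlap=4):
    print(w[0], "...", w[-1])  # 0...15, 12...27, 24...39, 33...48
```

Each window shares context_overlap indices with its neighbour, which is what the pyramid and overlap-linear weights later in the file rely on to blend the per-window outputs.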
+ windows = [] + if num_frames <= handler.context_length: + windows.append(list(range(num_frames))) + return windows + + context_stride = min(handler.context_stride, int(np.ceil(np.log2(num_frames / handler.context_length))) + 1) + # first, obtain uniform windows as normal, looping and all + for context_step in 1 << np.arange(context_stride): + pad = int(round(num_frames * ordered_halving(handler._step))) + for j in range( + int(ordered_halving(handler._step) * context_step) + pad, + num_frames + pad + (-handler.context_overlap), + (handler.context_length * context_step - handler.context_overlap), + ): + windows.append([e % num_frames for e in range(j, j + handler.context_length * context_step, context_step)]) + + # now that windows are created, shift any windows that loop, and delete duplicate windows + delete_idxs = [] + win_i = 0 + while win_i < len(windows): + # if window is rolls over itself, need to shift it + is_roll, roll_idx = does_window_roll_over(windows[win_i], num_frames) + if is_roll: + roll_val = windows[win_i][roll_idx] # roll_val might not be 0 for windows of higher strides + shift_window_to_end(windows[win_i], num_frames=num_frames) + # check if next window (cyclical) is missing roll_val + if roll_val not in windows[(win_i+1) % len(windows)]: + # need to insert new window here - just insert window starting at roll_val + windows.insert(win_i+1, list(range(roll_val, roll_val + handler.context_length))) + # delete window if it's not unique + for pre_i in range(0, win_i): + if windows[win_i] == windows[pre_i]: + delete_idxs.append(win_i) + break + win_i += 1 + + # reverse delete_idxs so that they will be deleted in an order that doesn't break idx correlation + delete_idxs.reverse() + for i in delete_idxs: + windows.pop(i) + + return windows + + +def create_windows_static_standard(num_frames: int, handler: IndexListContextHandler, model_options: dict[str]): + windows = [] + if num_frames <= handler.context_length: + windows.append(list(range(num_frames))) + return windows + # always return the same set of windows + delta = handler.context_length - handler.context_overlap + for start_idx in range(0, num_frames, delta): + # if past the end of frames, move start_idx back to allow same context_length + ending = start_idx + handler.context_length + if ending >= num_frames: + final_delta = ending - num_frames + final_start_idx = start_idx - final_delta + windows.append(list(range(final_start_idx, final_start_idx + handler.context_length))) + break + windows.append(list(range(start_idx, start_idx + handler.context_length))) + return windows + + +def create_windows_batched(num_frames: int, handler: IndexListContextHandler, model_options: dict[str]): + windows = [] + if num_frames <= handler.context_length: + windows.append(list(range(num_frames))) + return windows + # always return the same set of windows; + # no overlap, just cut up based on context_length; + # last window size will be different if num_frames % opts.context_length != 0 + for start_idx in range(0, num_frames, handler.context_length): + windows.append(list(range(start_idx, min(start_idx + handler.context_length, num_frames)))) + return windows + + +def create_windows_default(num_frames: int, handler: IndexListContextHandler): + return [list(range(num_frames))] + + +CONTEXT_MAPPING = { + ContextSchedules.UNIFORM_LOOPED: create_windows_uniform_looped, + ContextSchedules.UNIFORM_STANDARD: create_windows_uniform_standard, + ContextSchedules.STATIC_STANDARD: create_windows_static_standard, + ContextSchedules.BATCHED: 
create_windows_batched, +} + + +def get_matching_context_schedule(context_schedule: str) -> ContextSchedule: + func = CONTEXT_MAPPING.get(context_schedule, None) + if func is None: + raise ValueError(f"Unknown context_schedule '{context_schedule}'.") + return ContextSchedule(context_schedule, func) + + +def get_context_weights(length: int, full_length: int, idxs: list[int], handler: IndexListContextHandler, sigma: torch.Tensor=None): + return handler.fuse_method.func(length, sigma=sigma, handler=handler, full_length=full_length, idxs=idxs) + + +def create_weights_flat(length: int, **kwargs) -> list[float]: + # weight is the same for all + return [1.0] * length + +def create_weights_pyramid(length: int, **kwargs) -> list[float]: + # weight is based on the distance away from the edge of the context window; + # based on weighted average concept in FreeNoise paper + if length % 2 == 0: + max_weight = length // 2 + weight_sequence = list(range(1, max_weight + 1, 1)) + list(range(max_weight, 0, -1)) + else: + max_weight = (length + 1) // 2 + weight_sequence = list(range(1, max_weight, 1)) + [max_weight] + list(range(max_weight - 1, 0, -1)) + return weight_sequence + +def create_weights_overlap_linear(length: int, full_length: int, idxs: list[int], handler: IndexListContextHandler, **kwargs): + # based on code in Kijai's WanVideoWrapper: https://github.com/kijai/ComfyUI-WanVideoWrapper/blob/dbb2523b37e4ccdf45127e5ae33e31362f755c8e/nodes.py#L1302 + # only expected overlap is given different weights + weights_torch = torch.ones((length)) + # blend left-side on all except first window + if min(idxs) > 0: + ramp_up = torch.linspace(1e-37, 1, handler.context_overlap) + weights_torch[:handler.context_overlap] = ramp_up + # blend right-side on all except last window + if max(idxs) < full_length-1: + ramp_down = torch.linspace(1, 1e-37, handler.context_overlap) + weights_torch[-handler.context_overlap:] = ramp_down + return weights_torch + +class ContextFuseMethods: + FLAT = "flat" + PYRAMID = "pyramid" + RELATIVE = "relative" + OVERLAP_LINEAR = "overlap-linear" + + LIST = [PYRAMID, FLAT, OVERLAP_LINEAR] + LIST_STATIC = [PYRAMID, RELATIVE, FLAT, OVERLAP_LINEAR] + + +FUSE_MAPPING = { + ContextFuseMethods.FLAT: create_weights_flat, + ContextFuseMethods.PYRAMID: create_weights_pyramid, + ContextFuseMethods.RELATIVE: create_weights_pyramid, + ContextFuseMethods.OVERLAP_LINEAR: create_weights_overlap_linear, +} + +def get_matching_fuse_method(fuse_method: str) -> ContextFuseMethod: + func = FUSE_MAPPING.get(fuse_method, None) + if func is None: + raise ValueError(f"Unknown fuse_method '{fuse_method}'.") + return ContextFuseMethod(fuse_method, func) + +# Returns fraction that has denominator that is a power of 2 +def ordered_halving(val): + # get binary value, padded with 0s for 64 bits + bin_str = f"{val:064b}" + # flip binary value, padding included + bin_flip = bin_str[::-1] + # convert binary to int + as_int = int(bin_flip, 2) + # divide by 1 << 64, equivalent to 2**64, or 18446744073709551616, + # or b10000000000000000000000000000000000000000000000000000000000000000 (1 with 64 zero's) + return as_int / (1 << 64) + + +def get_missing_indexes(windows: list[list[int]], num_frames: int) -> list[int]: + all_indexes = list(range(num_frames)) + for w in windows: + for val in w: + try: + all_indexes.remove(val) + except ValueError: + pass + return all_indexes + + +def does_window_roll_over(window: list[int], num_frames: int) -> tuple[bool, int]: + prev_val = -1 + for i, val in enumerate(window): + val = val % 
num_frames + if val < prev_val: + return True, i + prev_val = val + return False, -1 + + +def shift_window_to_start(window: list[int], num_frames: int): + start_val = window[0] + for i in range(len(window)): + # 1) subtract each element by start_val to move vals relative to the start of all frames + # 2) add num_frames and take modulus to get adjusted vals + window[i] = ((window[i] - start_val) + num_frames) % num_frames + + +def shift_window_to_end(window: list[int], num_frames: int): + # 1) shift window to start + shift_window_to_start(window, num_frames) + end_val = window[-1] + end_delta = num_frames - end_val - 1 + for i in range(len(window)): + # 2) add end_delta to each val to slide windows to end + window[i] = window[i] + end_delta diff --git a/comfy/sampler_helpers.py b/comfy/sampler_helpers.py index 8dbc41455..e46971afb 100644 --- a/comfy/sampler_helpers.py +++ b/comfy/sampler_helpers.py @@ -149,7 +149,7 @@ def cleanup_models(conds, models): cleanup_additional_models(set(control_cleanup)) -def prepare_model_patcher(model: 'ModelPatcher', conds, model_options: dict): +def prepare_model_patcher(model: ModelPatcher, conds, model_options: dict): ''' Registers hooks from conds. ''' @@ -158,8 +158,8 @@ def prepare_model_patcher(model: 'ModelPatcher', conds, model_options: dict): for k in conds: get_hooks_from_cond(conds[k], hooks) # add wrappers and callbacks from ModelPatcher to transformer_options - model_options["transformer_options"]["wrappers"] = comfy.patcher_extension.copy_nested_dicts(model.wrappers) - model_options["transformer_options"]["callbacks"] = comfy.patcher_extension.copy_nested_dicts(model.callbacks) + comfy.patcher_extension.merge_nested_dicts(model_options["transformer_options"].setdefault("wrappers", {}), model.wrappers, copy_dict1=False) + comfy.patcher_extension.merge_nested_dicts(model_options["transformer_options"].setdefault("callbacks", {}), model.callbacks, copy_dict1=False) # begin registering hooks registered = comfy.hooks.HookGroup() target_dict = comfy.hooks.create_target_dict(comfy.hooks.EnumWeightTarget.Model) diff --git a/comfy/samplers.py b/comfy/samplers.py index ad2f40cdc..d5390d64e 100644 --- a/comfy/samplers.py +++ b/comfy/samplers.py @@ -16,6 +16,7 @@ import comfy.sampler_helpers import comfy.model_patcher import comfy.patcher_extension import comfy.hooks +import comfy.context_windows import scipy.stats import numpy @@ -198,14 +199,20 @@ def finalize_default_conds(model: 'BaseModel', hooked_to_run: dict[comfy.hooks.H hooked_to_run.setdefault(p.hooks, list()) hooked_to_run[p.hooks] += [(p, i)] -def calc_cond_batch(model: 'BaseModel', conds: list[list[dict]], x_in: torch.Tensor, timestep, model_options): +def calc_cond_batch(model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep, model_options: dict[str]): + handler: comfy.context_windows.ContextHandlerABC = model_options.get("context_handler", None) + if handler is None or not handler.should_use_context(model, conds, x_in, timestep, model_options): + return _calc_cond_batch_outer(model, conds, x_in, timestep, model_options) + return handler.execute(_calc_cond_batch_outer, model, conds, x_in, timestep, model_options) + +def _calc_cond_batch_outer(model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep, model_options): executor = comfy.patcher_extension.WrapperExecutor.new_executor( _calc_cond_batch, comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.CALC_COND_BATCH, model_options, is_model_options=True) ) return executor.execute(model, 
conds, x_in, timestep, model_options) -def _calc_cond_batch(model: 'BaseModel', conds: list[list[dict]], x_in: torch.Tensor, timestep, model_options): +def _calc_cond_batch(model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep, model_options): out_conds = [] out_counts = [] # separate conds by matching hooks diff --git a/comfy_extras/nodes_context_windows.py b/comfy_extras/nodes_context_windows.py new file mode 100644 index 000000000..1c3d9e697 --- /dev/null +++ b/comfy_extras/nodes_context_windows.py @@ -0,0 +1,89 @@ +from __future__ import annotations +from comfy_api.latest import ComfyExtension, io +import comfy.context_windows +import nodes + + +class ContextWindowsManualNode(io.ComfyNode): + @classmethod + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="ContextWindowsManual", + display_name="Context Windows (Manual)", + category="context", + description="Manually set context windows.", + inputs=[ + io.Model.Input("model", tooltip="The model to apply context windows to during sampling."), + io.Int.Input("context_length", min=1, default=16, tooltip="The length of the context window."), + io.Int.Input("context_overlap", min=0, default=4, tooltip="The overlap of the context window."), + io.Combo.Input("context_schedule", options=[ + comfy.context_windows.ContextSchedules.STATIC_STANDARD, + comfy.context_windows.ContextSchedules.UNIFORM_STANDARD, + comfy.context_windows.ContextSchedules.UNIFORM_LOOPED, + comfy.context_windows.ContextSchedules.BATCHED, + ], tooltip="The stride of the context window."), + io.Int.Input("context_stride", min=1, default=1, tooltip="The stride of the context window; only applicable to uniform schedules."), + io.Boolean.Input("closed_loop", default=False, tooltip="Whether to close the context window loop; only applicable to looped schedules."), + io.Combo.Input("fuse_method", options=comfy.context_windows.ContextFuseMethods.LIST_STATIC, default=comfy.context_windows.ContextFuseMethods.PYRAMID, tooltip="The method to use to fuse the context windows."), + io.Int.Input("dim", min=0, max=5, default=0, tooltip="The dimension to apply the context windows to."), + ], + outputs=[ + io.Model.Output(tooltip="The model with context windows applied during sampling."), + ], + is_experimental=True, + ) + + @classmethod + def execute(cls, model: io.Model.Type, context_length: int, context_overlap: int, context_schedule: str, context_stride: int, closed_loop: bool, fuse_method: str, dim: int) -> io.Model: + model = model.clone() + model.model_options["context_handler"] = comfy.context_windows.IndexListContextHandler( + context_schedule=comfy.context_windows.get_matching_context_schedule(context_schedule), + fuse_method=comfy.context_windows.get_matching_fuse_method(fuse_method), + context_length=context_length, + context_overlap=context_overlap, + context_stride=context_stride, + closed_loop=closed_loop, + dim=dim) + # make memory usage calculation only take into account the context window latents + comfy.context_windows.create_prepare_sampling_wrapper(model) + return io.NodeOutput(model) + +class WanContextWindowsManualNode(ContextWindowsManualNode): + @classmethod + def define_schema(cls) -> io.Schema: + schema = super().define_schema() + schema.node_id = "WanContextWindowsManual" + schema.display_name = "WAN Context Windows (Manual)" + schema.description = "Manually set context windows for WAN-like models (dim=2)." 
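+        # length/overlap are taken in frames here and converted to latent frames in
+        # execute() below via ((x - 1) // 4) + 1, matching WAN's 4x temporal compression.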
+ schema.inputs = [ + io.Model.Input("model", tooltip="The model to apply context windows to during sampling."), + io.Int.Input("context_length", min=1, max=nodes.MAX_RESOLUTION, step=4, default=81, tooltip="The length of the context window."), + io.Int.Input("context_overlap", min=0, default=30, tooltip="The overlap of the context window."), + io.Combo.Input("context_schedule", options=[ + comfy.context_windows.ContextSchedules.STATIC_STANDARD, + comfy.context_windows.ContextSchedules.UNIFORM_STANDARD, + comfy.context_windows.ContextSchedules.UNIFORM_LOOPED, + comfy.context_windows.ContextSchedules.BATCHED, + ], tooltip="The stride of the context window."), + io.Int.Input("context_stride", min=1, default=1, tooltip="The stride of the context window; only applicable to uniform schedules."), + io.Boolean.Input("closed_loop", default=False, tooltip="Whether to close the context window loop; only applicable to looped schedules."), + io.Combo.Input("fuse_method", options=comfy.context_windows.ContextFuseMethods.LIST_STATIC, default=comfy.context_windows.ContextFuseMethods.PYRAMID, tooltip="The method to use to fuse the context windows."), + ] + return schema + + @classmethod + def execute(cls, model: io.Model.Type, context_length: int, context_overlap: int, context_schedule: str, context_stride: int, closed_loop: bool, fuse_method: str) -> io.Model: + context_length = max(((context_length - 1) // 4) + 1, 1) # at least length 1 + context_overlap = max(((context_overlap - 1) // 4) + 1, 0) # at least overlap 0 + return super().execute(model, context_length, context_overlap, context_schedule, context_stride, closed_loop, fuse_method, dim=2) + + +class ContextWindowsExtension(ComfyExtension): + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + ContextWindowsManualNode, + WanContextWindowsManualNode, + ] + +def comfy_entrypoint(): + return ContextWindowsExtension() diff --git a/nodes.py b/nodes.py index 9448f9c1b..860a236aa 100644 --- a/nodes.py +++ b/nodes.py @@ -2320,6 +2320,7 @@ async def init_builtin_extra_nodes(): "nodes_camera_trajectory.py", "nodes_edit_model.py", "nodes_tcfg.py", + "nodes_context_windows.py", ] import_failed = [] From 72fd4d22b6a4fa11a3f737c9a633e7d635a42181 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 14 Aug 2025 13:03:21 -0700 Subject: [PATCH 133/243] av is an essential dependency. (#9341) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index bfb31a73f..551002b5b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,11 +20,11 @@ tqdm psutil alembic SQLAlchemy +av>=14.2.0 #non essential dependencies: kornia>=0.7.1 spandrel soundfile -av>=14.2.0 pydantic~=2.0 pydantic-settings~=2.0 From 644b23ac0b92442b475e44397c62aa8de929d546 Mon Sep 17 00:00:00 2001 From: filtered <176114999+webfiltered@users.noreply.github.com> Date: Fri, 15 Aug 2025 07:36:53 +1000 Subject: [PATCH 134/243] Make custom node testing checkbox optional in issue templates (#9342) The checkbox for confirming custom node testing is now optional in both bug report and user support templates. This allows users to submit issues even if they haven't been able to test with custom nodes disabled, making the reporting process more accessible. 
--- .github/ISSUE_TEMPLATE/bug-report.yml | 2 +- .github/ISSUE_TEMPLATE/user-support.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index 69ce998eb..3cf2717b7 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -22,7 +22,7 @@ body: description: Please confirm you have tried to reproduce the issue with all custom nodes disabled. options: - label: I have tried disabling custom nodes and the issue persists (see [how to disable custom nodes](https://docs.comfy.org/troubleshooting/custom-node-issues#step-1%3A-test-with-all-custom-nodes-disabled) if you need help) - required: true + required: false - type: textarea attributes: label: Expected Behavior diff --git a/.github/ISSUE_TEMPLATE/user-support.yml b/.github/ISSUE_TEMPLATE/user-support.yml index 50657d493..281661f92 100644 --- a/.github/ISSUE_TEMPLATE/user-support.yml +++ b/.github/ISSUE_TEMPLATE/user-support.yml @@ -18,7 +18,7 @@ body: description: Please confirm you have tried to reproduce the issue with all custom nodes disabled. options: - label: I have tried disabling custom nodes and the issue persists (see [how to disable custom nodes](https://docs.comfy.org/troubleshooting/custom-node-issues#step-1%3A-test-with-all-custom-nodes-disabled) if you need help) - required: true + required: false - type: textarea attributes: label: Your question From fa570cbf599657e73c636872616c0b1f8e74f692 Mon Sep 17 00:00:00 2001 From: Yoland Yan <4950057+yoland68@users.noreply.github.com> Date: Thu, 14 Aug 2025 16:44:22 -0700 Subject: [PATCH 135/243] Update CODEOWNERS (#9343) --- CODEOWNERS | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/CODEOWNERS b/CODEOWNERS index c4acbf06e..c8acd66d5 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -5,20 +5,21 @@ # Inlined the team members for now. 
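 # Each entry maps a path pattern to the users auto-requested for review; the last matching pattern wins.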
# Maintainers -*.md @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne -/tests/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne -/tests-unit/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne -/notebooks/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne -/script_examples/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne -/.github/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne -/requirements.txt @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne -/pyproject.toml @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne +*.md @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill +/tests/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill +/tests-unit/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill +/notebooks/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill +/script_examples/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill +/.github/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill +/requirements.txt @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill +/pyproject.toml @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill # Python web server -/api_server/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @christian-byrne -/app/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @christian-byrne -/utils/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @christian-byrne +/api_server/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @christian-byrne @guill +/app/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @christian-byrne @guill +/utils/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @christian-byrne @guill # Node developers -/comfy_extras/ @yoland68 @robinjhuang @pythongosssss @ltdrdata @Kosinkadink @webfiltered @christian-byrne -/comfy/comfy_types/ @yoland68 @robinjhuang @pythongosssss @ltdrdata @Kosinkadink @webfiltered @christian-byrne +/comfy_extras/ @yoland68 @robinjhuang @pythongosssss @ltdrdata @Kosinkadink @webfiltered @christian-byrne @guill +/comfy/comfy_types/ @yoland68 @robinjhuang @pythongosssss @ltdrdata @Kosinkadink @webfiltered @christian-byrne @guill +/comfy_api_nodes/ @yoland68 @robinjhuang @pythongosssss @ltdrdata @Kosinkadink @webfiltered @christian-byrne @guill From deebee4ff6fd1b2713683d22e5e2e07170daa867 Mon Sep 17 00:00:00 2001 From: guill Date: Thu, 14 Aug 2025 18:46:55 -0700 Subject: [PATCH 136/243] Update default parameters for Moonvalley video nodes (#9290) * Update default parameters for Moonvalley video nodes - Changed default negative prompts to a more extensive list for both BaseMoonvalleyVideoNode and MoonvalleyVideo2VideoNode. - Updated default guidance scale values for both nodes to enhance prompt adherence. - Set a fixed default seed value for consistency in video generation. 
* no message * ruff fix --------- Co-authored-by: thorsten --- comfy_api_nodes/nodes_moonvalley.py | 128 ++++++++++++++++++++-------- 1 file changed, 91 insertions(+), 37 deletions(-) diff --git a/comfy_api_nodes/nodes_moonvalley.py b/comfy_api_nodes/nodes_moonvalley.py index 164ca3ea5..806a70e06 100644 --- a/comfy_api_nodes/nodes_moonvalley.py +++ b/comfy_api_nodes/nodes_moonvalley.py @@ -1,6 +1,5 @@ import logging from typing import Any, Callable, Optional, TypeVar -import random import torch from comfy_api_nodes.util.validation_utils import ( get_image_dimensions, @@ -208,20 +207,29 @@ def _get_video_dimensions(video: VideoInput) -> tuple[int, int]: def _validate_video_dimensions(width: int, height: int) -> None: """Validates video dimensions meet Moonvalley V2V requirements.""" supported_resolutions = { - (1920, 1080), (1080, 1920), (1152, 1152), - (1536, 1152), (1152, 1536) + (1920, 1080), + (1080, 1920), + (1152, 1152), + (1536, 1152), + (1152, 1536), } if (width, height) not in supported_resolutions: - supported_list = ', '.join([f'{w}x{h}' for w, h in sorted(supported_resolutions)]) - raise ValueError(f"Resolution {width}x{height} not supported. Supported: {supported_list}") + supported_list = ", ".join( + [f"{w}x{h}" for w, h in sorted(supported_resolutions)] + ) + raise ValueError( + f"Resolution {width}x{height} not supported. Supported: {supported_list}" + ) def _validate_container_format(video: VideoInput) -> None: """Validates video container format is MP4.""" container_format = video.get_container_format() - if container_format not in ['mp4', 'mov,mp4,m4a,3gp,3g2,mj2']: - raise ValueError(f"Only MP4 container format supported. Got: {container_format}") + if container_format not in ["mp4", "mov,mp4,m4a,3gp,3g2,mj2"]: + raise ValueError( + f"Only MP4 container format supported. 
Got: {container_format}" + ) def _validate_and_trim_duration(video: VideoInput) -> VideoInput: @@ -244,7 +252,6 @@ def _trim_if_too_long(video: VideoInput, duration: float) -> VideoInput: return video - def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: """ Returns a new VideoInput object trimmed from the beginning to the specified duration, @@ -302,7 +309,9 @@ def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: # Calculate target frame count that's divisible by 16 fps = input_container.streams.video[0].average_rate estimated_frames = int(duration_sec * fps) - target_frames = (estimated_frames // 16) * 16 # Round down to nearest multiple of 16 + target_frames = ( + estimated_frames // 16 + ) * 16 # Round down to nearest multiple of 16 if target_frames == 0: raise ValueError("Video too short: need at least 16 frames for Moonvalley") @@ -424,7 +433,7 @@ class BaseMoonvalleyVideoNode: MoonvalleyTextToVideoInferenceParams, "negative_prompt", multiline=True, - default="low-poly, flat shader, bad rigging, stiff animation, uncanny eyes, low-quality textures, looping glitch, cheap effect, overbloom, bloom spam, default lighting, game asset, stiff face, ugly specular, AI artifacts", + default=" gopro, bright, contrast, static, overexposed, vignette, artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, flare, saturation, distorted, warped, wide angle, saturated, vibrant, glowing, cross dissolve, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, blown out, horrible, blurry, worst quality, bad, dissolve, melt, fade in, fade out, wobbly, weird, low quality, plastic, stock footage, video camera, boring", ), "resolution": ( IO.COMBO, @@ -441,12 +450,11 @@ class BaseMoonvalleyVideoNode: "tooltip": "Resolution of the output video", }, ), - # "length": (IO.COMBO,{"options":['5s','10s'], "default": '5s'}), "prompt_adherence": model_field_to_node_input( IO.FLOAT, MoonvalleyTextToVideoInferenceParams, "guidance_scale", - default=7.0, + default=10.0, step=1, min=1, max=20, @@ -455,13 +463,12 @@ class BaseMoonvalleyVideoNode: IO.INT, MoonvalleyTextToVideoInferenceParams, "seed", - default=random.randint(0, 2**32 - 1), + default=9, min=0, max=4294967295, step=1, display="number", tooltip="Random seed value", - control_after_generate=True, ), "steps": model_field_to_node_input( IO.INT, @@ -532,9 +539,11 @@ class MoonvalleyImg2VideoNode(BaseMoonvalleyVideoNode): # Get MIME type from tensor - assuming PNG format for image tensors mime_type = "image/png" - image_url = (await upload_images_to_comfyapi( - image, max_images=1, auth_kwargs=kwargs, mime_type=mime_type - ))[0] + image_url = ( + await upload_images_to_comfyapi( + image, max_images=1, auth_kwargs=kwargs, mime_type=mime_type + ) + )[0] request = MoonvalleyTextToVideoRequest( image_url=image_url, prompt_text=prompt, inference_params=inference_params @@ -570,17 +579,39 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): return { "required": { "prompt": model_field_to_node_input( - IO.STRING, MoonvalleyVideoToVideoRequest, "prompt_text", - multiline=True + IO.STRING, + MoonvalleyVideoToVideoRequest, + "prompt_text", + multiline=True, ), "negative_prompt": model_field_to_node_input( IO.STRING, MoonvalleyVideoToVideoInferenceParams, "negative_prompt", multiline=True, - default="low-poly, flat shader, bad rigging, stiff animation, uncanny eyes, low-quality textures, looping glitch, cheap effect, overbloom, bloom spam, default lighting, game asset, stiff face, ugly 
specular, AI artifacts" + default=" gopro, bright, contrast, static, overexposed, vignette, artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, flare, saturation, distorted, warped, wide angle, saturated, vibrant, glowing, cross dissolve, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, blown out, horrible, blurry, worst quality, bad, dissolve, melt, fade in, fade out, wobbly, weird, low quality, plastic, stock footage, video camera, boring", + ), + "seed": model_field_to_node_input( + IO.INT, + MoonvalleyVideoToVideoInferenceParams, + "seed", + default=9, + min=0, + max=4294967295, + step=1, + display="number", + tooltip="Random seed value", + control_after_generate=False, + ), + "prompt_adherence": model_field_to_node_input( + IO.FLOAT, + MoonvalleyVideoToVideoInferenceParams, + "guidance_scale", + default=10.0, + step=1, + min=1, + max=20, ), - "seed": model_field_to_node_input(IO.INT,MoonvalleyVideoToVideoInferenceParams, "seed", default=random.randint(0, 2**32 - 1), min=0, max=4294967295, step=1, display="number", tooltip="Random seed value", control_after_generate=True), }, "hidden": { "auth_token": "AUTH_TOKEN_COMFY_ORG", @@ -588,7 +619,14 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): "unique_id": "UNIQUE_ID", }, "optional": { - "video": (IO.VIDEO, {"default": "", "multiline": False, "tooltip": "The reference video used to generate the output video. Must be at least 5 seconds long. Videos longer than 5s will be automatically trimmed. Only MP4 format supported."}), + "video": ( + IO.VIDEO, + { + "default": "", + "multiline": False, + "tooltip": "The reference video used to generate the output video. Must be at least 5 seconds long. Videos longer than 5s will be automatically trimmed. 
Only MP4 format supported.", + }, + ), "control_type": ( ["Motion Transfer", "Pose Transfer"], {"default": "Motion Transfer"}, @@ -602,8 +640,14 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): "max": 100, "tooltip": "Only used if control_type is 'Motion Transfer'", }, - ) - } + ), + "image": model_field_to_node_input( + IO.IMAGE, + MoonvalleyTextToVideoRequest, + "image_url", + tooltip="The reference image used to generate the video", + ), + }, } RETURN_TYPES = ("VIDEO",) @@ -613,6 +657,7 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs ): video = kwargs.get("video") + image = kwargs.get("image", None) if not video: raise MoonvalleyApiError("video is required") @@ -620,8 +665,16 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): video_url = "" if video: validated_video = validate_video_to_video_input(video) - video_url = await upload_video_to_comfyapi(validated_video, auth_kwargs=kwargs) + video_url = await upload_video_to_comfyapi( + validated_video, auth_kwargs=kwargs + ) + mime_type = "image/png" + if not image is None: + validate_input_image(image, with_frame_conditioning=True) + image_url = await upload_images_to_comfyapi( + image=image, auth_kwargs=kwargs, max_images=1, mime_type=mime_type + ) control_type = kwargs.get("control_type") motion_intensity = kwargs.get("motion_intensity") @@ -631,12 +684,12 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): # Only include motion_intensity for Motion Transfer control_params = {} if control_type == "Motion Transfer" and motion_intensity is not None: - control_params['motion_intensity'] = motion_intensity + control_params["motion_intensity"] = motion_intensity - inference_params=MoonvalleyVideoToVideoInferenceParams( + inference_params = MoonvalleyVideoToVideoInferenceParams( negative_prompt=negative_prompt, seed=kwargs.get("seed"), - control_params=control_params + control_params=control_params, ) control = self.parseControlParameter(control_type) @@ -647,6 +700,7 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): prompt_text=prompt, inference_params=inference_params, ) + request.image_url = image_url if not image is None else None initial_operation = SynchronousOperation( endpoint=ApiEndpoint( @@ -694,15 +748,15 @@ class MoonvalleyTxt2VideoNode(BaseMoonvalleyVideoNode): validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) width_height = self.parseWidthHeightFromRes(kwargs.get("resolution")) - inference_params=MoonvalleyTextToVideoInferenceParams( - negative_prompt=negative_prompt, - steps=kwargs.get("steps"), - seed=kwargs.get("seed"), - guidance_scale=kwargs.get("prompt_adherence"), - num_frames=128, - width=width_height.get("width"), - height=width_height.get("height"), - ) + inference_params = MoonvalleyTextToVideoInferenceParams( + negative_prompt=negative_prompt, + steps=kwargs.get("steps"), + seed=kwargs.get("seed"), + guidance_scale=kwargs.get("prompt_adherence"), + num_frames=128, + width=width_height.get("width"), + height=width_height.get("height"), + ) request = MoonvalleyTextToVideoRequest( prompt_text=prompt, inference_params=inference_params ) From 5d65d6753b195d674ce16522d6c34f9a33f36269 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 15 Aug 2025 04:48:41 +0300 Subject: [PATCH 137/243] convert WAN nodes to V3 schema (#9201) --- comfy_extras/nodes_wan.py | 549 +++++++++++++++++++++----------------- 1 file changed, 298 
insertions(+), 251 deletions(-) diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index f80c83ba6..694a183f6 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -9,29 +9,35 @@ import comfy.clip_vision import json import numpy as np from typing import Tuple +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io -class WanImageToVideo: +class WanImageToVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "vae": ("VAE", ), - "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - }, - "optional": {"clip_vision_output": ("CLIP_VISION_OUTPUT", ), - "start_image": ("IMAGE", ), - }} + def define_schema(cls): + return io.Schema( + node_id="WanImageToVideo", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.ClipVisionOutput.Input("clip_vision_output", optional=True), + io.Image.Input("start_image", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + ) - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative", "latent") - FUNCTION = "encode" - - CATEGORY = "conditioning/video_models" - - def encode(self, positive, negative, vae, width, height, length, batch_size, start_image=None, clip_vision_output=None): + @classmethod + def execute(cls, positive, negative, vae, width, height, length, batch_size, start_image=None, clip_vision_output=None) -> io.NodeOutput: latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) if start_image is not None: start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) @@ -51,32 +57,36 @@ class WanImageToVideo: out_latent = {} out_latent["samples"] = latent - return (positive, negative, out_latent) + return io.NodeOutput(positive, negative, out_latent) -class WanFunControlToVideo: +class WanFunControlToVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "vae": ("VAE", ), - "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - }, - "optional": {"clip_vision_output": ("CLIP_VISION_OUTPUT", ), - "start_image": ("IMAGE", ), - "control_video": ("IMAGE", ), - }} + def define_schema(cls): + return io.Schema( + node_id="WanFunControlToVideo", + 
category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.ClipVisionOutput.Input("clip_vision_output", optional=True), + io.Image.Input("start_image", optional=True), + io.Image.Input("control_video", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + ) - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative", "latent") - FUNCTION = "encode" - - CATEGORY = "conditioning/video_models" - - def encode(self, positive, negative, vae, width, height, length, batch_size, start_image=None, clip_vision_output=None, control_video=None): + @classmethod + def execute(cls, positive, negative, vae, width, height, length, batch_size, start_image=None, clip_vision_output=None, control_video=None) -> io.NodeOutput: latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) concat_latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) concat_latent = comfy.latent_formats.Wan21().process_out(concat_latent) @@ -101,31 +111,34 @@ class WanFunControlToVideo: out_latent = {} out_latent["samples"] = latent - return (positive, negative, out_latent) + return io.NodeOutput(positive, negative, out_latent) -class Wan22FunControlToVideo: +class Wan22FunControlToVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "vae": ("VAE", ), - "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - }, - "optional": {"ref_image": ("IMAGE", ), - "control_video": ("IMAGE", ), - # "start_image": ("IMAGE", ), - }} + def define_schema(cls): + return io.Schema( + node_id="Wan22FunControlToVideo", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.Image.Input("ref_image", optional=True), + io.Image.Input("control_video", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + ) - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative", "latent") - FUNCTION = "encode" - - CATEGORY = "conditioning/video_models" - - def encode(self, positive, negative, vae, width, height, length, 
batch_size, ref_image=None, start_image=None, control_video=None): + @classmethod + def execute(cls, positive, negative, vae, width, height, length, batch_size, ref_image=None, start_image=None, control_video=None) -> io.NodeOutput: latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) concat_latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) concat_latent = comfy.latent_formats.Wan21().process_out(concat_latent) @@ -158,32 +171,36 @@ class Wan22FunControlToVideo: out_latent = {} out_latent["samples"] = latent - return (positive, negative, out_latent) + return io.NodeOutput(positive, negative, out_latent) -class WanFirstLastFrameToVideo: +class WanFirstLastFrameToVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "vae": ("VAE", ), - "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - }, - "optional": {"clip_vision_start_image": ("CLIP_VISION_OUTPUT", ), - "clip_vision_end_image": ("CLIP_VISION_OUTPUT", ), - "start_image": ("IMAGE", ), - "end_image": ("IMAGE", ), - }} + def define_schema(cls): + return io.Schema( + node_id="WanFirstLastFrameToVideo", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.ClipVisionOutput.Input("clip_vision_start_image", optional=True), + io.ClipVisionOutput.Input("clip_vision_end_image", optional=True), + io.Image.Input("start_image", optional=True), + io.Image.Input("end_image", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + ) - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative", "latent") - FUNCTION = "encode" - - CATEGORY = "conditioning/video_models" - - def encode(self, positive, negative, vae, width, height, length, batch_size, start_image=None, end_image=None, clip_vision_start_image=None, clip_vision_end_image=None): + @classmethod + def execute(cls, positive, negative, vae, width, height, length, batch_size, start_image=None, end_image=None, clip_vision_start_image=None, clip_vision_end_image=None) -> io.NodeOutput: latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) if start_image is not None: start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) @@ -224,62 +241,70 @@ class WanFirstLastFrameToVideo: out_latent = {} out_latent["samples"] = latent - return (positive, negative, out_latent) + return io.NodeOutput(positive, negative, out_latent) -class 
WanFunInpaintToVideo: +class WanFunInpaintToVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "vae": ("VAE", ), - "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - }, - "optional": {"clip_vision_output": ("CLIP_VISION_OUTPUT", ), - "start_image": ("IMAGE", ), - "end_image": ("IMAGE", ), - }} + def define_schema(cls): + return io.Schema( + node_id="WanFunInpaintToVideo", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.ClipVisionOutput.Input("clip_vision_output", optional=True), + io.Image.Input("start_image", optional=True), + io.Image.Input("end_image", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + ) - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative", "latent") - FUNCTION = "encode" - - CATEGORY = "conditioning/video_models" - - def encode(self, positive, negative, vae, width, height, length, batch_size, start_image=None, end_image=None, clip_vision_output=None): + @classmethod + def execute(cls, positive, negative, vae, width, height, length, batch_size, start_image=None, end_image=None, clip_vision_output=None) -> io.NodeOutput: flfv = WanFirstLastFrameToVideo() - return flfv.encode(positive, negative, vae, width, height, length, batch_size, start_image=start_image, end_image=end_image, clip_vision_start_image=clip_vision_output) + return flfv.execute(positive, negative, vae, width, height, length, batch_size, start_image=start_image, end_image=end_image, clip_vision_start_image=clip_vision_output) -class WanVaceToVideo: +class WanVaceToVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "vae": ("VAE", ), - "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1000.0, "step": 0.01}), - }, - "optional": {"control_video": ("IMAGE", ), - "control_masks": ("MASK", ), - "reference_image": ("IMAGE", ), - }} + def define_schema(cls): + return io.Schema( + node_id="WanVaceToVideo", + category="conditioning/video_models", + is_experimental=True, + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + 
io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.Float.Input("strength", default=1.0, min=0.0, max=1000.0, step=0.01), + io.Image.Input("control_video", optional=True), + io.Mask.Input("control_masks", optional=True), + io.Image.Input("reference_image", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + io.Int.Output(display_name="trim_latent"), + ], + ) - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT", "INT") - RETURN_NAMES = ("positive", "negative", "latent", "trim_latent") - FUNCTION = "encode" - - CATEGORY = "conditioning/video_models" - - EXPERIMENTAL = True - - def encode(self, positive, negative, vae, width, height, length, batch_size, strength, control_video=None, control_masks=None, reference_image=None): + @classmethod + def execute(cls, positive, negative, vae, width, height, length, batch_size, strength, control_video=None, control_masks=None, reference_image=None) -> io.NodeOutput: latent_length = ((length - 1) // 4) + 1 if control_video is not None: control_video = comfy.utils.common_upscale(control_video[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) @@ -336,52 +361,59 @@ class WanVaceToVideo: latent = torch.zeros([batch_size, 16, latent_length, height // 8, width // 8], device=comfy.model_management.intermediate_device()) out_latent = {} out_latent["samples"] = latent - return (positive, negative, out_latent, trim_latent) + return io.NodeOutput(positive, negative, out_latent, trim_latent) -class TrimVideoLatent: +class TrimVideoLatent(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "samples": ("LATENT",), - "trim_amount": ("INT", {"default": 0, "min": 0, "max": 99999}), - }} + def define_schema(cls): + return io.Schema( + node_id="TrimVideoLatent", + category="latent/video", + is_experimental=True, + inputs=[ + io.Latent.Input("samples"), + io.Int.Input("trim_amount", default=0, min=0, max=99999), + ], + outputs=[ + io.Latent.Output(), + ], + ) - RETURN_TYPES = ("LATENT",) - FUNCTION = "op" - - CATEGORY = "latent/video" - - EXPERIMENTAL = True - - def op(self, samples, trim_amount): + @classmethod + def execute(cls, samples, trim_amount) -> io.NodeOutput: samples_out = samples.copy() s1 = samples["samples"] samples_out["samples"] = s1[:, :, trim_amount:] - return (samples_out,) + return io.NodeOutput(samples_out) -class WanCameraImageToVideo: +class WanCameraImageToVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "vae": ("VAE", ), - "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - }, - "optional": {"clip_vision_output": ("CLIP_VISION_OUTPUT", ), - "start_image": ("IMAGE", ), - "camera_conditions": ("WAN_CAMERA_EMBEDDING", ), - }} + def define_schema(cls): + return io.Schema( + node_id="WanCameraImageToVideo", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Int.Input("width", default=832, min=16, 
max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.ClipVisionOutput.Input("clip_vision_output", optional=True), + io.Image.Input("start_image", optional=True), + io.WanCameraEmbedding.Input("camera_conditions", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + ) - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative", "latent") - FUNCTION = "encode" - - CATEGORY = "conditioning/video_models" - - def encode(self, positive, negative, vae, width, height, length, batch_size, start_image=None, clip_vision_output=None, camera_conditions=None): + @classmethod + def execute(cls, positive, negative, vae, width, height, length, batch_size, start_image=None, clip_vision_output=None, camera_conditions=None) -> io.NodeOutput: latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) concat_latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) concat_latent = comfy.latent_formats.Wan21().process_out(concat_latent) @@ -404,29 +436,34 @@ class WanCameraImageToVideo: out_latent = {} out_latent["samples"] = latent - return (positive, negative, out_latent) + return io.NodeOutput(positive, negative, out_latent) -class WanPhantomSubjectToVideo: +class WanPhantomSubjectToVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "vae": ("VAE", ), - "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - }, - "optional": {"images": ("IMAGE", ), - }} + def define_schema(cls): + return io.Schema( + node_id="WanPhantomSubjectToVideo", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.Image.Input("images", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative_text"), + io.Conditioning.Output(display_name="negative_img_text"), + io.Latent.Output(display_name="latent"), + ], + ) - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative_text", "negative_img_text", "latent") - FUNCTION = "encode" - - CATEGORY = "conditioning/video_models" - - def encode(self, positive, negative, vae, width, height, length, batch_size, images): + @classmethod + def execute(cls, positive, negative, vae, width, height, length, batch_size, images) -> io.NodeOutput: latent = torch.zeros([batch_size, 16, 
((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) cond2 = negative if images is not None: @@ -442,7 +479,7 @@ class WanPhantomSubjectToVideo: out_latent = {} out_latent["samples"] = latent - return (positive, cond2, negative, out_latent) + return io.NodeOutput(positive, cond2, negative, out_latent) def parse_json_tracks(tracks): """Parse JSON track data into a standardized format""" @@ -655,39 +692,41 @@ def patch_motion( return out_mask_full, out_feature_full -class WanTrackToVideo: +class WanTrackToVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "vae": ("VAE", ), - "tracks": ("STRING", {"multiline": True, "default": "[]"}), - "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - "temperature": ("FLOAT", {"default": 220.0, "min": 1.0, "max": 1000.0, "step": 0.1}), - "topk": ("INT", {"default": 2, "min": 1, "max": 10}), - "start_image": ("IMAGE", ), - }, - "optional": { - "clip_vision_output": ("CLIP_VISION_OUTPUT", ), - }} + def define_schema(cls): + return io.Schema( + node_id="WanPhantomSubjectToVideo", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.String.Input("tracks", multiline=True, default="[]"), + io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.Float.Input("temperature", default=220.0, min=1.0, max=1000.0, step=0.1), + io.Int.Input("topk", default=2, min=1, max=10), + io.Image.Input("start_image"), + io.ClipVisionOutput.Input("clip_vision_output", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + ) - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative", "latent") - FUNCTION = "encode" - - CATEGORY = "conditioning/video_models" - - def encode(self, positive, negative, vae, tracks, width, height, length, batch_size, - temperature, topk, start_image=None, clip_vision_output=None): + @classmethod + def execute(cls, positive, negative, vae, tracks, width, height, length, batch_size, + temperature, topk, start_image=None, clip_vision_output=None) -> io.NodeOutput: tracks_data = parse_json_tracks(tracks) if not tracks_data: - return WanImageToVideo().encode(positive, negative, vae, width, height, length, batch_size, start_image=start_image, clip_vision_output=clip_vision_output) + return WanImageToVideo().execute(positive, negative, vae, width, height, length, batch_size, start_image=start_image, clip_vision_output=clip_vision_output) latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) @@ -741,34 +780,36 @@ class WanTrackToVideo: out_latent = {} out_latent["samples"] = latent - return (positive, negative, out_latent) + return 
io.NodeOutput(positive, negative, out_latent) -class Wan22ImageToVideoLatent: +class Wan22ImageToVideoLatent(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"vae": ("VAE", ), - "width": ("INT", {"default": 1280, "min": 32, "max": nodes.MAX_RESOLUTION, "step": 32}), - "height": ("INT", {"default": 704, "min": 32, "max": nodes.MAX_RESOLUTION, "step": 32}), - "length": ("INT", {"default": 49, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - }, - "optional": {"start_image": ("IMAGE", ), - }} + def define_schema(cls): + return io.Schema( + node_id="Wan22ImageToVideoLatent", + category="conditioning/inpaint", + inputs=[ + io.Vae.Input("vae"), + io.Int.Input("width", default=1280, min=32, max=nodes.MAX_RESOLUTION, step=32), + io.Int.Input("height", default=704, min=32, max=nodes.MAX_RESOLUTION, step=32), + io.Int.Input("length", default=49, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.Image.Input("start_image", optional=True), + ], + outputs=[ + io.Latent.Output(), + ], + ) - - RETURN_TYPES = ("LATENT",) - FUNCTION = "encode" - - CATEGORY = "conditioning/inpaint" - - def encode(self, vae, width, height, length, batch_size, start_image=None): + @classmethod + def execute(cls, vae, width, height, length, batch_size, start_image=None) -> io.NodeOutput: latent = torch.zeros([1, 48, ((length - 1) // 4) + 1, height // 16, width // 16], device=comfy.model_management.intermediate_device()) if start_image is None: out_latent = {} out_latent["samples"] = latent - return (out_latent,) + return io.NodeOutput(out_latent) mask = torch.ones([latent.shape[0], 1, ((length - 1) // 4) + 1, latent.shape[-2], latent.shape[-1]], device=comfy.model_management.intermediate_device()) @@ -783,19 +824,25 @@ class Wan22ImageToVideoLatent: latent = latent_format.process_out(latent) * mask + latent * (1.0 - mask) out_latent["samples"] = latent.repeat((batch_size, ) + (1,) * (latent.ndim - 1)) out_latent["noise_mask"] = mask.repeat((batch_size, ) + (1,) * (mask.ndim - 1)) - return (out_latent,) + return io.NodeOutput(out_latent) -NODE_CLASS_MAPPINGS = { - "WanTrackToVideo": WanTrackToVideo, - "WanImageToVideo": WanImageToVideo, - "WanFunControlToVideo": WanFunControlToVideo, - "Wan22FunControlToVideo": Wan22FunControlToVideo, - "WanFunInpaintToVideo": WanFunInpaintToVideo, - "WanFirstLastFrameToVideo": WanFirstLastFrameToVideo, - "WanVaceToVideo": WanVaceToVideo, - "TrimVideoLatent": TrimVideoLatent, - "WanCameraImageToVideo": WanCameraImageToVideo, - "WanPhantomSubjectToVideo": WanPhantomSubjectToVideo, - "Wan22ImageToVideoLatent": Wan22ImageToVideoLatent, -} +class WanExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + WanTrackToVideo, + WanImageToVideo, + WanFunControlToVideo, + Wan22FunControlToVideo, + WanFunInpaintToVideo, + WanFirstLastFrameToVideo, + WanVaceToVideo, + TrimVideoLatent, + WanCameraImageToVideo, + WanPhantomSubjectToVideo, + Wan22ImageToVideoLatent, + ] + +async def comfy_entrypoint() -> WanExtension: + return WanExtension() From ad19a069f68a19566632b9bda3e72f4eed8a22d8 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 14 Aug 2025 20:16:01 -0700 Subject: [PATCH 138/243] Make SLG nodes work on Qwen Image model. 
(#9345) --- comfy/ldm/qwen_image/model.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index c15ab8e40..99843f88d 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -356,6 +356,7 @@ class QwenImageTransformer2DModel(nn.Module): context, attention_mask=None, guidance: torch.Tensor = None, + transformer_options={}, **kwargs ): timestep = timesteps @@ -383,14 +384,26 @@ class QwenImageTransformer2DModel(nn.Module): else self.time_text_embed(timestep, guidance, hidden_states) ) - for block in self.transformer_blocks: - encoder_hidden_states, hidden_states = block( - hidden_states=hidden_states, - encoder_hidden_states=encoder_hidden_states, - encoder_hidden_states_mask=encoder_hidden_states_mask, - temb=temb, - image_rotary_emb=image_rotary_emb, - ) + patches_replace = transformer_options.get("patches_replace", {}) + blocks_replace = patches_replace.get("dit", {}) + + for i, block in enumerate(self.transformer_blocks): + if ("double_block", i) in blocks_replace: + def block_wrap(args): + out = {} + out["txt"], out["img"] = block(hidden_states=args["img"], encoder_hidden_states=args["txt"], encoder_hidden_states_mask=encoder_hidden_states_mask, temb=args["vec"], image_rotary_emb=args["pe"]) + return out + out = blocks_replace[("double_block", i)]({"img": hidden_states, "txt": encoder_hidden_states, "vec": temb, "pe": image_rotary_emb}, {"original_block": block_wrap}) + hidden_states = out["img"] + encoder_hidden_states = out["txt"] + else: + encoder_hidden_states, hidden_states = block( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + encoder_hidden_states_mask=encoder_hidden_states_mask, + temb=temb, + image_rotary_emb=image_rotary_emb, + ) hidden_states = self.norm_out(hidden_states, temb) hidden_states = self.proj_out(hidden_states) From f0d5d0111f1f78bc8ce5d1f3968f19e40cd2ce7b Mon Sep 17 00:00:00 2001 From: "Xiangxi Guo (Ryan)" Date: Thu, 14 Aug 2025 20:41:37 -0700 Subject: [PATCH 139/243] Avoid torch compile graphbreak for older pytorch versions (#9344) Turns out torch.compile has some gaps in context manager decorator syntax support. I've sent patches to fix that in PyTorch, but it won't be available for all the folks running older versions of PyTorch, hence this trivial patch. --- comfy/ops.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/comfy/ops.py b/comfy/ops.py index be312d714..2be35f76a 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -41,9 +41,11 @@ try: SDPA_BACKEND_PRIORITY.insert(0, SDPBackend.CUDNN_ATTENTION) - @sdpa_kernel(backends=SDPA_BACKEND_PRIORITY, set_priority=True) def scaled_dot_product_attention(q, k, v, *args, **kwargs): - return torch.nn.functional.scaled_dot_product_attention(q, k, v, *args, **kwargs) + # Use this (rather than the decorator syntax) to eliminate graph + # break for pytorch < 2.9 + with sdpa_kernel(SDPA_BACKEND_PRIORITY, set_priority=True): + return torch.nn.functional.scaled_dot_product_attention(q, k, v, *args, **kwargs) except (ModuleNotFoundError, TypeError): logging.warning("Could not set sdpa backend priority.") From 4e5c230f6a957962961794c07f02be748076c771 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 14 Aug 2025 20:44:02 -0700 Subject: [PATCH 140/243] Fix last commit not working on older pytorch. 
(#9346) --- comfy/ops.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/comfy/ops.py b/comfy/ops.py index 2be35f76a..18e7db705 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -32,20 +32,21 @@ def scaled_dot_product_attention(q, k, v, *args, **kwargs): try: if torch.cuda.is_available(): from torch.nn.attention import SDPBackend, sdpa_kernel + import inspect + if "set_priority" in inspect.signature(sdpa_kernel).parameters: + SDPA_BACKEND_PRIORITY = [ + SDPBackend.FLASH_ATTENTION, + SDPBackend.EFFICIENT_ATTENTION, + SDPBackend.MATH, + ] - SDPA_BACKEND_PRIORITY = [ - SDPBackend.FLASH_ATTENTION, - SDPBackend.EFFICIENT_ATTENTION, - SDPBackend.MATH, - ] + SDPA_BACKEND_PRIORITY.insert(0, SDPBackend.CUDNN_ATTENTION) - SDPA_BACKEND_PRIORITY.insert(0, SDPBackend.CUDNN_ATTENTION) - - def scaled_dot_product_attention(q, k, v, *args, **kwargs): - # Use this (rather than the decorator syntax) to eliminate graph - # break for pytorch < 2.9 - with sdpa_kernel(SDPA_BACKEND_PRIORITY, set_priority=True): - return torch.nn.functional.scaled_dot_product_attention(q, k, v, *args, **kwargs) + def scaled_dot_product_attention(q, k, v, *args, **kwargs): + with sdpa_kernel(SDPA_BACKEND_PRIORITY, set_priority=True): + return torch.nn.functional.scaled_dot_product_attention(q, k, v, *args, **kwargs) + else: + logging.warning("Torch version too old to set sdpa backend priority.") except (ModuleNotFoundError, TypeError): logging.warning("Could not set sdpa backend priority.") From e08ecfbd8a9deda8939b14d7f1ff7d7139f1a4ed Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 14 Aug 2025 21:22:26 -0700 Subject: [PATCH 141/243] Add warning when using old pytorch. (#9347) --- comfy/rmsnorm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/comfy/rmsnorm.py b/comfy/rmsnorm.py index 66ae8321d..555542a46 100644 --- a/comfy/rmsnorm.py +++ b/comfy/rmsnorm.py @@ -1,6 +1,7 @@ import torch import comfy.model_management import numbers +import logging RMSNorm = None @@ -9,6 +10,7 @@ try: RMSNorm = torch.nn.RMSNorm except: rms_norm_torch = None + logging.warning("Please update pytorch to use native RMSNorm") def rms_norm(x, weight=None, eps=1e-6): From 027c63f63a7f5f380a4df1057c548410b0a87606 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 15 Aug 2025 21:57:47 +0300 Subject: [PATCH 142/243] fix(OpenAIGPTImage1): set correct MIME type for multipart uploads to OpenAI edits (#9348) --- comfy_api_nodes/nodes_openai.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/comfy_api_nodes/nodes_openai.py b/comfy_api_nodes/nodes_openai.py index ab3c5363b..cbff2b2d2 100644 --- a/comfy_api_nodes/nodes_openai.py +++ b/comfy_api_nodes/nodes_openai.py @@ -464,8 +464,6 @@ class OpenAIGPTImage1(ComfyNodeABC): path = "/proxy/openai/images/generations" content_type = "application/json" request_class = OpenAIImageGenerationRequest - img_binaries = [] - mask_binary = None files = [] if image is not None: @@ -484,14 +482,11 @@ class OpenAIGPTImage1(ComfyNodeABC): img_byte_arr = io.BytesIO() img.save(img_byte_arr, format="PNG") img_byte_arr.seek(0) - img_binary = img_byte_arr - img_binary.name = f"image_{i}.png" - img_binaries.append(img_binary) if batch_size == 1: - files.append(("image", img_binary)) + files.append(("image", (f"image_{i}.png", img_byte_arr, "image/png"))) else: - files.append(("image[]", img_binary)) + files.append(("image[]", (f"image_{i}.png", 
img_byte_arr, "image/png"))) if mask is not None: if image is None: @@ -511,9 +506,7 @@ class OpenAIGPTImage1(ComfyNodeABC): mask_img_byte_arr = io.BytesIO() mask_img.save(mask_img_byte_arr, format="PNG") mask_img_byte_arr.seek(0) - mask_binary = mask_img_byte_arr - mask_binary.name = "mask.png" - files.append(("mask", mask_binary)) + files.append(("mask", ("mask.png", mask_img_byte_arr, "image/png"))) # Build the operation operation = SynchronousOperation( From c308a8840aebf06649364e8e175862250a2d8823 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 15 Aug 2025 12:50:39 -0700 Subject: [PATCH 143/243] Add FluxKontextMultiReferenceLatentMethod node. (#9356) This node is only useful if someone trains the kontext model to properly use multiple reference images via the index method. The default is the offset method which feeds the multiple images like if they were stitched together as one. This method works with the current flux kontext model. --- comfy/ldm/flux/model.py | 24 ++++++++++++++++-------- comfy/model_base.py | 4 ++++ comfy_extras/nodes_flux.py | 19 +++++++++++++++++++ 3 files changed, 39 insertions(+), 8 deletions(-) diff --git a/comfy/ldm/flux/model.py b/comfy/ldm/flux/model.py index 8f4d99f54..c4de82795 100644 --- a/comfy/ldm/flux/model.py +++ b/comfy/ldm/flux/model.py @@ -224,19 +224,27 @@ class Flux(nn.Module): if ref_latents is not None: h = 0 w = 0 + index = 0 + index_ref_method = kwargs.get("ref_latents_method", "offset") == "index" for ref in ref_latents: - h_offset = 0 - w_offset = 0 - if ref.shape[-2] + h > ref.shape[-1] + w: - w_offset = w + if index_ref_method: + index += 1 + h_offset = 0 + w_offset = 0 else: - h_offset = h + index = 1 + h_offset = 0 + w_offset = 0 + if ref.shape[-2] + h > ref.shape[-1] + w: + w_offset = w + else: + h_offset = h + h = max(h, ref.shape[-2] + h_offset) + w = max(w, ref.shape[-1] + w_offset) - kontext, kontext_ids = self.process_img(ref, index=1, h_offset=h_offset, w_offset=w_offset) + kontext, kontext_ids = self.process_img(ref, index=index, h_offset=h_offset, w_offset=w_offset) img = torch.cat([img, kontext], dim=1) img_ids = torch.cat([img_ids, kontext_ids], dim=1) - h = max(h, ref.shape[-2] + h_offset) - w = max(w, ref.shape[-1] + w_offset) txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype) out = self.forward_orig(img, img_ids, context, txt_ids, timestep, y, guidance, control, transformer_options, attn_mask=kwargs.get("attention_mask", None)) diff --git a/comfy/model_base.py b/comfy/model_base.py index cde61df7c..bf874b875 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -890,6 +890,10 @@ class Flux(BaseModel): for lat in ref_latents: latents.append(self.process_latent_in(lat)) out['ref_latents'] = comfy.conds.CONDList(latents) + + ref_latents_method = kwargs.get("reference_latents_method", None) + if ref_latents_method is not None: + out['ref_latents_method'] = comfy.conds.CONDConstant(ref_latents_method) return out def extra_conds_shapes(self, **kwargs): diff --git a/comfy_extras/nodes_flux.py b/comfy_extras/nodes_flux.py index 8a8a17698..c8db75bb3 100644 --- a/comfy_extras/nodes_flux.py +++ b/comfy_extras/nodes_flux.py @@ -100,9 +100,28 @@ class FluxKontextImageScale: return (image, ) +class FluxKontextMultiReferenceLatentMethod: + @classmethod + def INPUT_TYPES(s): + return {"required": { + "conditioning": ("CONDITIONING", ), + "reference_latents_method": (("offset", "index"), ), + }} + + RETURN_TYPES = ("CONDITIONING",) + 
FUNCTION = "append" + EXPERIMENTAL = True + + CATEGORY = "advanced/conditioning/flux" + + def append(self, conditioning, reference_latents_method): + c = node_helpers.conditioning_set_values(conditioning, {"reference_latents_method": reference_latents_method}) + return (c, ) + NODE_CLASS_MAPPINGS = { "CLIPTextEncodeFlux": CLIPTextEncodeFlux, "FluxGuidance": FluxGuidance, "FluxDisableGuidance": FluxDisableGuidance, "FluxKontextImageScale": FluxKontextImageScale, + "FluxKontextMultiReferenceLatentMethod": FluxKontextMultiReferenceLatentMethod, } From 1702e6df16b0a52e147f19e3d5c5548c25a64339 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 15 Aug 2025 14:29:58 -0700 Subject: [PATCH 144/243] Implement wan2.2 camera model. (#9357) Use the old WanCameraImageToVideo node. --- comfy/ldm/wan/model.py | 7 ++++++- comfy/model_detection.py | 5 ++++- comfy/supported_models.py | 14 +++++++++++++- comfy_extras/nodes_wan.py | 7 +++++-- 4 files changed, 28 insertions(+), 5 deletions(-) diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index 4e2d99566..9d3741be3 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -768,7 +768,12 @@ class CameraWanModel(WanModel): operations=None, ): - super().__init__(model_type='i2v', patch_size=patch_size, text_len=text_len, in_dim=in_dim, dim=dim, ffn_dim=ffn_dim, freq_dim=freq_dim, text_dim=text_dim, out_dim=out_dim, num_heads=num_heads, num_layers=num_layers, window_size=window_size, qk_norm=qk_norm, cross_attn_norm=cross_attn_norm, eps=eps, flf_pos_embed_token_number=flf_pos_embed_token_number, image_model=image_model, device=device, dtype=dtype, operations=operations) + if model_type == 'camera': + model_type = 'i2v' + else: + model_type = 't2v' + + super().__init__(model_type=model_type, patch_size=patch_size, text_len=text_len, in_dim=in_dim, dim=dim, ffn_dim=ffn_dim, freq_dim=freq_dim, text_dim=text_dim, out_dim=out_dim, num_heads=num_heads, num_layers=num_layers, window_size=window_size, qk_norm=qk_norm, cross_attn_norm=cross_attn_norm, eps=eps, flf_pos_embed_token_number=flf_pos_embed_token_number, image_model=image_model, device=device, dtype=dtype, operations=operations) operation_settings = {"operations": operations, "device": device, "dtype": dtype} self.control_adapter = WanCamAdapter(in_dim_control_adapter, dim, kernel_size=patch_size[1:], stride=patch_size[1:], operation_settings=operation_settings) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 8acc51e20..2bec0541e 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -364,7 +364,10 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["vace_in_dim"] = state_dict['{}vace_patch_embedding.weight'.format(key_prefix)].shape[1] dit_config["vace_layers"] = count_blocks(state_dict_keys, '{}vace_blocks.'.format(key_prefix) + '{}.') elif '{}control_adapter.conv.weight'.format(key_prefix) in state_dict_keys: - dit_config["model_type"] = "camera" + if '{}img_emb.proj.0.bias'.format(key_prefix) in state_dict_keys: + dit_config["model_type"] = "camera" + else: + dit_config["model_type"] = "camera_2.2" else: if '{}img_emb.proj.0.bias'.format(key_prefix) in state_dict_keys: dit_config["model_type"] = "i2v" diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 156ff9e26..7ed6dfd69 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1046,6 +1046,18 @@ class WAN21_Camera(WAN21_T2V): def get_model(self, state_dict, prefix="", 
device=None): out = model_base.WAN21_Camera(self, image_to_video=False, device=device) return out + +class WAN22_Camera(WAN21_T2V): + unet_config = { + "image_model": "wan2.1", + "model_type": "camera_2.2", + "in_dim": 36, + } + + def get_model(self, state_dict, prefix="", device=None): + out = model_base.WAN21_Camera(self, image_to_video=False, device=device) + return out + class WAN21_Vace(WAN21_T2V): unet_config = { "image_model": "wan2.1", @@ -1260,6 +1272,6 @@ class QwenImage(supported_models_base.BASE): return supported_models_base.ClipTarget(comfy.text_encoders.qwen_image.QwenImageTokenizer, comfy.text_encoders.qwen_image.te(**hunyuan_detect)) -models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma, ACEStep, Omnigen2, QwenImage] +models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma, ACEStep, Omnigen2, QwenImage] models += [SVD_img2vid] diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 694a183f6..83a990688 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -422,9 +422,12 @@ class WanCameraImageToVideo(io.ComfyNode): start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) concat_latent_image = vae.encode(start_image[:, :, :, :3]) concat_latent[:,:,:concat_latent_image.shape[2]] = concat_latent_image[:,:,:concat_latent.shape[2]] + mask = torch.ones((1, 1, latent.shape[2] * 4, latent.shape[-2], latent.shape[-1])) + mask[:, :, :start_image.shape[0] + 3] = 0.0 + mask = mask.view(1, mask.shape[2] // 4, 4, mask.shape[3], mask.shape[4]).transpose(1, 2) - positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent}) - negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent}) + positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent, "concat_mask": mask}) + negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent, "concat_mask": mask}) if camera_conditions is not None: positive = node_helpers.conditioning_set_values(positive, {'camera_conditions': camera_conditions}) From ed2e33c69a291094c4fcc13d8426c49844a6363c Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Fri, 15 Aug 2025 20:32:58 -0700 Subject: [PATCH 145/243] bump frontend version to 1.25.8 (#9361) --- requirements.txt | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 551002b5b..2ae44ebe1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.24.4 +comfyui-frontend-package==1.25.8 comfyui-workflow-templates==0.1.59 comfyui-embedded-docs==0.2.6 torch From 20a84166d0d37dd6833caa6cadf3bfac8c241b48 Mon Sep 17 00:00:00 2001 From: Terry Jia Date: Sat, 16 Aug 2025 02:07:12 -0400 Subject: [PATCH 146/243] record audio node (#8716) * record audio node * sf --- comfy_extras/nodes_audio.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py index a90b31779..3b23f65d8 100644 --- a/comfy_extras/nodes_audio.py +++ b/comfy_extras/nodes_audio.py @@ -346,6 +346,24 @@ class LoadAudio: return "Invalid audio file: {}".format(audio) return True +class RecordAudio: + @classmethod + def INPUT_TYPES(s): + return {"required": {"audio": ("AUDIO_RECORD", {})}} + + CATEGORY = "audio" + + RETURN_TYPES = ("AUDIO", ) + FUNCTION = "load" + + def load(self, audio): + audio_path = folder_paths.get_annotated_filepath(audio) + + waveform, sample_rate = torchaudio.load(audio_path) + audio = {"waveform": waveform.unsqueeze(0), "sample_rate": sample_rate} + return (audio, ) + + NODE_CLASS_MAPPINGS = { "EmptyLatentAudio": EmptyLatentAudio, "VAEEncodeAudio": VAEEncodeAudio, @@ -356,6 +374,7 @@ NODE_CLASS_MAPPINGS = { "LoadAudio": LoadAudio, "PreviewAudio": PreviewAudio, "ConditioningStableAudio": ConditioningStableAudio, + "RecordAudio": RecordAudio, } NODE_DISPLAY_NAME_MAPPINGS = { @@ -367,4 +386,5 @@ NODE_DISPLAY_NAME_MAPPINGS = { "SaveAudio": "Save Audio (FLAC)", "SaveAudioMP3": "Save Audio (MP3)", "SaveAudioOpus": "Save Audio (Opus)", + "RecordAudio": "Record Audio", } From 0f2b8525bcafe213e8421a49564a90f926e81f2e Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 16 Aug 2025 14:51:28 -0700 Subject: [PATCH 147/243] Qwen image model refactor. 
(#9375) --- comfy/ldm/qwen_image/model.py | 36 +++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index 99843f88d..40d8fd979 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -333,21 +333,25 @@ class QwenImageTransformer2DModel(nn.Module): self.proj_out = operations.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True, dtype=dtype, device=device) self.gradient_checkpointing = False - def pos_embeds(self, x, context): + def process_img(self, x, index=0, h_offset=0, w_offset=0): bs, c, t, h, w = x.shape patch_size = self.patch_size + hidden_states = comfy.ldm.common_dit.pad_to_patch_size(x, (1, self.patch_size, self.patch_size)) + orig_shape = hidden_states.shape + hidden_states = hidden_states.view(orig_shape[0], orig_shape[1], orig_shape[-2] // 2, 2, orig_shape[-1] // 2, 2) + hidden_states = hidden_states.permute(0, 2, 4, 1, 3, 5) + hidden_states = hidden_states.reshape(orig_shape[0], (orig_shape[-2] // 2) * (orig_shape[-1] // 2), orig_shape[1] * 4) h_len = ((h + (patch_size // 2)) // patch_size) w_len = ((w + (patch_size // 2)) // patch_size) - img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype) - img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1) - img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) - img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs) + h_offset = ((h_offset + (patch_size // 2)) // patch_size) + w_offset = ((w_offset + (patch_size // 2)) // patch_size) - txt_start = round(max(h_len, w_len)) - txt_ids = torch.linspace(txt_start, txt_start + context.shape[1], steps=context.shape[1], device=x.device, dtype=x.dtype).reshape(1, -1, 1).repeat(bs, 1, 3) - ids = torch.cat((txt_ids, img_ids), dim=1) - return self.pe_embedder(ids).squeeze(1).unsqueeze(2).to(x.dtype) + img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype) + img_ids[:, :, 0] = img_ids[:, :, 1] + index + img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1) + img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) + return hidden_states, repeat(img_ids, "h w c -> b (h w) c", b=bs), orig_shape def forward( self, @@ -363,13 +367,13 @@ class QwenImageTransformer2DModel(nn.Module): encoder_hidden_states = context encoder_hidden_states_mask = attention_mask - image_rotary_emb = self.pos_embeds(x, context) + hidden_states, img_ids, orig_shape = self.process_img(x) + num_embeds = hidden_states.shape[1] - hidden_states = comfy.ldm.common_dit.pad_to_patch_size(x, (1, self.patch_size, self.patch_size)) - orig_shape = hidden_states.shape - hidden_states = hidden_states.view(orig_shape[0], orig_shape[1], orig_shape[-2] // 2, 2, orig_shape[-1] // 2, 2) - hidden_states = hidden_states.permute(0, 2, 4, 1, 3, 5) - hidden_states = hidden_states.reshape(orig_shape[0], (orig_shape[-2] // 2) * (orig_shape[-1] // 2), orig_shape[1] * 4) + txt_start = round(max(((x.shape[-1] + (self.patch_size // 2)) // self.patch_size), ((x.shape[-2] + (self.patch_size // 2)) // self.patch_size))) + txt_ids = torch.linspace(txt_start, txt_start + context.shape[1], steps=context.shape[1], device=x.device, dtype=x.dtype).reshape(1, 
-1, 1).repeat(x.shape[0], 1, 3) + ids = torch.cat((txt_ids, img_ids), dim=1) + image_rotary_emb = self.pe_embedder(ids).squeeze(1).unsqueeze(2).to(x.dtype) hidden_states = self.img_in(hidden_states) encoder_hidden_states = self.txt_norm(encoder_hidden_states) @@ -408,6 +412,6 @@ class QwenImageTransformer2DModel(nn.Module): hidden_states = self.norm_out(hidden_states, temb) hidden_states = self.proj_out(hidden_states) - hidden_states = hidden_states.view(orig_shape[0], orig_shape[-2] // 2, orig_shape[-1] // 2, orig_shape[1], 2, 2) + hidden_states = hidden_states[:, :num_embeds].view(orig_shape[0], orig_shape[-2] // 2, orig_shape[-1] // 2, orig_shape[1], 2, 2) hidden_states = hidden_states.permute(0, 3, 1, 4, 2, 5) return hidden_states.reshape(orig_shape)[:, :, :, :x.shape[-2], :x.shape[-1]] From ed43784b0d04e5b8e8ff2c057fa84b9c5132aaf2 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 17 Aug 2025 13:45:39 -0700 Subject: [PATCH 148/243] WIP Qwen edit model: The diffusion model part. (#9383) --- comfy/ldm/qwen_image/model.py | 26 ++++++++++++++++++++++++++ comfy/model_base.py | 10 ++++++++++ 2 files changed, 36 insertions(+) diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index 40d8fd979..a3c726299 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -360,6 +360,7 @@ class QwenImageTransformer2DModel(nn.Module): context, attention_mask=None, guidance: torch.Tensor = None, + ref_latents=None, transformer_options={}, **kwargs ): @@ -370,6 +371,31 @@ class QwenImageTransformer2DModel(nn.Module): hidden_states, img_ids, orig_shape = self.process_img(x) num_embeds = hidden_states.shape[1] + if ref_latents is not None: + h = 0 + w = 0 + index = 0 + index_ref_method = kwargs.get("ref_latents_method", "index") == "index" + for ref in ref_latents: + if index_ref_method: + index += 1 + h_offset = 0 + w_offset = 0 + else: + index = 1 + h_offset = 0 + w_offset = 0 + if ref.shape[-2] + h > ref.shape[-1] + w: + w_offset = w + else: + h_offset = h + h = max(h, ref.shape[-2] + h_offset) + w = max(w, ref.shape[-1] + w_offset) + + kontext, kontext_ids, _ = self.process_img(ref, index=index, h_offset=h_offset, w_offset=w_offset) + hidden_states = torch.cat([hidden_states, kontext], dim=1) + img_ids = torch.cat([img_ids, kontext_ids], dim=1) + txt_start = round(max(((x.shape[-1] + (self.patch_size // 2)) // self.patch_size), ((x.shape[-2] + (self.patch_size // 2)) // self.patch_size))) txt_ids = torch.linspace(txt_start, txt_start + context.shape[1], steps=context.shape[1], device=x.device, dtype=x.dtype).reshape(1, -1, 1).repeat(x.shape[0], 1, 3) ids = torch.cat((txt_ids, img_ids), dim=1) diff --git a/comfy/model_base.py b/comfy/model_base.py index bf874b875..15bd7abef 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1331,4 +1331,14 @@ class QwenImage(BaseModel): cross_attn = kwargs.get("cross_attn", None) if cross_attn is not None: out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) + ref_latents = kwargs.get("reference_latents", None) + if ref_latents is not None: + latents = [] + for lat in ref_latents: + latents.append(self.process_latent_in(lat)) + out['ref_latents'] = comfy.conds.CONDList(latents) + + ref_latents_method = kwargs.get("reference_latents_method", None) + if ref_latents_method is not None: + out['ref_latents_method'] = comfy.conds.CONDConstant(ref_latents_method) return out From d4e353a94ec5a8cb15ed151990a9518b890e5d4f Mon Sep 17 00:00:00 2001 From: ComfyUI 
Wiki Date: Mon, 18 Aug 2025 05:38:40 +0800 Subject: [PATCH 149/243] Update template to 0.1.60 (#9377) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 2ae44ebe1..72a700028 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.25.8 -comfyui-workflow-templates==0.1.59 +comfyui-workflow-templates==0.1.60 comfyui-embedded-docs==0.2.6 torch torchsde From 7f3b9b16c6636cb1201213574892d33c2a35e4ba Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Sun, 17 Aug 2025 15:54:07 -0700 Subject: [PATCH 150/243] Make step index detection much more robust (#9392) --- comfy/context_windows.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/comfy/context_windows.py b/comfy/context_windows.py index 928b111df..041f380f9 100644 --- a/comfy/context_windows.py +++ b/comfy/context_windows.py @@ -164,8 +164,11 @@ class IndexListContextHandler(ContextHandlerABC): return resized_cond def set_step(self, timestep: torch.Tensor, model_options: dict[str]): - indexes = torch.where(model_options["transformer_options"]["sample_sigmas"] == timestep[0]) - self._step = int(indexes[0]) + mask = torch.isclose(model_options["transformer_options"]["sample_sigmas"], timestep, rtol=0.0001) + matches = torch.nonzero(mask) + if torch.numel(matches) == 0: + raise Exception("No sample_sigmas matched current timestep; something went wrong.") + self._step = int(matches[0].item()) def get_context_windows(self, model: BaseModel, x_in: torch.Tensor, model_options: dict[str]) -> list[IndexListContextWindow]: full_length = x_in.size(self.dim) # TODO: choose dim based on model From da2efeaec6609265051165bfb413a2a4c84cf4bb Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Sun, 17 Aug 2025 20:21:02 -0700 Subject: [PATCH 151/243] Bump frontend to 1.25.9 (#9394) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 72a700028..c7a5c47ab 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.25.8 +comfyui-frontend-package==1.25.9 comfyui-workflow-templates==0.1.60 comfyui-embedded-docs==0.2.6 torch From bd2ab73976a4e245db3e057795328c89bfd98a88 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Mon, 18 Aug 2025 10:26:55 +0300 Subject: [PATCH 152/243] fix(WAN-nodes): invalid nodeid for WanTrackToVideo (#9396) --- comfy_extras/nodes_wan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 83a990688..0fff02f76 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -699,7 +699,7 @@ class WanTrackToVideo(io.ComfyNode): @classmethod def define_schema(cls): return io.Schema( - node_id="WanPhantomSubjectToVideo", + node_id="WanTrackToVideo", category="conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), From 4977f203fa8e9e3ab22884c8ace8f9b540d48952 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 18 Aug 2025 19:38:34 -0700 Subject: [PATCH 153/243] P2 of qwen edit model. (#9412) * P2 of qwen edit model. * Typo. * Fix normal qwen. * Fix. * Make the TextEncodeQwenImageEdit also set the ref latent. If you don't want it to set the ref latent and want to use the ReferenceLatent node with your custom latent instead just disconnect the VAE. 
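For illustration only (not the node's literal code): a minimal sketch of the behaviour described above. The wrapper name qwen_edit_encode and the resize policy are assumptions; clip.tokenize(..., images=...), vae.encode(), node_helpers.conditioning_set_values() and the "reference_latents" conditioning key are taken from the patches in this series.

    import node_helpers
    import comfy.utils

    def qwen_edit_encode(clip, prompt, image=None, vae=None):
        # Tokenize the prompt together with the optional image so the
        # Qwen2.5-VL text encoder can attend to the picture.
        images = []
        if image is not None:
            # Snap the image to a multiple of 8 pixels for the VAE
            # (the exact resize policy here is an assumption).
            samples = image.movedim(-1, 1)
            samples = comfy.utils.common_upscale(samples, samples.shape[3] // 8 * 8,
                                                 samples.shape[2] // 8 * 8, "area", "disabled")
            image = samples.movedim(1, -1)
            images = [image]
        tokens = clip.tokenize(prompt, images=images)
        conditioning = clip.encode_from_tokens_scheduled(tokens)
        # Only when a VAE is connected is the reference latent attached; leave
        # vae=None and wire a ReferenceLatent node to supply a custom latent instead.
        if vae is not None and image is not None:
            ref_latent = vae.encode(image[:, :, :, :3])
            conditioning = node_helpers.conditioning_set_values(
                conditioning, {"reference_latents": [ref_latent]})
        return conditioning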
--- comfy/clip_model.py | 2 +- comfy/model_base.py | 8 + comfy/sd1_clip.py | 11 +- comfy/text_encoders/bert.py | 2 +- comfy/text_encoders/llama.py | 43 ++- comfy/text_encoders/qwen_image.py | 20 +- comfy/text_encoders/qwen_vl.py | 428 ++++++++++++++++++++++++++++++ comfy/text_encoders/t5.py | 2 +- comfy_extras/nodes_qwen.py | 63 +++++ nodes.py | 1 + 10 files changed, 565 insertions(+), 15 deletions(-) create mode 100644 comfy/text_encoders/qwen_vl.py create mode 100644 comfy_extras/nodes_qwen.py diff --git a/comfy/clip_model.py b/comfy/clip_model.py index c8294d483..7e47d8a55 100644 --- a/comfy/clip_model.py +++ b/comfy/clip_model.py @@ -97,7 +97,7 @@ class CLIPTextModel_(torch.nn.Module): self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations) self.final_layer_norm = operations.LayerNorm(embed_dim, dtype=dtype, device=device) - def forward(self, input_tokens=None, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=torch.float32): + def forward(self, input_tokens=None, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=torch.float32, embeds_info=[]): if embeds is not None: x = embeds + comfy.ops.cast_to(self.embeddings.position_embedding.weight, dtype=dtype, device=embeds.device) else: diff --git a/comfy/model_base.py b/comfy/model_base.py index 15bd7abef..6c861b15e 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1325,6 +1325,7 @@ class Omnigen2(BaseModel): class QwenImage(BaseModel): def __init__(self, model_config, model_type=ModelType.FLUX, device=None): super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.qwen_image.model.QwenImageTransformer2DModel) + self.memory_usage_factor_conds = ("ref_latents",) def extra_conds(self, **kwargs): out = super().extra_conds(**kwargs) @@ -1342,3 +1343,10 @@ class QwenImage(BaseModel): if ref_latents_method is not None: out['ref_latents_method'] = comfy.conds.CONDConstant(ref_latents_method) return out + + def extra_conds_shapes(self, **kwargs): + out = {} + ref_latents = kwargs.get("reference_latents", None) + if ref_latents is not None: + out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()), ref_latents)) // 16]) + return out diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py index ade340fd1..1e8adbe69 100644 --- a/comfy/sd1_clip.py +++ b/comfy/sd1_clip.py @@ -204,17 +204,19 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder): tokens_embed = self.transformer.get_input_embeddings()(tokens_embed, out_dtype=torch.float32) index = 0 pad_extra = 0 + embeds_info = [] for o in other_embeds: emb = o[1] if torch.is_tensor(emb): emb = {"type": "embedding", "data": emb} + extra = None emb_type = emb.get("type", None) if emb_type == "embedding": emb = emb.get("data", None) else: if hasattr(self.transformer, "preprocess_embed"): - emb = self.transformer.preprocess_embed(emb, device=device) + emb, extra = self.transformer.preprocess_embed(emb, device=device) else: emb = None @@ -229,6 +231,7 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder): tokens_embed = torch.cat([tokens_embed[:, :ind], emb, tokens_embed[:, ind:]], dim=1) attention_mask = attention_mask[:ind] + [1] * emb_shape + attention_mask[ind:] index += emb_shape - 1 + embeds_info.append({"type": emb_type, "index": ind, "size": emb_shape, "extra": extra}) else: index += -1 pad_extra += emb_shape @@ -243,11 +246,11 @@ 
class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder): attention_masks.append(attention_mask) num_tokens.append(sum(attention_mask)) - return torch.cat(embeds_out), torch.tensor(attention_masks, device=device, dtype=torch.long), num_tokens + return torch.cat(embeds_out), torch.tensor(attention_masks, device=device, dtype=torch.long), num_tokens, embeds_info def forward(self, tokens): device = self.transformer.get_input_embeddings().weight.device - embeds, attention_mask, num_tokens = self.process_tokens(tokens, device) + embeds, attention_mask, num_tokens, embeds_info = self.process_tokens(tokens, device) attention_mask_model = None if self.enable_attention_masks: @@ -258,7 +261,7 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder): else: intermediate_output = self.layer_idx - outputs = self.transformer(None, attention_mask_model, embeds=embeds, num_tokens=num_tokens, intermediate_output=intermediate_output, final_layer_norm_intermediate=self.layer_norm_hidden_state, dtype=torch.float32) + outputs = self.transformer(None, attention_mask_model, embeds=embeds, num_tokens=num_tokens, intermediate_output=intermediate_output, final_layer_norm_intermediate=self.layer_norm_hidden_state, dtype=torch.float32, embeds_info=embeds_info) if self.layer == "last": z = outputs[0].float() diff --git a/comfy/text_encoders/bert.py b/comfy/text_encoders/bert.py index 551b03162..ed4638a9a 100644 --- a/comfy/text_encoders/bert.py +++ b/comfy/text_encoders/bert.py @@ -116,7 +116,7 @@ class BertModel_(torch.nn.Module): self.embeddings = BertEmbeddings(config_dict["vocab_size"], config_dict["max_position_embeddings"], config_dict["type_vocab_size"], config_dict["pad_token_id"], embed_dim, layer_norm_eps, dtype, device, operations) self.encoder = BertEncoder(config_dict["num_hidden_layers"], embed_dim, config_dict["intermediate_size"], config_dict["num_attention_heads"], layer_norm_eps, dtype, device, operations) - def forward(self, input_tokens, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None): + def forward(self, input_tokens, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, embeds_info=[]): x = self.embeddings(input_tokens, embeds=embeds, dtype=dtype) mask = None if attention_mask is not None: diff --git a/comfy/text_encoders/llama.py b/comfy/text_encoders/llama.py index 1da6a0c94..9d90d5a61 100644 --- a/comfy/text_encoders/llama.py +++ b/comfy/text_encoders/llama.py @@ -2,12 +2,14 @@ import torch import torch.nn as nn from dataclasses import dataclass from typing import Optional, Any +import math from comfy.ldm.modules.attention import optimized_attention_for_device import comfy.model_management import comfy.ldm.common_dit import comfy.model_management +from . 
import qwen_vl @dataclass class Llama2Config: @@ -100,12 +102,10 @@ def rotate_half(x): return torch.cat((-x2, x1), dim=-1) -def precompute_freqs_cis(head_dim, seq_len, theta, device=None): +def precompute_freqs_cis(head_dim, position_ids, theta, device=None): theta_numerator = torch.arange(0, head_dim, 2, device=device).float() inv_freq = 1.0 / (theta ** (theta_numerator / head_dim)) - position_ids = torch.arange(0, seq_len, device=device).unsqueeze(0) - inv_freq_expanded = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) position_ids_expanded = position_ids[:, None, :].float() freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) @@ -277,7 +277,7 @@ class Llama2_(nn.Module): self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype) # self.lm_head = ops.Linear(config.hidden_size, config.vocab_size, bias=False, device=device, dtype=dtype) - def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None): + def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, position_ids=None, embeds_info=[]): if embeds is not None: x = embeds else: @@ -286,8 +286,11 @@ class Llama2_(nn.Module): if self.normalize_in: x *= self.config.hidden_size ** 0.5 + if position_ids is None: + position_ids = torch.arange(0, x.shape[1], device=x.device).unsqueeze(0) + freqs_cis = precompute_freqs_cis(self.config.head_dim, - x.shape[1], + position_ids, self.config.rope_theta, device=x.device) @@ -372,8 +375,38 @@ class Qwen25_7BVLI(BaseLlama, torch.nn.Module): self.num_layers = config.num_hidden_layers self.model = Llama2_(config, device=device, dtype=dtype, ops=operations) + self.visual = qwen_vl.Qwen2VLVisionTransformer(hidden_size=1280, output_hidden_size=config.hidden_size, device=device, dtype=dtype, ops=operations) self.dtype = dtype + def preprocess_embed(self, embed, device): + if embed["type"] == "image": + image, grid = qwen_vl.process_qwen2vl_images(embed["data"]) + return self.visual(image.to(device, dtype=torch.float32), grid), grid + return None, None + + def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, embeds_info=[]): + grid = None + for e in embeds_info: + if e.get("type") == "image": + grid = e.get("extra", None) + position_ids = torch.zeros((3, embeds.shape[1]), device=embeds.device) + start = e.get("index") + position_ids[:, :start] = torch.arange(0, start, device=embeds.device) + end = e.get("size") + start + len_max = int(grid.max()) // 2 + start_next = len_max + start + position_ids[:, end:] = torch.arange(start_next, start_next + (embeds.shape[1] - end), device=embeds.device) + position_ids[0, start:end] = start + max_d = int(grid[0][1]) // 2 + position_ids[1, start:end] = torch.arange(start, start + max_d, device=embeds.device).unsqueeze(1).repeat(1, math.ceil((end - start) / max_d)).flatten(0)[:end - start] + max_d = int(grid[0][2]) // 2 + position_ids[2, start:end] = torch.arange(start, start + max_d, device=embeds.device).unsqueeze(0).repeat(math.ceil((end - start) / max_d), 1).flatten(0)[:end - start] + + if grid is None: + position_ids = None + + return super().forward(x, attention_mask=attention_mask, embeds=embeds, num_tokens=num_tokens, intermediate_output=intermediate_output, 
final_layer_norm_intermediate=final_layer_norm_intermediate, dtype=dtype, position_ids=position_ids) + class Gemma2_2B(BaseLlama, torch.nn.Module): def __init__(self, config_dict, dtype, device, operations): super().__init__() diff --git a/comfy/text_encoders/qwen_image.py b/comfy/text_encoders/qwen_image.py index ce5c98097..f07318d6c 100644 --- a/comfy/text_encoders/qwen_image.py +++ b/comfy/text_encoders/qwen_image.py @@ -15,13 +15,27 @@ class QwenImageTokenizer(sd1_clip.SD1Tokenizer): def __init__(self, embedding_directory=None, tokenizer_data={}): super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="qwen25_7b", tokenizer=Qwen25_7BVLITokenizer) self.llama_template = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" + self.llama_template_images = "<|im_start|>system\nDescribe the key features of the input image \\(color, shape, size, texture, objects, background\\), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n" - def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None,**kwargs): + def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], **kwargs): if llama_template is None: - llama_text = self.llama_template.format(text) + if len(images) > 0: + llama_text = self.llama_template_images.format(text) + else: + llama_text = self.llama_template.format(text) else: llama_text = llama_template.format(text) - return super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, **kwargs) + tokens = super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, **kwargs) + key_name = next(iter(tokens)) + embed_count = 0 + qwen_tokens = tokens[key_name] + for r in qwen_tokens: + for i in range(len(r)): + if r[i][0] == 151655: + if len(images) > embed_count: + r[i] = ({"type": "image", "data": images[embed_count], "original_type": "image"},) + r[i][1:] + embed_count += 1 + return tokens class Qwen25_7BVLIModel(sd1_clip.SDClipModel): diff --git a/comfy/text_encoders/qwen_vl.py b/comfy/text_encoders/qwen_vl.py new file mode 100644 index 000000000..3b18ce730 --- /dev/null +++ b/comfy/text_encoders/qwen_vl.py @@ -0,0 +1,428 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from typing import Optional, Tuple +import math +from comfy.ldm.modules.attention import optimized_attention_for_device + + +def process_qwen2vl_images( + images: torch.Tensor, + min_pixels: int = 3136, + max_pixels: int = 12845056, + patch_size: int = 14, + temporal_patch_size: int = 2, + merge_size: int = 2, + image_mean: list = None, + image_std: list = None, +): + if image_mean is None: + image_mean = [0.48145466, 0.4578275, 0.40821073] + if image_std is None: + image_std = [0.26862954, 0.26130258, 0.27577711] + + batch_size, height, width, channels = images.shape + device = images.device + # dtype = images.dtype + + images = images.permute(0, 3, 1, 2) + + grid_thw_list = [] + img = images[0] + + factor = patch_size * merge_size + + h_bar = round(height / factor) * factor + w_bar = round(width / factor) * factor + + if h_bar * w_bar > max_pixels: 
+ beta = math.sqrt((height * width) / max_pixels) + h_bar = max(factor, math.floor(height / beta / factor) * factor) + w_bar = max(factor, math.floor(width / beta / factor) * factor) + elif h_bar * w_bar < min_pixels: + beta = math.sqrt(min_pixels / (height * width)) + h_bar = math.ceil(height * beta / factor) * factor + w_bar = math.ceil(width * beta / factor) * factor + + img_resized = F.interpolate( + img.unsqueeze(0), + size=(h_bar, w_bar), + mode='bilinear', + align_corners=False + ).squeeze(0) + + normalized = img_resized.clone() + for c in range(3): + normalized[c] = (img_resized[c] - image_mean[c]) / image_std[c] + + grid_h = h_bar // patch_size + grid_w = w_bar // patch_size + grid_thw = torch.tensor([1, grid_h, grid_w], device=device, dtype=torch.long) + + pixel_values = normalized + grid_thw_list.append(grid_thw) + image_grid_thw = torch.stack(grid_thw_list) + + grid_t = 1 + channel = pixel_values.shape[0] + pixel_values = pixel_values.unsqueeze(0).repeat(2, 1, 1, 1) + + patches = pixel_values.reshape( + grid_t, + temporal_patch_size, + channel, + grid_h // merge_size, + merge_size, + patch_size, + grid_w // merge_size, + merge_size, + patch_size, + ) + + patches = patches.permute(0, 3, 6, 4, 7, 2, 1, 5, 8) + flatten_patches = patches.reshape( + grid_t * grid_h * grid_w, + channel * temporal_patch_size * patch_size * patch_size + ) + + return flatten_patches, image_grid_thw + + +class VisionPatchEmbed(nn.Module): + def __init__( + self, + patch_size: int = 14, + temporal_patch_size: int = 2, + in_channels: int = 3, + embed_dim: int = 3584, + device=None, + dtype=None, + ops=None, + ): + super().__init__() + self.patch_size = patch_size + self.temporal_patch_size = temporal_patch_size + self.in_channels = in_channels + self.embed_dim = embed_dim + + kernel_size = [temporal_patch_size, patch_size, patch_size] + self.proj = ops.Conv3d( + in_channels, + embed_dim, + kernel_size=kernel_size, + stride=kernel_size, + bias=False, + device=device, + dtype=dtype + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = hidden_states.view( + -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size + ) + hidden_states = self.proj(hidden_states) + return hidden_states.view(-1, self.embed_dim) + + +def rotate_half(x): + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb_vision(q, k, cos, sin): + cos, sin = cos.unsqueeze(-2).float(), sin.unsqueeze(-2).float() + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class VisionRotaryEmbedding(nn.Module): + def __init__(self, dim: int, theta: float = 10000.0): + super().__init__() + self.dim = dim + self.theta = theta + + def forward(self, seqlen: int, device) -> torch.Tensor: + inv_freq = 1.0 / (self.theta ** (torch.arange(0, self.dim, 2, dtype=torch.float, device=device) / self.dim)) + seq = torch.arange(seqlen, device=inv_freq.device, dtype=inv_freq.dtype) + freqs = torch.outer(seq, inv_freq) + return freqs + + +class PatchMerger(nn.Module): + def __init__(self, dim: int, context_dim: int, spatial_merge_size: int = 2, device=None, dtype=None, ops=None): + super().__init__() + self.hidden_size = context_dim * (spatial_merge_size ** 2) + self.ln_q = ops.RMSNorm(context_dim, eps=1e-6, device=device, dtype=dtype) + self.mlp = nn.Sequential( + ops.Linear(self.hidden_size, self.hidden_size, device=device, dtype=dtype), + nn.GELU(), + 
ops.Linear(self.hidden_size, dim, device=device, dtype=dtype), + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.ln_q(x).reshape(-1, self.hidden_size) + x = self.mlp(x) + return x + + +class VisionAttention(nn.Module): + def __init__(self, hidden_size: int, num_heads: int, device=None, dtype=None, ops=None): + super().__init__() + self.hidden_size = hidden_size + self.num_heads = num_heads + self.head_dim = hidden_size // num_heads + self.scaling = self.head_dim ** -0.5 + + self.qkv = ops.Linear(hidden_size, hidden_size * 3, bias=True, device=device, dtype=dtype) + self.proj = ops.Linear(hidden_size, hidden_size, bias=True, device=device, dtype=dtype) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + cu_seqlens=None, + optimized_attention=None, + ) -> torch.Tensor: + if hidden_states.dim() == 2: + seq_length, _ = hidden_states.shape + batch_size = 1 + hidden_states = hidden_states.unsqueeze(0) + else: + batch_size, seq_length, _ = hidden_states.shape + + qkv = self.qkv(hidden_states) + qkv = qkv.reshape(batch_size, seq_length, 3, self.num_heads, self.head_dim) + query_states, key_states, value_states = qkv.reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0) + + if position_embeddings is not None: + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb_vision(query_states, key_states, cos, sin) + + query_states = query_states.transpose(0, 1).unsqueeze(0) + key_states = key_states.transpose(0, 1).unsqueeze(0) + value_states = value_states.transpose(0, 1).unsqueeze(0) + + lengths = cu_seqlens[1:] - cu_seqlens[:-1] + splits = [ + torch.split(tensor, lengths.tolist(), dim=2) for tensor in (query_states, key_states, value_states) + ] + + attn_outputs = [ + optimized_attention(q, k, v, self.num_heads, skip_reshape=True) + for q, k, v in zip(*splits) + ] + attn_output = torch.cat(attn_outputs, dim=1) + attn_output = attn_output.reshape(seq_length, -1) + attn_output = self.proj(attn_output) + + return attn_output + + +class VisionMLP(nn.Module): + def __init__(self, hidden_size: int, intermediate_size: int, device=None, dtype=None, ops=None): + super().__init__() + self.gate_proj = ops.Linear(hidden_size, intermediate_size, bias=True, device=device, dtype=dtype) + self.up_proj = ops.Linear(hidden_size, intermediate_size, bias=True, device=device, dtype=dtype) + self.down_proj = ops.Linear(intermediate_size, hidden_size, bias=True, device=device, dtype=dtype) + self.act_fn = nn.SiLU() + + def forward(self, hidden_state): + return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state)) + + +class VisionBlock(nn.Module): + def __init__(self, hidden_size: int, intermediate_size: int, num_heads: int, device=None, dtype=None, ops=None): + super().__init__() + self.norm1 = ops.RMSNorm(hidden_size, eps=1e-6, device=device, dtype=dtype) + self.norm2 = ops.RMSNorm(hidden_size, eps=1e-6, device=device, dtype=dtype) + self.attn = VisionAttention(hidden_size, num_heads, device=device, dtype=dtype, ops=ops) + self.mlp = VisionMLP(hidden_size, intermediate_size, device=device, dtype=dtype, ops=ops) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + cu_seqlens=None, + optimized_attention=None, + ) -> torch.Tensor: + residual = hidden_states + hidden_states = self.norm1(hidden_states) + hidden_states = self.attn(hidden_states, position_embeddings, 
cu_seqlens, optimized_attention) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +class Qwen2VLVisionTransformer(nn.Module): + def __init__( + self, + hidden_size: int = 3584, + output_hidden_size: int = 3584, + intermediate_size: int = 3420, + num_heads: int = 16, + num_layers: int = 32, + patch_size: int = 14, + temporal_patch_size: int = 2, + spatial_merge_size: int = 2, + window_size: int = 112, + device=None, + dtype=None, + ops=None + ): + super().__init__() + self.hidden_size = hidden_size + self.patch_size = patch_size + self.spatial_merge_size = spatial_merge_size + self.window_size = window_size + self.fullatt_block_indexes = [7, 15, 23, 31] + + self.patch_embed = VisionPatchEmbed( + patch_size=patch_size, + temporal_patch_size=temporal_patch_size, + in_channels=3, + embed_dim=hidden_size, + device=device, + dtype=dtype, + ops=ops, + ) + + head_dim = hidden_size // num_heads + self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2) + + self.blocks = nn.ModuleList([ + VisionBlock(hidden_size, intermediate_size, num_heads, device, dtype, ops) + for _ in range(num_layers) + ]) + + self.merger = PatchMerger( + dim=output_hidden_size, + context_dim=hidden_size, + spatial_merge_size=spatial_merge_size, + device=device, + dtype=dtype, + ops=ops, + ) + + def get_window_index(self, grid_thw): + window_index = [] + cu_window_seqlens = [0] + window_index_id = 0 + vit_merger_window_size = self.window_size // self.spatial_merge_size // self.patch_size + + for grid_t, grid_h, grid_w in grid_thw: + llm_grid_h = grid_h // self.spatial_merge_size + llm_grid_w = grid_w // self.spatial_merge_size + + index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(grid_t, llm_grid_h, llm_grid_w) + + pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size + pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size + num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size + num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size + + index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100) + index_padded = index_padded.reshape( + grid_t, + num_windows_h, + vit_merger_window_size, + num_windows_w, + vit_merger_window_size, + ) + index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape( + grid_t, + num_windows_h * num_windows_w, + vit_merger_window_size, + vit_merger_window_size, + ) + + seqlens = (index_padded != -100).sum([2, 3]).reshape(-1) + index_padded = index_padded.reshape(-1) + index_new = index_padded[index_padded != -100] + window_index.append(index_new + window_index_id) + + cu_seqlens_tmp = seqlens.cumsum(0) * self.spatial_merge_size * self.spatial_merge_size + cu_window_seqlens[-1] + cu_window_seqlens.extend(cu_seqlens_tmp.tolist()) + window_index_id += (grid_t * llm_grid_h * llm_grid_w).item() + + window_index = torch.cat(window_index, dim=0) + return window_index, cu_window_seqlens + + def get_position_embeddings(self, grid_thw, device): + pos_ids = [] + + for t, h, w in grid_thw: + hpos_ids = torch.arange(h, device=device).unsqueeze(1).expand(-1, w) + hpos_ids = hpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ) + hpos_ids = hpos_ids.permute(0, 2, 1, 3).flatten() + + wpos_ids = torch.arange(w, device=device).unsqueeze(0).expand(h, -1) + wpos_ids = wpos_ids.reshape( + h 
// self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ) + wpos_ids = wpos_ids.permute(0, 2, 1, 3).flatten() + + pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) + + pos_ids = torch.cat(pos_ids, dim=0) + max_grid_size = grid_thw[:, 1:].max() + rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size, device) + return rotary_pos_emb_full[pos_ids].flatten(1) + + def forward( + self, + pixel_values: torch.Tensor, + image_grid_thw: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + optimized_attention = optimized_attention_for_device(pixel_values.device, mask=False, small_input=True) + + hidden_states = self.patch_embed(pixel_values) + + window_index, cu_window_seqlens = self.get_window_index(image_grid_thw) + cu_window_seqlens = torch.tensor(cu_window_seqlens, device=hidden_states.device) + cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) + + position_embeddings = self.get_position_embeddings(image_grid_thw, hidden_states.device) + + seq_len, _ = hidden_states.size() + spatial_merge_unit = self.spatial_merge_size * self.spatial_merge_size + + hidden_states = hidden_states.reshape(seq_len // spatial_merge_unit, spatial_merge_unit, -1) + hidden_states = hidden_states[window_index, :, :] + hidden_states = hidden_states.reshape(seq_len, -1) + + position_embeddings = position_embeddings.reshape(seq_len // spatial_merge_unit, spatial_merge_unit, -1) + position_embeddings = position_embeddings[window_index, :, :] + position_embeddings = position_embeddings.reshape(seq_len, -1) + position_embeddings = torch.cat((position_embeddings, position_embeddings), dim=-1) + position_embeddings = (position_embeddings.cos(), position_embeddings.sin()) + + cu_seqlens = torch.repeat_interleave(image_grid_thw[:, 1] * image_grid_thw[:, 2], image_grid_thw[:, 0]).cumsum( + dim=0, + dtype=torch.int32, + ) + cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0) + + for i, block in enumerate(self.blocks): + if i in self.fullatt_block_indexes: + cu_seqlens_now = cu_seqlens + else: + cu_seqlens_now = cu_window_seqlens + hidden_states = block(hidden_states, position_embeddings, cu_seqlens_now, optimized_attention=optimized_attention) + + hidden_states = self.merger(hidden_states) + return hidden_states diff --git a/comfy/text_encoders/t5.py b/comfy/text_encoders/t5.py index 36bf35309..e8588992a 100644 --- a/comfy/text_encoders/t5.py +++ b/comfy/text_encoders/t5.py @@ -199,7 +199,7 @@ class T5Stack(torch.nn.Module): self.final_layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device, operations=operations) # self.dropout = nn.Dropout(config.dropout_rate) - def forward(self, x, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None): + def forward(self, x, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, embeds_info=[]): mask = None if attention_mask is not None: mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]) diff --git a/comfy_extras/nodes_qwen.py b/comfy_extras/nodes_qwen.py new file mode 100644 index 000000000..b5088fae2 --- /dev/null +++ b/comfy_extras/nodes_qwen.py @@ -0,0 +1,63 @@ +import node_helpers +import comfy.utils + +PREFERRED_QWENIMAGE_RESOLUTIONS = [ + (672, 1568), + (688, 1504), + (720, 1456), + (752, 1392), + (800, 1328), + (832, 1248), + (880, 1184), + (944, 1104), + (1024, 1024), 
+ (1104, 944), + (1184, 880), + (1248, 832), + (1328, 800), + (1392, 752), + (1456, 720), + (1504, 688), + (1568, 672), +] + + +class TextEncodeQwenImageEdit: + @classmethod + def INPUT_TYPES(s): + return {"required": { + "clip": ("CLIP", ), + "prompt": ("STRING", {"multiline": True, "dynamicPrompts": True}), + }, + "optional": {"vae": ("VAE", ), + "image": ("IMAGE", ),}} + + RETURN_TYPES = ("CONDITIONING",) + FUNCTION = "encode" + + CATEGORY = "advanced/conditioning" + + def encode(self, clip, prompt, vae=None, image=None): + ref_latent = None + if image is None: + images = [] + else: + images = [image] + if vae is not None: + width = image.shape[2] + height = image.shape[1] + aspect_ratio = width / height + _, width, height = min((abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_QWENIMAGE_RESOLUTIONS) + image = comfy.utils.common_upscale(image.movedim(-1, 1), width, height, "lanczos", "center").movedim(1, -1) + ref_latent = vae.encode(image[:, :, :, :3]) + + tokens = clip.tokenize(prompt, images=images) + conditioning = clip.encode_from_tokens_scheduled(tokens) + if ref_latent is not None: + conditioning = node_helpers.conditioning_set_values(conditioning, {"reference_latents": [ref_latent]}, append=True) + return (conditioning, ) + + +NODE_CLASS_MAPPINGS = { + "TextEncodeQwenImageEdit": TextEncodeQwenImageEdit, +} diff --git a/nodes.py b/nodes.py index 860a236aa..b3fa9c51a 100644 --- a/nodes.py +++ b/nodes.py @@ -2321,6 +2321,7 @@ async def init_builtin_extra_nodes(): "nodes_edit_model.py", "nodes_tcfg.py", "nodes_context_windows.py", + "nodes_qwen.py", ] import_failed = [] From 36b5127fd3eee8eaf95ff7296a61269ed56d53c0 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 19 Aug 2025 23:28:07 +0300 Subject: [PATCH 154/243] api_nodes: add kling-v2-1 and v2-1-master (#9257) --- comfy_api_nodes/apis/__init__.py | 3 +++ comfy_api_nodes/nodes_kling.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/comfy_api_nodes/apis/__init__.py b/comfy_api_nodes/apis/__init__.py index 54298e8a9..c6f91e9d6 100644 --- a/comfy_api_nodes/apis/__init__.py +++ b/comfy_api_nodes/apis/__init__.py @@ -1315,6 +1315,7 @@ class KlingTaskStatus(str, Enum): class KlingTextToVideoModelName(str, Enum): kling_v1 = 'kling-v1' kling_v1_6 = 'kling-v1-6' + kling_v2_1_master = 'kling-v2-1-master' class KlingVideoGenAspectRatio(str, Enum): @@ -1347,6 +1348,8 @@ class KlingVideoGenModelName(str, Enum): kling_v1_5 = 'kling-v1-5' kling_v1_6 = 'kling-v1-6' kling_v2_master = 'kling-v2-master' + kling_v2_1 = 'kling-v2-1' + kling_v2_1_master = 'kling-v2-1-master' class KlingVideoResult(BaseModel): diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index 9d483bb0e..9fa390985 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -421,6 +421,8 @@ class KlingTextToVideoNode(KlingNodeBase): "pro mode / 10s duration / kling-v2-master": ("pro", "10", "kling-v2-master"), "standard mode / 5s duration / kling-v2-master": ("std", "5", "kling-v2-master"), "standard mode / 10s duration / kling-v2-master": ("std", "10", "kling-v2-master"), + "pro mode / 5s duration / kling-v2-1-master": ("pro", "5", "kling-v2-1-master"), + "pro mode / 10s duration / kling-v2-1-master": ("pro", "10", "kling-v2-1-master"), } @classmethod From f16a70ba670e11de549af188663a87c77c5bc0c2 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 19 Aug 2025 23:28:27 +0300 Subject: [PATCH 155/243] api_nodes: add 
MinimaxHailuoVideoNode node (#9262) --- comfy_api_nodes/apis/__init__.py | 13 ++- comfy_api_nodes/nodes_minimax.py | 185 ++++++++++++++++++++++++++++++- 2 files changed, 191 insertions(+), 7 deletions(-) diff --git a/comfy_api_nodes/apis/__init__.py b/comfy_api_nodes/apis/__init__.py index c6f91e9d6..7a09df55b 100644 --- a/comfy_api_nodes/apis/__init__.py +++ b/comfy_api_nodes/apis/__init__.py @@ -1623,13 +1623,14 @@ class MinimaxTaskResultResponse(BaseModel): task_id: str = Field(..., description='The task ID being queried.') -class Model(str, Enum): +class MiniMaxModel(str, Enum): T2V_01_Director = 'T2V-01-Director' I2V_01_Director = 'I2V-01-Director' S2V_01 = 'S2V-01' I2V_01 = 'I2V-01' I2V_01_live = 'I2V-01-live' T2V_01 = 'T2V-01' + Hailuo_02 = 'MiniMax-Hailuo-02' class SubjectReferenceItem(BaseModel): @@ -1651,7 +1652,7 @@ class MinimaxVideoGenerationRequest(BaseModel): None, description='URL or base64 encoding of the first frame image. Required when model is I2V-01, I2V-01-Director, or I2V-01-live.', ) - model: Model = Field( + model: MiniMaxModel = Field( ..., description='Required. ID of model. Options: T2V-01-Director, I2V-01-Director, S2V-01, I2V-01, I2V-01-live, T2V-01', ) @@ -1668,6 +1669,14 @@ class MinimaxVideoGenerationRequest(BaseModel): None, description='Only available when model is S2V-01. The model will generate a video based on the subject uploaded through this parameter.', ) + duration: Optional[int] = Field( + None, + description="The length of the output video in seconds." + ) + resolution: Optional[str] = Field( + None, + description="The dimensions of the video display. 1080p corresponds to 1920 x 1080 pixels, 768p corresponds to 1366 x 768 pixels." + ) class MinimaxVideoGenerationResponse(BaseModel): diff --git a/comfy_api_nodes/nodes_minimax.py b/comfy_api_nodes/nodes_minimax.py index 58d2ed90c..bb3c9e710 100644 --- a/comfy_api_nodes/nodes_minimax.py +++ b/comfy_api_nodes/nodes_minimax.py @@ -1,3 +1,4 @@ +from inspect import cleandoc from typing import Union import logging import torch @@ -10,7 +11,7 @@ from comfy_api_nodes.apis import ( MinimaxFileRetrieveResponse, MinimaxTaskResultResponse, SubjectReferenceItem, - Model + MiniMaxModel ) from comfy_api_nodes.apis.client import ( ApiEndpoint, @@ -84,7 +85,6 @@ class MinimaxTextToVideoNode: FUNCTION = "generate_video" CATEGORY = "api node/video/MiniMax" API_NODE = True - OUTPUT_NODE = True async def generate_video( self, @@ -121,7 +121,7 @@ class MinimaxTextToVideoNode: response_model=MinimaxVideoGenerationResponse, ), request=MinimaxVideoGenerationRequest( - model=Model(model), + model=MiniMaxModel(model), prompt=prompt_text, callback_url=None, first_frame_image=image_url, @@ -251,7 +251,6 @@ class MinimaxImageToVideoNode(MinimaxTextToVideoNode): FUNCTION = "generate_video" CATEGORY = "api node/video/MiniMax" API_NODE = True - OUTPUT_NODE = True class MinimaxSubjectToVideoNode(MinimaxTextToVideoNode): @@ -313,7 +312,181 @@ class MinimaxSubjectToVideoNode(MinimaxTextToVideoNode): FUNCTION = "generate_video" CATEGORY = "api node/video/MiniMax" API_NODE = True - OUTPUT_NODE = True + + +class MinimaxHailuoVideoNode: + """Generates videos from prompt, with optional start frame using the new MiniMax Hailuo-02 model.""" + + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "prompt_text": ( + "STRING", + { + "multiline": True, + "default": "", + "tooltip": "Text prompt to guide the video generation.", + }, + ), + }, + "optional": { + "seed": ( + IO.INT, + { + "default": 0, + "min": 0, + "max": 
0xFFFFFFFFFFFFFFFF, + "control_after_generate": True, + "tooltip": "The random seed used for creating the noise.", + }, + ), + "first_frame_image": ( + IO.IMAGE, + { + "tooltip": "Optional image to use as the first frame to generate a video." + }, + ), + "prompt_optimizer": ( + IO.BOOLEAN, + { + "tooltip": "Optimize prompt to improve generation quality when needed.", + "default": True, + }, + ), + "duration": ( + IO.COMBO, + { + "tooltip": "The length of the output video in seconds.", + "default": 6, + "options": [6, 10], + }, + ), + "resolution": ( + IO.COMBO, + { + "tooltip": "The dimensions of the video display. " + "1080p corresponds to 1920 x 1080 pixels, 768p corresponds to 1366 x 768 pixels.", + "default": "768P", + "options": ["768P", "1080P"], + }, + ), + }, + "hidden": { + "auth_token": "AUTH_TOKEN_COMFY_ORG", + "comfy_api_key": "API_KEY_COMFY_ORG", + "unique_id": "UNIQUE_ID", + }, + } + + RETURN_TYPES = ("VIDEO",) + DESCRIPTION = cleandoc(__doc__ or "") + FUNCTION = "generate_video" + CATEGORY = "api node/video/MiniMax" + API_NODE = True + + async def generate_video( + self, + prompt_text, + seed=0, + first_frame_image: torch.Tensor=None, # used for ImageToVideo + prompt_optimizer=True, + duration=6, + resolution="768P", + model="MiniMax-Hailuo-02", + unique_id: Union[str, None]=None, + **kwargs, + ): + if first_frame_image is None: + validate_string(prompt_text, field_name="prompt_text") + + if model == "MiniMax-Hailuo-02" and resolution.upper() == "1080P" and duration != 6: + raise Exception( + "When model is MiniMax-Hailuo-02 and resolution is 1080P, duration is limited to 6 seconds." + ) + + # upload image, if passed in + image_url = None + if first_frame_image is not None: + image_url = (await upload_images_to_comfyapi(first_frame_image, max_images=1, auth_kwargs=kwargs))[0] + + video_generate_operation = SynchronousOperation( + endpoint=ApiEndpoint( + path="/proxy/minimax/video_generation", + method=HttpMethod.POST, + request_model=MinimaxVideoGenerationRequest, + response_model=MinimaxVideoGenerationResponse, + ), + request=MinimaxVideoGenerationRequest( + model=MiniMaxModel(model), + prompt=prompt_text, + callback_url=None, + first_frame_image=image_url, + prompt_optimizer=prompt_optimizer, + duration=duration, + resolution=resolution, + ), + auth_kwargs=kwargs, + ) + response = await video_generate_operation.execute() + + task_id = response.task_id + if not task_id: + raise Exception(f"MiniMax generation failed: {response.base_resp}") + + average_duration = 120 if resolution == "768P" else 240 + video_generate_operation = PollingOperation( + poll_endpoint=ApiEndpoint( + path="/proxy/minimax/query/video_generation", + method=HttpMethod.GET, + request_model=EmptyRequest, + response_model=MinimaxTaskResultResponse, + query_params={"task_id": task_id}, + ), + completed_statuses=["Success"], + failed_statuses=["Fail"], + status_extractor=lambda x: x.status.value, + estimated_duration=average_duration, + node_id=unique_id, + auth_kwargs=kwargs, + ) + task_result = await video_generate_operation.execute() + + file_id = task_result.file_id + if file_id is None: + raise Exception("Request was not successful. 
Missing file ID.") + file_retrieve_operation = SynchronousOperation( + endpoint=ApiEndpoint( + path="/proxy/minimax/files/retrieve", + method=HttpMethod.GET, + request_model=EmptyRequest, + response_model=MinimaxFileRetrieveResponse, + query_params={"file_id": int(file_id)}, + ), + request=EmptyRequest(), + auth_kwargs=kwargs, + ) + file_result = await file_retrieve_operation.execute() + + file_url = file_result.file.download_url + if file_url is None: + raise Exception( + f"No video was found in the response. Full response: {file_result.model_dump()}" + ) + logging.info(f"Generated video URL: {file_url}") + if unique_id: + if hasattr(file_result.file, "backup_download_url"): + message = f"Result URL: {file_url}\nBackup URL: {file_result.file.backup_download_url}" + else: + message = f"Result URL: {file_url}" + PromptServer.instance.send_progress_text(message, unique_id) + + video_io = await download_url_to_bytesio(file_url) + if video_io is None: + error_msg = f"Failed to download video from {file_url}" + logging.error(error_msg) + raise Exception(error_msg) + return (VideoFromFile(video_io),) # A dictionary that contains all nodes you want to export with their names @@ -322,6 +495,7 @@ NODE_CLASS_MAPPINGS = { "MinimaxTextToVideoNode": MinimaxTextToVideoNode, "MinimaxImageToVideoNode": MinimaxImageToVideoNode, # "MinimaxSubjectToVideoNode": MinimaxSubjectToVideoNode, + "MinimaxHailuoVideoNode": MinimaxHailuoVideoNode, } # A dictionary that contains the friendly/humanly readable titles for the nodes @@ -329,4 +503,5 @@ NODE_DISPLAY_NAME_MAPPINGS = { "MinimaxTextToVideoNode": "MiniMax Text to Video", "MinimaxImageToVideoNode": "MiniMax Image to Video", "MinimaxSubjectToVideoNode": "MiniMax Subject to Video", + "MinimaxHailuoVideoNode": "MiniMax Hailuo Video", } From 07a927517cfaf099fec3903e8973f758e62d65f9 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 19 Aug 2025 23:29:01 +0300 Subject: [PATCH 156/243] api_nodes: add GPT-5 series models (#9325) --- comfy_api_nodes/nodes_openai.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/comfy_api_nodes/nodes_openai.py b/comfy_api_nodes/nodes_openai.py index cbff2b2d2..674c9ede0 100644 --- a/comfy_api_nodes/nodes_openai.py +++ b/comfy_api_nodes/nodes_openai.py @@ -80,6 +80,9 @@ class SupportedOpenAIModel(str, Enum): gpt_4_1 = "gpt-4.1" gpt_4_1_mini = "gpt-4.1-mini" gpt_4_1_nano = "gpt-4.1-nano" + gpt_5 = "gpt-5" + gpt_5_mini = "gpt-5-mini" + gpt_5_nano = "gpt-5-nano" class OpenAIDalle2(ComfyNodeABC): From d844d8b13bfd6a83b0a7d0491aa2978ac44a6158 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 19 Aug 2025 23:29:24 +0300 Subject: [PATCH 157/243] api_nodes: added release version of google's models (#9304) --- comfy_api_nodes/nodes_gemini.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index 3751fb2a1..ba4167a50 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -46,6 +46,8 @@ class GeminiModel(str, Enum): gemini_2_5_pro_preview_05_06 = "gemini-2.5-pro-preview-05-06" gemini_2_5_flash_preview_04_17 = "gemini-2.5-flash-preview-04-17" + gemini_2_5_pro = "gemini-2.5-pro" + gemini_2_5_flash = "gemini-2.5-flash" def get_gemini_endpoint( @@ -97,7 +99,7 @@ class GeminiNode(ComfyNodeABC): { "tooltip": "The Gemini model to use for generating responses.", "options": [model.value for model in GeminiModel], - "default": 
GeminiModel.gemini_2_5_pro_preview_05_06.value, + "default": GeminiModel.gemini_2_5_pro.value, }, ), "seed": ( From 54d8fdbed0a7b171ab8cfb02e29a7e0dc5fe78fd Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 19 Aug 2025 23:30:06 +0300 Subject: [PATCH 158/243] feat(api-nodes): add Vidu Video nodes (#9368) --- comfy_api_nodes/nodes_vidu.py | 622 +++++++++++++++++++++++ comfy_api_nodes/util/validation_utils.py | 53 ++ nodes.py | 1 + 3 files changed, 676 insertions(+) create mode 100644 comfy_api_nodes/nodes_vidu.py diff --git a/comfy_api_nodes/nodes_vidu.py b/comfy_api_nodes/nodes_vidu.py new file mode 100644 index 000000000..2f441948c --- /dev/null +++ b/comfy_api_nodes/nodes_vidu.py @@ -0,0 +1,622 @@ +import logging +from enum import Enum +from typing import Any, Callable, Optional, Literal, TypeVar +from typing_extensions import override + +import torch +from pydantic import BaseModel, Field + +from comfy_api.latest import ComfyExtension, io as comfy_io +from comfy_api_nodes.util.validation_utils import ( + validate_aspect_ratio_closeness, + validate_image_dimensions, + validate_image_aspect_ratio_range, + get_number_of_images, +) +from comfy_api_nodes.apis.client import ( + ApiEndpoint, + HttpMethod, + SynchronousOperation, + PollingOperation, + EmptyRequest, +) +from comfy_api_nodes.apinode_utils import download_url_to_video_output, upload_images_to_comfyapi + + +VIDU_TEXT_TO_VIDEO = "/proxy/vidu/text2video" +VIDU_IMAGE_TO_VIDEO = "/proxy/vidu/img2video" +VIDU_REFERENCE_VIDEO = "/proxy/vidu/reference2video" +VIDU_START_END_VIDEO = "/proxy/vidu/start-end2video" +VIDU_GET_GENERATION_STATUS = "/proxy/vidu/tasks/%s/creations" + +R = TypeVar("R") + +class VideoModelName(str, Enum): + vidu_q1 = 'viduq1' + + +class AspectRatio(str, Enum): + r_16_9 = "16:9" + r_9_16 = "9:16" + r_1_1 = "1:1" + + +class Resolution(str, Enum): + r_1080p = "1080p" + + +class MovementAmplitude(str, Enum): + auto = "auto" + small = "small" + medium = "medium" + large = "large" + + +class TaskCreationRequest(BaseModel): + model: VideoModelName = VideoModelName.vidu_q1 + prompt: Optional[str] = Field(None, max_length=1500) + duration: Optional[Literal[5]] = 5 + seed: Optional[int] = Field(0, ge=0, le=2147483647) + aspect_ratio: Optional[AspectRatio] = AspectRatio.r_16_9 + resolution: Optional[Resolution] = Resolution.r_1080p + movement_amplitude: Optional[MovementAmplitude] = MovementAmplitude.auto + images: Optional[list[str]] = Field(None, description="Base64 encoded string or image URL") + + +class TaskStatus(str, Enum): + created = "created" + queueing = "queueing" + processing = "processing" + success = "success" + failed = "failed" + + +class TaskCreationResponse(BaseModel): + task_id: str = Field(...) + state: TaskStatus = Field(...) + created_at: str = Field(...) + code: Optional[int] = Field(None, description="Error code") + + +class TaskResult(BaseModel): + id: str = Field(..., description="Creation id") + url: str = Field(..., description="The URL of the generated results, valid for one hour") + cover_url: str = Field(..., description="The cover URL of the generated results, valid for one hour") + + +class TaskStatusResponse(BaseModel): + state: TaskStatus = Field(...) 
+ err_code: Optional[str] = Field(None) + creations: list[TaskResult] = Field(..., description="Generated results") + + +async def poll_until_finished( + auth_kwargs: dict[str, str], + api_endpoint: ApiEndpoint[Any, R], + result_url_extractor: Optional[Callable[[R], str]] = None, + estimated_duration: Optional[int] = None, + node_id: Optional[str] = None, +) -> R: + return await PollingOperation( + poll_endpoint=api_endpoint, + completed_statuses=[TaskStatus.success.value], + failed_statuses=[TaskStatus.failed.value], + status_extractor=lambda response: response.state.value, + auth_kwargs=auth_kwargs, + result_url_extractor=result_url_extractor, + estimated_duration=estimated_duration, + node_id=node_id, + poll_interval=16.0, + max_poll_attempts=256, + ).execute() + + +def get_video_url_from_response(response) -> Optional[str]: + if response.creations: + return response.creations[0].url + return None + + +def get_video_from_response(response) -> TaskResult: + if not response.creations: + error_msg = f"Vidu request does not contain results. State: {response.state}, Error Code: {response.err_code}" + logging.info(error_msg) + raise RuntimeError(error_msg) + logging.info("Vidu task %s succeeded. Video URL: %s", response.creations[0].id, response.creations[0].url) + return response.creations[0] + + +async def execute_task( + vidu_endpoint: str, + auth_kwargs: Optional[dict[str, str]], + payload: TaskCreationRequest, + estimated_duration: int, + node_id: str, +) -> R: + response = await SynchronousOperation( + endpoint=ApiEndpoint( + path=vidu_endpoint, + method=HttpMethod.POST, + request_model=TaskCreationRequest, + response_model=TaskCreationResponse, + ), + request=payload, + auth_kwargs=auth_kwargs, + ).execute() + if response.state == TaskStatus.failed: + error_msg = f"Vidu request failed. 
Code: {response.code}" + logging.error(error_msg) + raise RuntimeError(error_msg) + return await poll_until_finished( + auth_kwargs, + ApiEndpoint( + path=VIDU_GET_GENERATION_STATUS % response.task_id, + method=HttpMethod.GET, + request_model=EmptyRequest, + response_model=TaskStatusResponse, + ), + result_url_extractor=get_video_url_from_response, + estimated_duration=estimated_duration, + node_id=node_id, + ) + + +class ViduTextToVideoNode(comfy_io.ComfyNode): + + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="ViduTextToVideoNode", + display_name="Vidu Text To Video Generation", + category="api node/video/Vidu", + description="Generate video from text prompt", + inputs=[ + comfy_io.Combo.Input( + "model", + options=[model.value for model in VideoModelName], + default=VideoModelName.vidu_q1.value, + tooltip="Model name", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + tooltip="A textual description for video generation", + ), + comfy_io.Int.Input( + "duration", + default=5, + min=5, + max=5, + step=1, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Duration of the output video in seconds", + optional=True, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed for video generation (0 for random)", + optional=True, + ), + comfy_io.Combo.Input( + "aspect_ratio", + options=[model.value for model in AspectRatio], + default=AspectRatio.r_16_9.value, + tooltip="The aspect ratio of the output video", + optional=True, + ), + comfy_io.Combo.Input( + "resolution", + options=[model.value for model in Resolution], + default=Resolution.r_1080p.value, + tooltip="Supported values may vary by model & duration", + optional=True, + ), + comfy_io.Combo.Input( + "movement_amplitude", + options=[model.value for model in MovementAmplitude], + default=MovementAmplitude.auto.value, + tooltip="The movement amplitude of objects in the frame", + optional=True, + ), + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + prompt: str, + duration: int, + seed: int, + aspect_ratio: str, + resolution: str, + movement_amplitude: str, + ) -> comfy_io.NodeOutput: + if not prompt: + raise ValueError("The prompt field is required and cannot be empty.") + payload = TaskCreationRequest( + model_name=model, + prompt=prompt, + duration=duration, + seed=seed, + aspect_ratio=aspect_ratio, + resolution=resolution, + movement_amplitude=movement_amplitude, + ) + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + results = await execute_task(VIDU_TEXT_TO_VIDEO, auth, payload, 320, cls.hidden.unique_id) + return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) + + +class ViduImageToVideoNode(comfy_io.ComfyNode): + + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="ViduImageToVideoNode", + display_name="Vidu Image To Video Generation", + category="api node/video/Vidu", + description="Generate video from image and optional prompt", + inputs=[ + comfy_io.Combo.Input( + "model", + options=[model.value for model in VideoModelName], + default=VideoModelName.vidu_q1.value, + tooltip="Model name", + ), + comfy_io.Image.Input( + "image", + 
tooltip="An image to be used as the start frame of the generated video", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="A textual description for video generation", + optional=True, + ), + comfy_io.Int.Input( + "duration", + default=5, + min=5, + max=5, + step=1, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Duration of the output video in seconds", + optional=True, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed for video generation (0 for random)", + optional=True, + ), + comfy_io.Combo.Input( + "resolution", + options=[model.value for model in Resolution], + default=Resolution.r_1080p.value, + tooltip="Supported values may vary by model & duration", + optional=True, + ), + comfy_io.Combo.Input( + "movement_amplitude", + options=[model.value for model in MovementAmplitude], + default=MovementAmplitude.auto.value, + tooltip="The movement amplitude of objects in the frame", + optional=True, + ), + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + image: torch.Tensor, + prompt: str, + duration: int, + seed: int, + resolution: str, + movement_amplitude: str, + ) -> comfy_io.NodeOutput: + if get_number_of_images(image) > 1: + raise ValueError("Only one input image is allowed.") + validate_image_aspect_ratio_range(image, (1, 4), (4, 1)) + payload = TaskCreationRequest( + model_name=model, + prompt=prompt, + duration=duration, + seed=seed, + resolution=resolution, + movement_amplitude=movement_amplitude, + ) + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + payload.images = await upload_images_to_comfyapi( + image, + max_images=1, + mime_type="image/png", + auth_kwargs=auth, + ) + results = await execute_task(VIDU_IMAGE_TO_VIDEO, auth, payload, 120, cls.hidden.unique_id) + return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) + + +class ViduReferenceVideoNode(comfy_io.ComfyNode): + + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="ViduReferenceVideoNode", + display_name="Vidu Reference To Video Generation", + category="api node/video/Vidu", + description="Generate video from multiple images and prompt", + inputs=[ + comfy_io.Combo.Input( + "model", + options=[model.value for model in VideoModelName], + default=VideoModelName.vidu_q1.value, + tooltip="Model name", + ), + comfy_io.Image.Input( + "images", + tooltip="Images to use as references to generate a video with consistent subjects (max 7 images).", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + tooltip="A textual description for video generation", + ), + comfy_io.Int.Input( + "duration", + default=5, + min=5, + max=5, + step=1, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Duration of the output video in seconds", + optional=True, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed for video generation (0 for random)", + optional=True, + ), + comfy_io.Combo.Input( + "aspect_ratio", + options=[model.value for model in AspectRatio], + default=AspectRatio.r_16_9.value, + 
tooltip="The aspect ratio of the output video", + optional=True, + ), + comfy_io.Combo.Input( + "resolution", + options=[model.value for model in Resolution], + default=Resolution.r_1080p.value, + tooltip="Supported values may vary by model & duration", + optional=True, + ), + comfy_io.Combo.Input( + "movement_amplitude", + options=[model.value for model in MovementAmplitude], + default=MovementAmplitude.auto.value, + tooltip="The movement amplitude of objects in the frame", + optional=True, + ), + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + images: torch.Tensor, + prompt: str, + duration: int, + seed: int, + aspect_ratio: str, + resolution: str, + movement_amplitude: str, + ) -> comfy_io.NodeOutput: + if not prompt: + raise ValueError("The prompt field is required and cannot be empty.") + a = get_number_of_images(images) + if a > 7: + raise ValueError("Too many images, maximum allowed is 7.") + for image in images: + validate_image_aspect_ratio_range(image, (1, 4), (4, 1)) + validate_image_dimensions(image, min_width=128, min_height=128) + payload = TaskCreationRequest( + model_name=model, + prompt=prompt, + duration=duration, + seed=seed, + aspect_ratio=aspect_ratio, + resolution=resolution, + movement_amplitude=movement_amplitude, + ) + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + payload.images = await upload_images_to_comfyapi( + images, + max_images=7, + mime_type="image/png", + auth_kwargs=auth, + ) + results = await execute_task(VIDU_REFERENCE_VIDEO, auth, payload, 120, cls.hidden.unique_id) + return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) + + +class ViduStartEndToVideoNode(comfy_io.ComfyNode): + + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="ViduStartEndToVideoNode", + display_name="Vidu Start End To Video Generation", + category="api node/video/Vidu", + description="Generate a video from start and end frames and a prompt", + inputs=[ + comfy_io.Combo.Input( + "model", + options=[model.value for model in VideoModelName], + default=VideoModelName.vidu_q1.value, + tooltip="Model name", + ), + comfy_io.Image.Input( + "first_frame", + tooltip="Start frame", + ), + comfy_io.Image.Input( + "end_frame", + tooltip="End frame", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + tooltip="A textual description for video generation", + optional=True, + ), + comfy_io.Int.Input( + "duration", + default=5, + min=5, + max=5, + step=1, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Duration of the output video in seconds", + optional=True, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed for video generation (0 for random)", + optional=True, + ), + comfy_io.Combo.Input( + "resolution", + options=[model.value for model in Resolution], + default=Resolution.r_1080p.value, + tooltip="Supported values may vary by model & duration", + optional=True, + ), + comfy_io.Combo.Input( + "movement_amplitude", + options=[model.value for model in MovementAmplitude], + default=MovementAmplitude.auto.value, + tooltip="The movement amplitude of objects in the frame", + optional=True, + ), + ], + outputs=[ + 
comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + first_frame: torch.Tensor, + end_frame: torch.Tensor, + prompt: str, + duration: int, + seed: int, + resolution: str, + movement_amplitude: str, + ) -> comfy_io.NodeOutput: + validate_aspect_ratio_closeness(first_frame, end_frame, min_rel=0.8, max_rel=1.25, strict=False) + payload = TaskCreationRequest( + model_name=model, + prompt=prompt, + duration=duration, + seed=seed, + resolution=resolution, + movement_amplitude=movement_amplitude, + ) + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + payload.images = [ + (await upload_images_to_comfyapi(frame, max_images=1, mime_type="image/png", auth_kwargs=auth))[0] + for frame in (first_frame, end_frame) + ] + results = await execute_task(VIDU_START_END_VIDEO, auth, payload, 96, cls.hidden.unique_id) + return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) + + +class ViduExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + return [ + ViduTextToVideoNode, + ViduImageToVideoNode, + ViduReferenceVideoNode, + ViduStartEndToVideoNode, + ] + +async def comfy_entrypoint() -> ViduExtension: + return ViduExtension() diff --git a/comfy_api_nodes/util/validation_utils.py b/comfy_api_nodes/util/validation_utils.py index 031b9fbd3..606b794bf 100644 --- a/comfy_api_nodes/util/validation_utils.py +++ b/comfy_api_nodes/util/validation_utils.py @@ -53,6 +53,53 @@ def validate_image_aspect_ratio( ) +def validate_image_aspect_ratio_range( + image: torch.Tensor, + min_ratio: tuple[float, float], # e.g. (1, 4) + max_ratio: tuple[float, float], # e.g. 
(4, 1) + *, + strict: bool = True, # True -> (min, max); False -> [min, max] +) -> float: + a1, b1 = min_ratio + a2, b2 = max_ratio + if a1 <= 0 or b1 <= 0 or a2 <= 0 or b2 <= 0: + raise ValueError("Ratios must be positive, like (1, 4) or (4, 1).") + lo, hi = (a1 / b1), (a2 / b2) + if lo > hi: + lo, hi = hi, lo + a1, b1, a2, b2 = a2, b2, a1, b1 # swap only for error text + w, h = get_image_dimensions(image) + if w <= 0 or h <= 0: + raise ValueError(f"Invalid image dimensions: {w}x{h}") + ar = w / h + ok = (lo < ar < hi) if strict else (lo <= ar <= hi) + if not ok: + op = "<" if strict else "≤" + raise ValueError(f"Image aspect ratio {ar:.6g} is outside allowed range: {a1}:{b1} {op} ratio {op} {a2}:{b2}") + return ar + + +def validate_aspect_ratio_closeness( + start_img, + end_img, + min_rel: float, + max_rel: float, + *, + strict: bool = False, # True => exclusive, False => inclusive +) -> None: + w1, h1 = get_image_dimensions(start_img) + w2, h2 = get_image_dimensions(end_img) + if min(w1, h1, w2, h2) <= 0: + raise ValueError("Invalid image dimensions") + ar1 = w1 / h1 + ar2 = w2 / h2 + # Normalize so it is symmetric (no need to check both ar1/ar2 and ar2/ar1) + closeness = max(ar1, ar2) / min(ar1, ar2) + limit = max(max_rel, 1.0 / min_rel) # for 0.8..1.25 this is 1.25 + if (closeness >= limit) if strict else (closeness > limit): + raise ValueError(f"Aspect ratios must be close: start/end={ar1/ar2:.4f}, allowed range {min_rel}–{max_rel}.") + + def validate_video_dimensions( video: VideoInput, min_width: Optional[int] = None, @@ -98,3 +145,9 @@ def validate_video_duration( raise ValueError( f"Video duration must be at most {max_duration}s, got {duration}s" ) + + +def get_number_of_images(images): + if isinstance(images, torch.Tensor): + return images.shape[0] if images.ndim >= 4 else 1 + return len(images) diff --git a/nodes.py b/nodes.py index b3fa9c51a..35dda1b19 100644 --- a/nodes.py +++ b/nodes.py @@ -2351,6 +2351,7 @@ async def init_builtin_api_nodes(): "nodes_moonvalley.py", "nodes_rodin.py", "nodes_gemini.py", + "nodes_vidu.py", ] if not await load_custom_node(os.path.join(api_nodes_dir, "canary.py"), module_parent="comfy_api_nodes"): From bddd69618bf4463209c3681babfcbebd9b9aed85 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 19 Aug 2025 13:49:01 -0700 Subject: [PATCH 159/243] Change the TextEncodeQwenImageEdit node to use logic closer to reference. 
(#9432) --- comfy_extras/nodes_qwen.py | 37 +++++++++++-------------------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/comfy_extras/nodes_qwen.py b/comfy_extras/nodes_qwen.py index b5088fae2..fff89556f 100644 --- a/comfy_extras/nodes_qwen.py +++ b/comfy_extras/nodes_qwen.py @@ -1,25 +1,6 @@ import node_helpers import comfy.utils - -PREFERRED_QWENIMAGE_RESOLUTIONS = [ - (672, 1568), - (688, 1504), - (720, 1456), - (752, 1392), - (800, 1328), - (832, 1248), - (880, 1184), - (944, 1104), - (1024, 1024), - (1104, 944), - (1184, 880), - (1248, 832), - (1328, 800), - (1392, 752), - (1456, 720), - (1504, 688), - (1568, 672), -] +import math class TextEncodeQwenImageEdit: @@ -42,13 +23,17 @@ class TextEncodeQwenImageEdit: if image is None: images = [] else: - images = [image] + samples = image.movedim(-1, 1) + total = int(1024 * 1024) + + scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2])) + width = round(samples.shape[3] * scale_by) + height = round(samples.shape[2] * scale_by) + + s = comfy.utils.common_upscale(samples, width, height, "area", "disabled") + image = s.movedim(1, -1) + images = [image[:, :, :, :3]] if vae is not None: - width = image.shape[2] - height = image.shape[1] - aspect_ratio = width / height - _, width, height = min((abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_QWENIMAGE_RESOLUTIONS) - image = comfy.utils.common_upscale(image.movedim(-1, 1), width, height, "lanczos", "center").movedim(1, -1) ref_latent = vae.encode(image[:, :, :, :3]) tokens = clip.tokenize(prompt, images=images) From dfa791eb4bfcaac3eb9b2b33fa15ae5a25589bb8 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 19 Aug 2025 17:47:42 -0700 Subject: [PATCH 160/243] Rope fix for qwen vl. 
(#9435) --- comfy/text_encoders/llama.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/comfy/text_encoders/llama.py b/comfy/text_encoders/llama.py index 9d90d5a61..4c976058f 100644 --- a/comfy/text_encoders/llama.py +++ b/comfy/text_encoders/llama.py @@ -27,6 +27,7 @@ class Llama2Config: rms_norm_add = False mlp_activation = "silu" qkv_bias = False + rope_dims = None @dataclass class Qwen25_3BConfig: @@ -44,6 +45,7 @@ class Qwen25_3BConfig: rms_norm_add = False mlp_activation = "silu" qkv_bias = True + rope_dims = None @dataclass class Qwen25_7BVLI_Config: @@ -61,6 +63,7 @@ class Qwen25_7BVLI_Config: rms_norm_add = False mlp_activation = "silu" qkv_bias = True + rope_dims = [16, 24, 24] @dataclass class Gemma2_2B_Config: @@ -78,6 +81,7 @@ class Gemma2_2B_Config: rms_norm_add = True mlp_activation = "gelu_pytorch_tanh" qkv_bias = False + rope_dims = None class RMSNorm(nn.Module): def __init__(self, dim: int, eps: float = 1e-5, add=False, device=None, dtype=None): @@ -102,7 +106,7 @@ def rotate_half(x): return torch.cat((-x2, x1), dim=-1) -def precompute_freqs_cis(head_dim, position_ids, theta, device=None): +def precompute_freqs_cis(head_dim, position_ids, theta, rope_dims=None, device=None): theta_numerator = torch.arange(0, head_dim, 2, device=device).float() inv_freq = 1.0 / (theta ** (theta_numerator / head_dim)) @@ -112,12 +116,20 @@ def precompute_freqs_cis(head_dim, position_ids, theta, device=None): emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() sin = emb.sin() + if rope_dims is not None and position_ids.shape[0] > 1: + mrope_section = rope_dims * 2 + cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0) + sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0) + else: + cos = cos.unsqueeze(1) + sin = sin.unsqueeze(1) + return (cos, sin) def apply_rope(xq, xk, freqs_cis): - cos = freqs_cis[0].unsqueeze(1) - sin = freqs_cis[1].unsqueeze(1) + cos = freqs_cis[0] + sin = freqs_cis[1] q_embed = (xq * cos) + (rotate_half(xq) * sin) k_embed = (xk * cos) + (rotate_half(xk) * sin) return q_embed, k_embed @@ -292,6 +304,7 @@ class Llama2_(nn.Module): freqs_cis = precompute_freqs_cis(self.config.head_dim, position_ids, self.config.rope_theta, + self.config.rope_dims, device=x.device) mask = None From 7cd2c4bd6ab20f35a6bb1b1f2252c3ea16da4777 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 19 Aug 2025 21:45:27 -0700 Subject: [PATCH 161/243] Qwen rotary embeddings should now match reference code. 
(#9437) --- comfy/ldm/qwen_image/model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index a3c726299..bf3940313 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -349,8 +349,8 @@ class QwenImageTransformer2DModel(nn.Module): img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype) img_ids[:, :, 0] = img_ids[:, :, 1] + index - img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1) - img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) + img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1) - (h_len // 2) + img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) - (w_len // 2) return hidden_states, repeat(img_ids, "h w c -> b (h w) c", b=bs), orig_shape def forward( @@ -396,7 +396,7 @@ class QwenImageTransformer2DModel(nn.Module): hidden_states = torch.cat([hidden_states, kontext], dim=1) img_ids = torch.cat([img_ids, kontext_ids], dim=1) - txt_start = round(max(((x.shape[-1] + (self.patch_size // 2)) // self.patch_size), ((x.shape[-2] + (self.patch_size // 2)) // self.patch_size))) + txt_start = round(max(((x.shape[-1] + (self.patch_size // 2)) // self.patch_size) // 2, ((x.shape[-2] + (self.patch_size // 2)) // self.patch_size) // 2)) txt_ids = torch.linspace(txt_start, txt_start + context.shape[1], steps=context.shape[1], device=x.device, dtype=x.dtype).reshape(1, -1, 1).repeat(x.shape[0], 1, 3) ids = torch.cat((txt_ids, img_ids), dim=1) image_rotary_emb = self.pe_embedder(ids).squeeze(1).unsqueeze(2).to(x.dtype) From 5a8f502db5889873ffa13132b603b7b6daac605a Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 19 Aug 2025 22:08:11 -0700 Subject: [PATCH 162/243] Disable prompt weights for qwen. 
(#9438) --- comfy/sd1_clip.py | 5 ++++- comfy/text_encoders/qwen_image.py | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py index 1e8adbe69..f8a7c2a1b 100644 --- a/comfy/sd1_clip.py +++ b/comfy/sd1_clip.py @@ -534,7 +534,10 @@ class SDTokenizer: min_padding = tokenizer_options.get("{}_min_padding".format(self.embedding_key), self.min_padding) text = escape_important(text) - parsed_weights = token_weights(text, 1.0) + if kwargs.get("disable_weights", False): + parsed_weights = [(text, 1.0)] + else: + parsed_weights = token_weights(text, 1.0) # tokenize words tokens = [] diff --git a/comfy/text_encoders/qwen_image.py b/comfy/text_encoders/qwen_image.py index f07318d6c..6646b1003 100644 --- a/comfy/text_encoders/qwen_image.py +++ b/comfy/text_encoders/qwen_image.py @@ -15,7 +15,7 @@ class QwenImageTokenizer(sd1_clip.SD1Tokenizer): def __init__(self, embedding_directory=None, tokenizer_data={}): super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="qwen25_7b", tokenizer=Qwen25_7BVLITokenizer) self.llama_template = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" - self.llama_template_images = "<|im_start|>system\nDescribe the key features of the input image \\(color, shape, size, texture, objects, background\\), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n" + self.llama_template_images = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n" def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], **kwargs): if llama_template is None: @@ -25,7 +25,7 @@ class QwenImageTokenizer(sd1_clip.SD1Tokenizer): llama_text = self.llama_template.format(text) else: llama_text = llama_template.format(text) - tokens = super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, **kwargs) + tokens = super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, disable_weights=True, **kwargs) key_name = next(iter(tokens)) embed_count = 0 qwen_tokens = tokens[key_name] From 8d38ea3bbf7e77ed7e7aee401b157dab211c5307 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 19 Aug 2025 23:58:54 -0700 Subject: [PATCH 163/243] Fix bf16 precision issue with qwen image embeddings. 
(#9441) --- comfy/ldm/qwen_image/model.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index bf3940313..49f66b90a 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -347,7 +347,7 @@ class QwenImageTransformer2DModel(nn.Module): h_offset = ((h_offset + (patch_size // 2)) // patch_size) w_offset = ((w_offset + (patch_size // 2)) // patch_size) - img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype) + img_ids = torch.zeros((h_len, w_len, 3), device=x.device) img_ids[:, :, 0] = img_ids[:, :, 1] + index img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1) - (h_len // 2) img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) - (w_len // 2) @@ -397,9 +397,10 @@ class QwenImageTransformer2DModel(nn.Module): img_ids = torch.cat([img_ids, kontext_ids], dim=1) txt_start = round(max(((x.shape[-1] + (self.patch_size // 2)) // self.patch_size) // 2, ((x.shape[-2] + (self.patch_size // 2)) // self.patch_size) // 2)) - txt_ids = torch.linspace(txt_start, txt_start + context.shape[1], steps=context.shape[1], device=x.device, dtype=x.dtype).reshape(1, -1, 1).repeat(x.shape[0], 1, 3) + txt_ids = torch.arange(txt_start, txt_start + context.shape[1], device=x.device).reshape(1, -1, 1).repeat(x.shape[0], 1, 3) ids = torch.cat((txt_ids, img_ids), dim=1) image_rotary_emb = self.pe_embedder(ids).squeeze(1).unsqueeze(2).to(x.dtype) + del ids, txt_ids, img_ids hidden_states = self.img_in(hidden_states) encoder_hidden_states = self.txt_norm(encoder_hidden_states) From 2f52e8f05f2039dd67e9b9783b8397350a548b95 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Wed, 20 Aug 2025 15:15:09 +0800 Subject: [PATCH 164/243] Bump template to 0.1.62 (#9419) * Bump template to 0.1.61 * Bump template to 0.1.62 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c7a5c47ab..8d928d826 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.25.9 -comfyui-workflow-templates==0.1.60 +comfyui-workflow-templates==0.1.62 comfyui-embedded-docs==0.2.6 torch torchsde From 7139d6d93fc7b5481a69b687080bd36f7b531c46 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Wed, 20 Aug 2025 03:15:30 -0400 Subject: [PATCH 165/243] ComfyUI version 0.3.51 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 29ec07ca6..65f06cf37 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. 
-__version__ = "0.3.50" +__version__ = "0.3.51" diff --git a/pyproject.toml b/pyproject.toml index 659b5730a..ecbf04303 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.50" +version = "0.3.51" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From fe01885acf892de636b1b2743903812099bd42e3 Mon Sep 17 00:00:00 2001 From: Harel Cain Date: Wed, 20 Aug 2025 09:33:10 +0200 Subject: [PATCH 166/243] LTXV: fix key frame noise mask dimensions for when real noise mask exists (#9425) --- comfy_extras/nodes_lt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_extras/nodes_lt.py b/comfy_extras/nodes_lt.py index b5058667a..f82337a67 100644 --- a/comfy_extras/nodes_lt.py +++ b/comfy_extras/nodes_lt.py @@ -166,7 +166,7 @@ class LTXVAddGuide: negative = self.add_keyframe_index(negative, frame_idx, guiding_latent, scale_factors) mask = torch.full( - (noise_mask.shape[0], 1, guiding_latent.shape[2], 1, 1), + (noise_mask.shape[0], 1, guiding_latent.shape[2], noise_mask.shape[3], noise_mask.shape[4]), 1.0 - strength, dtype=noise_mask.dtype, device=noise_mask.device, From e73a9dbe30434280c69d852ea78cc4bf88bfd501 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 20 Aug 2025 14:34:13 -0700 Subject: [PATCH 167/243] Add that qwen edit model is supported to readme. (#9463) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index fa99a8cbe..79a8a8c79 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,7 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith - [Omnigen 2](https://comfyanonymous.github.io/ComfyUI_examples/omnigen/) - [Flux Kontext](https://comfyanonymous.github.io/ComfyUI_examples/flux/#flux-kontext-image-editing-model) - [HiDream E1.1](https://comfyanonymous.github.io/ComfyUI_examples/hidream/#hidream-e11) + - [Qwen Image Edit](https://comfyanonymous.github.io/ComfyUI_examples/qwen_image/#edit-model) - Video Models - [Stable Video Diffusion](https://comfyanonymous.github.io/ComfyUI_examples/video/) - [Mochi](https://comfyanonymous.github.io/ComfyUI_examples/mochi/) From 0963493a9c3b6565f8537288a0fb90991391ec41 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 20 Aug 2025 19:26:37 -0700 Subject: [PATCH 168/243] Support for Qwen Diffsynth Controlnets canny and depth. (#9465) These are not real controlnets but actually a patch on the model so they will be treated as such. Put them in the models/model_patches/ folder. Use the new ModelPatchLoader and QwenImageDiffsynthControlnet nodes. 
--- comfy/ldm/qwen_image/model.py | 7 + comfy/model_management.py | 8 +- comfy/model_patcher.py | 27 ++++ comfy_api/latest/_io.py | 4 + comfy_extras/nodes_model_patch.py | 138 ++++++++++++++++++++ models/model_patches/put_model_patches_here | 0 nodes.py | 1 + 7 files changed, 184 insertions(+), 1 deletion(-) create mode 100644 comfy_extras/nodes_model_patch.py create mode 100644 models/model_patches/put_model_patches_here diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index 49f66b90a..2503583cb 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -416,6 +416,7 @@ class QwenImageTransformer2DModel(nn.Module): ) patches_replace = transformer_options.get("patches_replace", {}) + patches = transformer_options.get("patches", {}) blocks_replace = patches_replace.get("dit", {}) for i, block in enumerate(self.transformer_blocks): @@ -436,6 +437,12 @@ class QwenImageTransformer2DModel(nn.Module): image_rotary_emb=image_rotary_emb, ) + if "double_block" in patches: + for p in patches["double_block"]: + out = p({"img": hidden_states, "txt": encoder_hidden_states, "x": x, "block_index": i}) + hidden_states = out["img"] + encoder_hidden_states = out["txt"] + hidden_states = self.norm_out(hidden_states, temb) hidden_states = self.proj_out(hidden_states) diff --git a/comfy/model_management.py b/comfy/model_management.py index 2a9f18068..d08aee1fe 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -593,7 +593,13 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu else: minimum_memory_required = max(inference_memory, minimum_memory_required + extra_reserved_memory()) - models = set(models) + models_temp = set() + for m in models: + models_temp.add(m) + for mm in m.model_patches_models(): + models_temp.add(mm) + + models = models_temp models_to_load = [] diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 52e76b5f3..a944cb421 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -430,6 +430,9 @@ class ModelPatcher: def set_model_forward_timestep_embed_patch(self, patch): self.set_model_patch(patch, "forward_timestep_embed_patch") + def set_model_double_block_patch(self, patch): + self.set_model_patch(patch, "double_block") + def add_object_patch(self, name, obj): self.object_patches[name] = obj @@ -486,6 +489,30 @@ class ModelPatcher: if hasattr(wrap_func, "to"): self.model_options["model_function_wrapper"] = wrap_func.to(device) + def model_patches_models(self): + to = self.model_options["transformer_options"] + models = [] + if "patches" in to: + patches = to["patches"] + for name in patches: + patch_list = patches[name] + for i in range(len(patch_list)): + if hasattr(patch_list[i], "models"): + models += patch_list[i].models() + if "patches_replace" in to: + patches = to["patches_replace"] + for name in patches: + patch_list = patches[name] + for k in patch_list: + if hasattr(patch_list[k], "models"): + models += patch_list[k].models() + if "model_function_wrapper" in self.model_options: + wrap_func = self.model_options["model_function_wrapper"] + if hasattr(wrap_func, "models"): + models += wrap_func.models() + + return models + def model_dtype(self): if hasattr(self.model, "get_dtype"): return self.model.get_dtype() diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py index ec1efb51d..a3a21facc 100644 --- a/comfy_api/latest/_io.py +++ b/comfy_api/latest/_io.py @@ -726,6 +726,10 @@ class SEGS(ComfyTypeIO): class AnyType(ComfyTypeIO): Type = Any 
+@comfytype(io_type="MODEL_PATCH") +class MODEL_PATCH(ComfyTypeIO): + Type = Any + @comfytype(io_type="COMFY_MULTITYPED_V3") class MultiType: Type = Any diff --git a/comfy_extras/nodes_model_patch.py b/comfy_extras/nodes_model_patch.py new file mode 100644 index 000000000..bb239bc45 --- /dev/null +++ b/comfy_extras/nodes_model_patch.py @@ -0,0 +1,138 @@ +import torch +import folder_paths +import comfy.utils +import comfy.ops +import comfy.model_management +import comfy.ldm.common_dit +import comfy.latent_formats + + +class BlockWiseControlBlock(torch.nn.Module): + # [linear, gelu, linear] + def __init__(self, dim: int = 3072, device=None, dtype=None, operations=None): + super().__init__() + self.x_rms = operations.RMSNorm(dim, eps=1e-6) + self.y_rms = operations.RMSNorm(dim, eps=1e-6) + self.input_proj = operations.Linear(dim, dim) + self.act = torch.nn.GELU() + self.output_proj = operations.Linear(dim, dim) + + def forward(self, x, y): + x, y = self.x_rms(x), self.y_rms(y) + x = self.input_proj(x + y) + x = self.act(x) + x = self.output_proj(x) + return x + + +class QwenImageBlockWiseControlNet(torch.nn.Module): + def __init__( + self, + num_layers: int = 60, + in_dim: int = 64, + additional_in_dim: int = 0, + dim: int = 3072, + device=None, dtype=None, operations=None + ): + super().__init__() + self.img_in = operations.Linear(in_dim + additional_in_dim, dim, device=device, dtype=dtype) + self.controlnet_blocks = torch.nn.ModuleList( + [ + BlockWiseControlBlock(dim, device=device, dtype=dtype, operations=operations) + for _ in range(num_layers) + ] + ) + + def process_input_latent_image(self, latent_image): + latent_image = comfy.latent_formats.Wan21().process_in(latent_image) + patch_size = 2 + hidden_states = comfy.ldm.common_dit.pad_to_patch_size(latent_image, (1, patch_size, patch_size)) + orig_shape = hidden_states.shape + hidden_states = hidden_states.view(orig_shape[0], orig_shape[1], orig_shape[-2] // 2, 2, orig_shape[-1] // 2, 2) + hidden_states = hidden_states.permute(0, 2, 4, 1, 3, 5) + hidden_states = hidden_states.reshape(orig_shape[0], (orig_shape[-2] // 2) * (orig_shape[-1] // 2), orig_shape[1] * 4) + return self.img_in(hidden_states) + + def control_block(self, img, controlnet_conditioning, block_id): + return self.controlnet_blocks[block_id](img, controlnet_conditioning) + + +class ModelPatchLoader: + @classmethod + def INPUT_TYPES(s): + return {"required": { "name": (folder_paths.get_filename_list("model_patches"), ), + }} + RETURN_TYPES = ("MODEL_PATCH",) + FUNCTION = "load_model_patch" + EXPERIMENTAL = True + + CATEGORY = "advanced/loaders" + + def load_model_patch(self, name): + model_patch_path = folder_paths.get_full_path_or_raise("model_patches", name) + sd = comfy.utils.load_torch_file(model_patch_path, safe_load=True) + dtype = comfy.utils.weight_dtype(sd) + # TODO: this node will work with more types of model patches + model = QwenImageBlockWiseControlNet(device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast) + model.load_state_dict(sd) + model = comfy.model_patcher.ModelPatcher(model, load_device=comfy.model_management.get_torch_device(), offload_device=comfy.model_management.unet_offload_device()) + return (model,) + + +class DiffSynthCnetPatch: + def __init__(self, model_patch, vae, image, strength): + self.encoded_image = model_patch.model.process_input_latent_image(vae.encode(image)) + self.model_patch = model_patch + self.vae = vae + self.image = image + self.strength = strength + + def __call__(self, kwargs): 
+ x = kwargs.get("x") + img = kwargs.get("img") + block_index = kwargs.get("block_index") + if self.encoded_image is None or self.encoded_image.shape[1:] != img.shape[1:]: + spacial_compression = self.vae.spacial_compression_encode() + image_scaled = comfy.utils.common_upscale(self.image.movedim(-1, 1), x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression, "area", "center") + loaded_models = comfy.model_management.loaded_models(only_currently_used=True) + self.encoded_image = self.model_patch.model.process_input_latent_image(self.vae.encode(image_scaled.movedim(1, -1))) + comfy.model_management.load_models_gpu(loaded_models) + + img = img + (self.model_patch.model.control_block(img, self.encoded_image.to(img.dtype), block_index) * self.strength) + kwargs['img'] = img + return kwargs + + def to(self, device_or_dtype): + if isinstance(device_or_dtype, torch.device): + self.encoded_image = self.encoded_image.to(device_or_dtype) + return self + + def models(self): + return [self.model_patch] + +class QwenImageDiffsynthControlnet: + @classmethod + def INPUT_TYPES(s): + return {"required": { "model": ("MODEL",), + "model_patch": ("MODEL_PATCH",), + "vae": ("VAE",), + "image": ("IMAGE",), + "strength": ("FLOAT", {"default": 1.0, "min": -10.0, "max": 10.0, "step": 0.01}), + }} + RETURN_TYPES = ("MODEL",) + FUNCTION = "diffsynth_controlnet" + EXPERIMENTAL = True + + CATEGORY = "advanced/loaders/qwen" + + def diffsynth_controlnet(self, model, model_patch, vae, image, strength): + model_patched = model.clone() + image = image[:, :, :, :3] + model_patched.set_model_double_block_patch(DiffSynthCnetPatch(model_patch, vae, image, strength)) + return (model_patched,) + + +NODE_CLASS_MAPPINGS = { + "ModelPatchLoader": ModelPatchLoader, + "QwenImageDiffsynthControlnet": QwenImageDiffsynthControlnet, +} diff --git a/models/model_patches/put_model_patches_here b/models/model_patches/put_model_patches_here new file mode 100644 index 000000000..e69de29bb diff --git a/nodes.py b/nodes.py index 35dda1b19..9681750d3 100644 --- a/nodes.py +++ b/nodes.py @@ -2322,6 +2322,7 @@ async def init_builtin_extra_nodes(): "nodes_tcfg.py", "nodes_context_windows.py", "nodes_qwen.py", + "nodes_model_patch.py" ] import_failed = [] From 0737b7e0d245de20192064da4888debbef3241c2 Mon Sep 17 00:00:00 2001 From: saurabh-pingale Date: Thu, 21 Aug 2025 07:57:57 +0530 Subject: [PATCH 169/243] fix(userdata): catch invalid workflow filenames (#9434) (#9445) --- app/user_manager.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/app/user_manager.py b/app/user_manager.py index 0ec3e46ea..a2d376c0c 100644 --- a/app/user_manager.py +++ b/app/user_manager.py @@ -363,10 +363,17 @@ class UserManager(): if not overwrite and os.path.exists(path): return web.Response(status=409, text="File already exists") - body = await request.read() + try: + body = await request.read() - with open(path, "wb") as f: - f.write(body) + with open(path, "wb") as f: + f.write(body) + except OSError as e: + logging.warning(f"Error saving file '{path}': {e}") + return web.Response( + status=400, + reason="Invalid filename. Please avoid special characters like :\\/*?\"<>|" + ) user_path = self.get_request_user_filepath(request, None) if full_info: From 9fa1036f60b5264302072453be524aa55928bbaf Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 20 Aug 2025 20:09:35 -0700 Subject: [PATCH 170/243] Forgot this. 
(#9470) --- folder_paths.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/folder_paths.py b/folder_paths.py index 9ec952940..b34af39e8 100644 --- a/folder_paths.py +++ b/folder_paths.py @@ -46,6 +46,8 @@ folder_names_and_paths["photomaker"] = ([os.path.join(models_dir, "photomaker")] folder_names_and_paths["classifiers"] = ([os.path.join(models_dir, "classifiers")], {""}) +folder_names_and_paths["model_patches"] = ([os.path.join(models_dir, "model_patches")], supported_pt_extensions) + output_directory = os.path.join(base_path, "output") temp_directory = os.path.join(base_path, "temp") input_directory = os.path.join(base_path, "input") From 1b2de2642d38099acdde7c460d133d93e91074f0 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 20 Aug 2025 21:33:49 -0700 Subject: [PATCH 171/243] Support diffsynth inpaint controlnet (model patch). (#9471) --- comfy_extras/nodes_model_patch.py | 39 ++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/comfy_extras/nodes_model_patch.py b/comfy_extras/nodes_model_patch.py index bb239bc45..3eaada9bc 100644 --- a/comfy_extras/nodes_model_patch.py +++ b/comfy_extras/nodes_model_patch.py @@ -35,6 +35,7 @@ class QwenImageBlockWiseControlNet(torch.nn.Module): device=None, dtype=None, operations=None ): super().__init__() + self.additional_in_dim = additional_in_dim self.img_in = operations.Linear(in_dim + additional_in_dim, dim, device=device, dtype=dtype) self.controlnet_blocks = torch.nn.ModuleList( [ @@ -44,7 +45,7 @@ class QwenImageBlockWiseControlNet(torch.nn.Module): ) def process_input_latent_image(self, latent_image): - latent_image = comfy.latent_formats.Wan21().process_in(latent_image) + latent_image[:, :16] = comfy.latent_formats.Wan21().process_in(latent_image[:, :16]) patch_size = 2 hidden_states = comfy.ldm.common_dit.pad_to_patch_size(latent_image, (1, patch_size, patch_size)) orig_shape = hidden_states.shape @@ -73,19 +74,33 @@ class ModelPatchLoader: sd = comfy.utils.load_torch_file(model_patch_path, safe_load=True) dtype = comfy.utils.weight_dtype(sd) # TODO: this node will work with more types of model patches - model = QwenImageBlockWiseControlNet(device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast) + additional_in_dim = sd["img_in.weight"].shape[1] - 64 + model = QwenImageBlockWiseControlNet(additional_in_dim=additional_in_dim, device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast) model.load_state_dict(sd) model = comfy.model_patcher.ModelPatcher(model, load_device=comfy.model_management.get_torch_device(), offload_device=comfy.model_management.unet_offload_device()) return (model,) class DiffSynthCnetPatch: - def __init__(self, model_patch, vae, image, strength): - self.encoded_image = model_patch.model.process_input_latent_image(vae.encode(image)) + def __init__(self, model_patch, vae, image, strength, mask=None): self.model_patch = model_patch self.vae = vae self.image = image self.strength = strength + self.mask = mask + self.encoded_image = model_patch.model.process_input_latent_image(self.encode_latent_cond(image)) + + def encode_latent_cond(self, image): + latent_image = self.vae.encode(image) + if self.model_patch.model.additional_in_dim > 0: + if self.mask is None: + mask_ = torch.ones_like(latent_image)[:, :self.model_patch.model.additional_in_dim // 4] + else: + mask_ = comfy.utils.common_upscale(self.mask.mean(dim=1, keepdim=True), 
latent_image.shape[-1], latent_image.shape[-2], "bilinear", "none") + + return torch.cat([latent_image, mask_], dim=1) + else: + return latent_image def __call__(self, kwargs): x = kwargs.get("x") @@ -95,7 +110,7 @@ class DiffSynthCnetPatch: spacial_compression = self.vae.spacial_compression_encode() image_scaled = comfy.utils.common_upscale(self.image.movedim(-1, 1), x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression, "area", "center") loaded_models = comfy.model_management.loaded_models(only_currently_used=True) - self.encoded_image = self.model_patch.model.process_input_latent_image(self.vae.encode(image_scaled.movedim(1, -1))) + self.encoded_image = self.model_patch.model.process_input_latent_image(self.encode_latent_cond(image_scaled.movedim(1, -1))) comfy.model_management.load_models_gpu(loaded_models) img = img + (self.model_patch.model.control_block(img, self.encoded_image.to(img.dtype), block_index) * self.strength) @@ -118,17 +133,25 @@ class QwenImageDiffsynthControlnet: "vae": ("VAE",), "image": ("IMAGE",), "strength": ("FLOAT", {"default": 1.0, "min": -10.0, "max": 10.0, "step": 0.01}), - }} + }, + "optional": {"mask": ("MASK",)}} RETURN_TYPES = ("MODEL",) FUNCTION = "diffsynth_controlnet" EXPERIMENTAL = True CATEGORY = "advanced/loaders/qwen" - def diffsynth_controlnet(self, model, model_patch, vae, image, strength): + def diffsynth_controlnet(self, model, model_patch, vae, image, strength, mask=None): model_patched = model.clone() image = image[:, :, :, :3] - model_patched.set_model_double_block_patch(DiffSynthCnetPatch(model_patch, vae, image, strength)) + if mask is not None: + if mask.ndim == 3: + mask = mask.unsqueeze(1) + if mask.ndim == 4: + mask = mask.unsqueeze(2) + mask = 1.0 - mask + + model_patched.set_model_double_block_patch(DiffSynthCnetPatch(model_patch, vae, image, strength, mask)) return (model_patched,) From bc49106837b627eb657fc86f2e475770ac5ce68a Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 22 Aug 2025 05:03:57 +0300 Subject: [PATCH 172/243] convert String nodes to V3 schema (#9370) --- comfy_extras/nodes_string.py | 449 ++++++++++++++++++----------------- 1 file changed, 237 insertions(+), 212 deletions(-) diff --git a/comfy_extras/nodes_string.py b/comfy_extras/nodes_string.py index b1a8ceef0..571d89f62 100644 --- a/comfy_extras/nodes_string.py +++ b/comfy_extras/nodes_string.py @@ -1,77 +1,91 @@ import re +from typing_extensions import override -from comfy.comfy_types.node_typing import IO +from comfy_api.latest import ComfyExtension, io -class StringConcatenate(): + +class StringConcatenate(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "string_a": (IO.STRING, {"multiline": True}), - "string_b": (IO.STRING, {"multiline": True}), - "delimiter": (IO.STRING, {"multiline": False, "default": ""}) - } - } + def define_schema(cls): + return io.Schema( + node_id="StringConcatenate", + display_name="Concatenate", + category="utils/string", + inputs=[ + io.String.Input("string_a", multiline=True), + io.String.Input("string_b", multiline=True), + io.String.Input("delimiter", multiline=False, default=""), + ], + outputs=[ + io.String.Output(), + ] + ) - RETURN_TYPES = (IO.STRING,) - FUNCTION = "execute" - CATEGORY = "utils/string" - - def execute(self, string_a, string_b, delimiter, **kwargs): - return delimiter.join((string_a, string_b)), - -class StringSubstring(): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "string": (IO.STRING, 
{"multiline": True}), - "start": (IO.INT, {}), - "end": (IO.INT, {}), - } - } + def execute(cls, string_a, string_b, delimiter): + return io.NodeOutput(delimiter.join((string_a, string_b))) - RETURN_TYPES = (IO.STRING,) - FUNCTION = "execute" - CATEGORY = "utils/string" - def execute(self, string, start, end, **kwargs): - return string[start:end], - -class StringLength(): +class StringSubstring(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "string": (IO.STRING, {"multiline": True}) - } - } + def define_schema(cls): + return io.Schema( + node_id="StringSubstring", + display_name="Substring", + category="utils/string", + inputs=[ + io.String.Input("string", multiline=True), + io.Int.Input("start"), + io.Int.Input("end"), + ], + outputs=[ + io.String.Output(), + ] + ) - RETURN_TYPES = (IO.INT,) - RETURN_NAMES = ("length",) - FUNCTION = "execute" - CATEGORY = "utils/string" - - def execute(self, string, **kwargs): - length = len(string) - - return length, - -class CaseConverter(): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "string": (IO.STRING, {"multiline": True}), - "mode": (IO.COMBO, {"options": ["UPPERCASE", "lowercase", "Capitalize", "Title Case"]}) - } - } + def execute(cls, string, start, end): + return io.NodeOutput(string[start:end]) - RETURN_TYPES = (IO.STRING,) - FUNCTION = "execute" - CATEGORY = "utils/string" - def execute(self, string, mode, **kwargs): +class StringLength(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="StringLength", + display_name="Length", + category="utils/string", + inputs=[ + io.String.Input("string", multiline=True), + ], + outputs=[ + io.Int.Output(display_name="length"), + ] + ) + + @classmethod + def execute(cls, string): + return io.NodeOutput(len(string)) + + +class CaseConverter(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="CaseConverter", + display_name="Case Converter", + category="utils/string", + inputs=[ + io.String.Input("string", multiline=True), + io.Combo.Input("mode", options=["UPPERCASE", "lowercase", "Capitalize", "Title Case"]), + ], + outputs=[ + io.String.Output(), + ] + ) + + @classmethod + def execute(cls, string, mode): if mode == "UPPERCASE": result = string.upper() elif mode == "lowercase": @@ -83,24 +97,27 @@ class CaseConverter(): else: result = string - return result, + return io.NodeOutput(result) -class StringTrim(): +class StringTrim(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "string": (IO.STRING, {"multiline": True}), - "mode": (IO.COMBO, {"options": ["Both", "Left", "Right"]}) - } - } + def define_schema(cls): + return io.Schema( + node_id="StringTrim", + display_name="Trim", + category="utils/string", + inputs=[ + io.String.Input("string", multiline=True), + io.Combo.Input("mode", options=["Both", "Left", "Right"]), + ], + outputs=[ + io.String.Output(), + ] + ) - RETURN_TYPES = (IO.STRING,) - FUNCTION = "execute" - CATEGORY = "utils/string" - - def execute(self, string, mode, **kwargs): + @classmethod + def execute(cls, string, mode): if mode == "Both": result = string.strip() elif mode == "Left": @@ -110,70 +127,78 @@ class StringTrim(): else: result = string - return result, + return io.NodeOutput(result) -class StringReplace(): + +class StringReplace(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "string": (IO.STRING, {"multiline": True}), - "find": (IO.STRING, {"multiline": True}), - "replace": (IO.STRING, {"multiline": 
True}) - } - } + def define_schema(cls): + return io.Schema( + node_id="StringReplace", + display_name="Replace", + category="utils/string", + inputs=[ + io.String.Input("string", multiline=True), + io.String.Input("find", multiline=True), + io.String.Input("replace", multiline=True), + ], + outputs=[ + io.String.Output(), + ] + ) - RETURN_TYPES = (IO.STRING,) - FUNCTION = "execute" - CATEGORY = "utils/string" - - def execute(self, string, find, replace, **kwargs): - result = string.replace(find, replace) - return result, - - -class StringContains(): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "string": (IO.STRING, {"multiline": True}), - "substring": (IO.STRING, {"multiline": True}), - "case_sensitive": (IO.BOOLEAN, {"default": True}) - } - } + def execute(cls, string, find, replace): + return io.NodeOutput(string.replace(find, replace)) - RETURN_TYPES = (IO.BOOLEAN,) - RETURN_NAMES = ("contains",) - FUNCTION = "execute" - CATEGORY = "utils/string" - def execute(self, string, substring, case_sensitive, **kwargs): +class StringContains(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="StringContains", + display_name="Contains", + category="utils/string", + inputs=[ + io.String.Input("string", multiline=True), + io.String.Input("substring", multiline=True), + io.Boolean.Input("case_sensitive", default=True), + ], + outputs=[ + io.Boolean.Output(display_name="contains"), + ] + ) + + @classmethod + def execute(cls, string, substring, case_sensitive): if case_sensitive: contains = substring in string else: contains = substring.lower() in string.lower() - return contains, + return io.NodeOutput(contains) -class StringCompare(): +class StringCompare(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "string_a": (IO.STRING, {"multiline": True}), - "string_b": (IO.STRING, {"multiline": True}), - "mode": (IO.COMBO, {"options": ["Starts With", "Ends With", "Equal"]}), - "case_sensitive": (IO.BOOLEAN, {"default": True}) - } - } + def define_schema(cls): + return io.Schema( + node_id="StringCompare", + display_name="Compare", + category="utils/string", + inputs=[ + io.String.Input("string_a", multiline=True), + io.String.Input("string_b", multiline=True), + io.Combo.Input("mode", options=["Starts With", "Ends With", "Equal"]), + io.Boolean.Input("case_sensitive", default=True), + ], + outputs=[ + io.Boolean.Output(), + ] + ) - RETURN_TYPES = (IO.BOOLEAN,) - FUNCTION = "execute" - CATEGORY = "utils/string" - - def execute(self, string_a, string_b, mode, case_sensitive, **kwargs): + @classmethod + def execute(cls, string_a, string_b, mode, case_sensitive): if case_sensitive: a = string_a b = string_b @@ -182,31 +207,34 @@ class StringCompare(): b = string_b.lower() if mode == "Equal": - return a == b, + return io.NodeOutput(a == b) elif mode == "Starts With": - return a.startswith(b), + return io.NodeOutput(a.startswith(b)) elif mode == "Ends With": - return a.endswith(b), + return io.NodeOutput(a.endswith(b)) -class RegexMatch(): + +class RegexMatch(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "string": (IO.STRING, {"multiline": True}), - "regex_pattern": (IO.STRING, {"multiline": True}), - "case_insensitive": (IO.BOOLEAN, {"default": True}), - "multiline": (IO.BOOLEAN, {"default": False}), - "dotall": (IO.BOOLEAN, {"default": False}) - } - } + def define_schema(cls): + return io.Schema( + node_id="RegexMatch", + display_name="Regex Match", + category="utils/string", + inputs=[ + 
io.String.Input("string", multiline=True), + io.String.Input("regex_pattern", multiline=True), + io.Boolean.Input("case_insensitive", default=True), + io.Boolean.Input("multiline", default=False), + io.Boolean.Input("dotall", default=False), + ], + outputs=[ + io.Boolean.Output(display_name="matches"), + ] + ) - RETURN_TYPES = (IO.BOOLEAN,) - RETURN_NAMES = ("matches",) - FUNCTION = "execute" - CATEGORY = "utils/string" - - def execute(self, string, regex_pattern, case_insensitive, multiline, dotall, **kwargs): + @classmethod + def execute(cls, string, regex_pattern, case_insensitive, multiline, dotall): flags = 0 if case_insensitive: @@ -223,29 +251,32 @@ class RegexMatch(): except re.error: result = False - return result, + return io.NodeOutput(result) -class RegexExtract(): +class RegexExtract(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "string": (IO.STRING, {"multiline": True}), - "regex_pattern": (IO.STRING, {"multiline": True}), - "mode": (IO.COMBO, {"options": ["First Match", "All Matches", "First Group", "All Groups"]}), - "case_insensitive": (IO.BOOLEAN, {"default": True}), - "multiline": (IO.BOOLEAN, {"default": False}), - "dotall": (IO.BOOLEAN, {"default": False}), - "group_index": (IO.INT, {"default": 1, "min": 0, "max": 100}) - } - } + def define_schema(cls): + return io.Schema( + node_id="RegexExtract", + display_name="Regex Extract", + category="utils/string", + inputs=[ + io.String.Input("string", multiline=True), + io.String.Input("regex_pattern", multiline=True), + io.Combo.Input("mode", options=["First Match", "All Matches", "First Group", "All Groups"]), + io.Boolean.Input("case_insensitive", default=True), + io.Boolean.Input("multiline", default=False), + io.Boolean.Input("dotall", default=False), + io.Int.Input("group_index", default=1, min=0, max=100), + ], + outputs=[ + io.String.Output(), + ] + ) - RETURN_TYPES = (IO.STRING,) - FUNCTION = "execute" - CATEGORY = "utils/string" - - def execute(self, string, regex_pattern, mode, case_insensitive, multiline, dotall, group_index, **kwargs): + @classmethod + def execute(cls, string, regex_pattern, mode, case_insensitive, multiline, dotall, group_index): join_delimiter = "\n" flags = 0 @@ -294,32 +325,33 @@ class RegexExtract(): except re.error: result = "" - return result, + return io.NodeOutput(result) -class RegexReplace(): - DESCRIPTION = "Find and replace text using regex patterns." +class RegexReplace(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "string": (IO.STRING, {"multiline": True}), - "regex_pattern": (IO.STRING, {"multiline": True}), - "replace": (IO.STRING, {"multiline": True}), - }, - "optional": { - "case_insensitive": (IO.BOOLEAN, {"default": True}), - "multiline": (IO.BOOLEAN, {"default": False}), - "dotall": (IO.BOOLEAN, {"default": False, "tooltip": "When enabled, the dot (.) character will match any character including newline characters. When disabled, dots won't match newlines."}), - "count": (IO.INT, {"default": 0, "min": 0, "max": 100, "tooltip": "Maximum number of replacements to make. Set to 0 to replace all occurrences (default). 
Set to 1 to replace only the first match, 2 for the first two matches, etc."}), - } - } + def define_schema(cls): + return io.Schema( + node_id="RegexReplace", + display_name="Regex Replace", + category="utils/string", + description="Find and replace text using regex patterns.", + inputs=[ + io.String.Input("string", multiline=True), + io.String.Input("regex_pattern", multiline=True), + io.String.Input("replace", multiline=True), + io.Boolean.Input("case_insensitive", default=True, optional=True), + io.Boolean.Input("multiline", default=False, optional=True), + io.Boolean.Input("dotall", default=False, optional=True, tooltip="When enabled, the dot (.) character will match any character including newline characters. When disabled, dots won't match newlines."), + io.Int.Input("count", default=0, min=0, max=100, optional=True, tooltip="Maximum number of replacements to make. Set to 0 to replace all occurrences (default). Set to 1 to replace only the first match, 2 for the first two matches, etc."), + ], + outputs=[ + io.String.Output(), + ] + ) - RETURN_TYPES = (IO.STRING,) - FUNCTION = "execute" - CATEGORY = "utils/string" - - def execute(self, string, regex_pattern, replace, case_insensitive=True, multiline=False, dotall=False, count=0, **kwargs): + @classmethod + def execute(cls, string, regex_pattern, replace, case_insensitive=True, multiline=False, dotall=False, count=0): flags = 0 if case_insensitive: @@ -329,32 +361,25 @@ class RegexReplace(): if dotall: flags |= re.DOTALL result = re.sub(regex_pattern, replace, string, count=count, flags=flags) - return result, + return io.NodeOutput(result) -NODE_CLASS_MAPPINGS = { - "StringConcatenate": StringConcatenate, - "StringSubstring": StringSubstring, - "StringLength": StringLength, - "CaseConverter": CaseConverter, - "StringTrim": StringTrim, - "StringReplace": StringReplace, - "StringContains": StringContains, - "StringCompare": StringCompare, - "RegexMatch": RegexMatch, - "RegexExtract": RegexExtract, - "RegexReplace": RegexReplace, -} -NODE_DISPLAY_NAME_MAPPINGS = { - "StringConcatenate": "Concatenate", - "StringSubstring": "Substring", - "StringLength": "Length", - "CaseConverter": "Case Converter", - "StringTrim": "Trim", - "StringReplace": "Replace", - "StringContains": "Contains", - "StringCompare": "Compare", - "RegexMatch": "Regex Match", - "RegexExtract": "Regex Extract", - "RegexReplace": "Regex Replace", -} +class StringExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + StringConcatenate, + StringSubstring, + StringLength, + CaseConverter, + StringTrim, + StringReplace, + StringContains, + StringCompare, + RegexMatch, + RegexExtract, + RegexReplace, + ] + +async def comfy_entrypoint() -> StringExtension: + return StringExtension() From bab08f40d10c8737c3424e35bbff873bcb2333bd Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 22 Aug 2025 05:05:36 +0300 Subject: [PATCH 173/243] v3 nodes (part a) (#9149) --- comfy_extras/nodes_ace.py | 80 +++++++----- comfy_extras/nodes_advanced_samplers.py | 88 +++++++------ comfy_extras/nodes_apg.py | 72 +++++++---- comfy_extras/nodes_attention_multiply.py | 154 ++++++++++++++--------- 4 files changed, 239 insertions(+), 155 deletions(-) diff --git a/comfy_extras/nodes_ace.py b/comfy_extras/nodes_ace.py index cbfec15a2..1409233c9 100644 --- a/comfy_extras/nodes_ace.py +++ b/comfy_extras/nodes_ace.py @@ -1,49 +1,63 @@ import torch +from typing_extensions import override + import 
comfy.model_management import node_helpers +from comfy_api.latest import ComfyExtension, io -class TextEncodeAceStepAudio: + +class TextEncodeAceStepAudio(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "clip": ("CLIP", ), - "tags": ("STRING", {"multiline": True, "dynamicPrompts": True}), - "lyrics": ("STRING", {"multiline": True, "dynamicPrompts": True}), - "lyrics_strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - }} - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "encode" + def define_schema(cls): + return io.Schema( + node_id="TextEncodeAceStepAudio", + category="conditioning", + inputs=[ + io.Clip.Input("clip"), + io.String.Input("tags", multiline=True, dynamic_prompts=True), + io.String.Input("lyrics", multiline=True, dynamic_prompts=True), + io.Float.Input("lyrics_strength", default=1.0, min=0.0, max=10.0, step=0.01), + ], + outputs=[io.Conditioning.Output()], + ) - CATEGORY = "conditioning" - - def encode(self, clip, tags, lyrics, lyrics_strength): + @classmethod + def execute(cls, clip, tags, lyrics, lyrics_strength) -> io.NodeOutput: tokens = clip.tokenize(tags, lyrics=lyrics) conditioning = clip.encode_from_tokens_scheduled(tokens) conditioning = node_helpers.conditioning_set_values(conditioning, {"lyrics_strength": lyrics_strength}) - return (conditioning, ) + return io.NodeOutput(conditioning) -class EmptyAceStepLatentAudio: - def __init__(self): - self.device = comfy.model_management.intermediate_device() +class EmptyAceStepLatentAudio(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="EmptyAceStepLatentAudio", + category="latent/audio", + inputs=[ + io.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.1), + io.Int.Input( + "batch_size", default=1, min=1, max=4096, tooltip="The number of latent images in the batch." 
+ ), + ], + outputs=[io.Latent.Output()], + ) @classmethod - def INPUT_TYPES(s): - return {"required": {"seconds": ("FLOAT", {"default": 120.0, "min": 1.0, "max": 1000.0, "step": 0.1}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096, "tooltip": "The number of latent images in the batch."}), - }} - RETURN_TYPES = ("LATENT",) - FUNCTION = "generate" - - CATEGORY = "latent/audio" - - def generate(self, seconds, batch_size): + def execute(cls, seconds, batch_size) -> io.NodeOutput: length = int(seconds * 44100 / 512 / 8) - latent = torch.zeros([batch_size, 8, 16, length], device=self.device) - return ({"samples": latent, "type": "audio"}, ) + latent = torch.zeros([batch_size, 8, 16, length], device=comfy.model_management.intermediate_device()) + return io.NodeOutput({"samples": latent, "type": "audio"}) -NODE_CLASS_MAPPINGS = { - "TextEncodeAceStepAudio": TextEncodeAceStepAudio, - "EmptyAceStepLatentAudio": EmptyAceStepLatentAudio, -} +class AceExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + TextEncodeAceStepAudio, + EmptyAceStepLatentAudio, + ] + +async def comfy_entrypoint() -> AceExtension: + return AceExtension() diff --git a/comfy_extras/nodes_advanced_samplers.py b/comfy_extras/nodes_advanced_samplers.py index 5fbb096fb..5532ffe6a 100644 --- a/comfy_extras/nodes_advanced_samplers.py +++ b/comfy_extras/nodes_advanced_samplers.py @@ -1,8 +1,13 @@ +import numpy as np +import torch +from tqdm.auto import trange +from typing_extensions import override + +import comfy.model_patcher import comfy.samplers import comfy.utils -import torch -import numpy as np -from tqdm.auto import trange +from comfy.k_diffusion.sampling import to_d +from comfy_api.latest import ComfyExtension, io @torch.no_grad() @@ -33,30 +38,29 @@ def sample_lcm_upscale(model, x, sigmas, extra_args=None, callback=None, disable return x -class SamplerLCMUpscale: - upscale_methods = ["bislerp", "nearest-exact", "bilinear", "area", "bicubic"] +class SamplerLCMUpscale(io.ComfyNode): + UPSCALE_METHODS = ["bislerp", "nearest-exact", "bilinear", "area", "bicubic"] @classmethod - def INPUT_TYPES(s): - return {"required": - {"scale_ratio": ("FLOAT", {"default": 1.0, "min": 0.1, "max": 20.0, "step": 0.01}), - "scale_steps": ("INT", {"default": -1, "min": -1, "max": 1000, "step": 1}), - "upscale_method": (s.upscale_methods,), - } - } - RETURN_TYPES = ("SAMPLER",) - CATEGORY = "sampling/custom_sampling/samplers" + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="SamplerLCMUpscale", + category="sampling/custom_sampling/samplers", + inputs=[ + io.Float.Input("scale_ratio", default=1.0, min=0.1, max=20.0, step=0.01), + io.Int.Input("scale_steps", default=-1, min=-1, max=1000, step=1), + io.Combo.Input("upscale_method", options=cls.UPSCALE_METHODS), + ], + outputs=[io.Sampler.Output()], + ) - FUNCTION = "get_sampler" - - def get_sampler(self, scale_ratio, scale_steps, upscale_method): + @classmethod + def execute(cls, scale_ratio, scale_steps, upscale_method) -> io.NodeOutput: if scale_steps < 0: scale_steps = None sampler = comfy.samplers.KSAMPLER(sample_lcm_upscale, extra_options={"total_upscale": scale_ratio, "upscale_steps": scale_steps, "upscale_method": upscale_method}) - return (sampler, ) + return io.NodeOutput(sampler) -from comfy.k_diffusion.sampling import to_d -import comfy.model_patcher @torch.no_grad() def sample_euler_pp(model, x, sigmas, extra_args=None, callback=None, disable=None): @@ -82,30 +86,36 @@ def sample_euler_pp(model, 
x, sigmas, extra_args=None, callback=None, disable=No return x -class SamplerEulerCFGpp: +class SamplerEulerCFGpp(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"version": (["regular", "alternative"],),} - } - RETURN_TYPES = ("SAMPLER",) - # CATEGORY = "sampling/custom_sampling/samplers" - CATEGORY = "_for_testing" + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="SamplerEulerCFGpp", + display_name="SamplerEulerCFG++", + category="_for_testing", # "sampling/custom_sampling/samplers" + inputs=[ + io.Combo.Input("version", options=["regular", "alternative"]), + ], + outputs=[io.Sampler.Output()], + is_experimental=True, + ) - FUNCTION = "get_sampler" - - def get_sampler(self, version): + @classmethod + def execute(cls, version) -> io.NodeOutput: if version == "alternative": sampler = comfy.samplers.KSAMPLER(sample_euler_pp) else: sampler = comfy.samplers.ksampler("euler_cfg_pp") - return (sampler, ) + return io.NodeOutput(sampler) -NODE_CLASS_MAPPINGS = { - "SamplerLCMUpscale": SamplerLCMUpscale, - "SamplerEulerCFGpp": SamplerEulerCFGpp, -} -NODE_DISPLAY_NAME_MAPPINGS = { - "SamplerEulerCFGpp": "SamplerEulerCFG++", -} +class AdvancedSamplersExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + SamplerLCMUpscale, + SamplerEulerCFGpp, + ] + +async def comfy_entrypoint() -> AdvancedSamplersExtension: + return AdvancedSamplersExtension() diff --git a/comfy_extras/nodes_apg.py b/comfy_extras/nodes_apg.py index 25b21b1b8..f27ae7da8 100644 --- a/comfy_extras/nodes_apg.py +++ b/comfy_extras/nodes_apg.py @@ -1,4 +1,8 @@ import torch +from typing_extensions import override + +from comfy_api.latest import ComfyExtension, io + def project(v0, v1): v1 = torch.nn.functional.normalize(v1, dim=[-1, -2, -3]) @@ -6,22 +10,45 @@ def project(v0, v1): v0_orthogonal = v0 - v0_parallel return v0_parallel, v0_orthogonal -class APG: +class APG(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "model": ("MODEL",), - "eta": ("FLOAT", {"default": 1.0, "min": -10.0, "max": 10.0, "step": 0.01, "tooltip": "Controls the scale of the parallel guidance vector. Default CFG behavior at a setting of 1."}), - "norm_threshold": ("FLOAT", {"default": 5.0, "min": 0.0, "max": 50.0, "step": 0.1, "tooltip": "Normalize guidance vector to this value, normalization disable at a setting of 0."}), - "momentum": ("FLOAT", {"default": 0.0, "min": -5.0, "max": 1.0, "step": 0.01, "tooltip":"Controls a running average of guidance during diffusion, disabled at a setting of 0."}), - } - } - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" - CATEGORY = "sampling/custom_sampling" + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="APG", + display_name="Adaptive Projected Guidance", + category="sampling/custom_sampling", + inputs=[ + io.Model.Input("model"), + io.Float.Input( + "eta", + default=1.0, + min=-10.0, + max=10.0, + step=0.01, + tooltip="Controls the scale of the parallel guidance vector. 
Default CFG behavior at a setting of 1.", + ), + io.Float.Input( + "norm_threshold", + default=5.0, + min=0.0, + max=50.0, + step=0.1, + tooltip="Normalize guidance vector to this value, normalization disable at a setting of 0.", + ), + io.Float.Input( + "momentum", + default=0.0, + min=-5.0, + max=1.0, + step=0.01, + tooltip="Controls a running average of guidance during diffusion, disabled at a setting of 0.", + ), + ], + outputs=[io.Model.Output()], + ) - def patch(self, model, eta, norm_threshold, momentum): + @classmethod + def execute(cls, model, eta, norm_threshold, momentum) -> io.NodeOutput: running_avg = 0 prev_sigma = None @@ -65,12 +92,15 @@ class APG: m = model.clone() m.set_model_sampler_pre_cfg_function(pre_cfg_function) - return (m,) + return io.NodeOutput(m) -NODE_CLASS_MAPPINGS = { - "APG": APG, -} -NODE_DISPLAY_NAME_MAPPINGS = { - "APG": "Adaptive Projected Guidance", -} +class ApgExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + APG, + ] + +async def comfy_entrypoint() -> ApgExtension: + return ApgExtension() diff --git a/comfy_extras/nodes_attention_multiply.py b/comfy_extras/nodes_attention_multiply.py index 4747eb395..c0e494c2a 100644 --- a/comfy_extras/nodes_attention_multiply.py +++ b/comfy_extras/nodes_attention_multiply.py @@ -1,3 +1,7 @@ +from typing_extensions import override + +from comfy_api.latest import ComfyExtension, io + def attention_multiply(attn, model, q, k, v, out): m = model.clone() @@ -16,57 +20,71 @@ def attention_multiply(attn, model, q, k, v, out): return m -class UNetSelfAttentionMultiply: +class UNetSelfAttentionMultiply(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "model": ("MODEL",), - "q": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "k": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "v": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "out": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="UNetSelfAttentionMultiply", + category="_for_testing/attention_experiments", + inputs=[ + io.Model.Input("model"), + io.Float.Input("q", default=1.0, min=0.0, max=10.0, step=0.01), + io.Float.Input("k", default=1.0, min=0.0, max=10.0, step=0.01), + io.Float.Input("v", default=1.0, min=0.0, max=10.0, step=0.01), + io.Float.Input("out", default=1.0, min=0.0, max=10.0, step=0.01), + ], + outputs=[io.Model.Output()], + is_experimental=True, + ) - CATEGORY = "_for_testing/attention_experiments" - - def patch(self, model, q, k, v, out): + @classmethod + def execute(cls, model, q, k, v, out) -> io.NodeOutput: m = attention_multiply("attn1", model, q, k, v, out) - return (m, ) + return io.NodeOutput(m) -class UNetCrossAttentionMultiply: + +class UNetCrossAttentionMultiply(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "model": ("MODEL",), - "q": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "k": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "v": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "out": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="UNetCrossAttentionMultiply", + category="_for_testing/attention_experiments", + 
inputs=[ + io.Model.Input("model"), + io.Float.Input("q", default=1.0, min=0.0, max=10.0, step=0.01), + io.Float.Input("k", default=1.0, min=0.0, max=10.0, step=0.01), + io.Float.Input("v", default=1.0, min=0.0, max=10.0, step=0.01), + io.Float.Input("out", default=1.0, min=0.0, max=10.0, step=0.01), + ], + outputs=[io.Model.Output()], + is_experimental=True, + ) - CATEGORY = "_for_testing/attention_experiments" - - def patch(self, model, q, k, v, out): + @classmethod + def execute(cls, model, q, k, v, out) -> io.NodeOutput: m = attention_multiply("attn2", model, q, k, v, out) - return (m, ) + return io.NodeOutput(m) -class CLIPAttentionMultiply: + +class CLIPAttentionMultiply(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "clip": ("CLIP",), - "q": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "k": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "v": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "out": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - }} - RETURN_TYPES = ("CLIP",) - FUNCTION = "patch" + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="CLIPAttentionMultiply", + category="_for_testing/attention_experiments", + inputs=[ + io.Clip.Input("clip"), + io.Float.Input("q", default=1.0, min=0.0, max=10.0, step=0.01), + io.Float.Input("k", default=1.0, min=0.0, max=10.0, step=0.01), + io.Float.Input("v", default=1.0, min=0.0, max=10.0, step=0.01), + io.Float.Input("out", default=1.0, min=0.0, max=10.0, step=0.01), + ], + outputs=[io.Clip.Output()], + is_experimental=True, + ) - CATEGORY = "_for_testing/attention_experiments" - - def patch(self, clip, q, k, v, out): + @classmethod + def execute(cls, clip, q, k, v, out) -> io.NodeOutput: m = clip.clone() sd = m.patcher.model_state_dict() @@ -79,23 +97,28 @@ class CLIPAttentionMultiply: m.add_patches({key: (None,)}, 0.0, v) if key.endswith("self_attn.out_proj.weight") or key.endswith("self_attn.out_proj.bias"): m.add_patches({key: (None,)}, 0.0, out) - return (m, ) + return io.NodeOutput(m) -class UNetTemporalAttentionMultiply: + +class UNetTemporalAttentionMultiply(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "model": ("MODEL",), - "self_structural": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "self_temporal": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "cross_structural": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "cross_temporal": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="UNetTemporalAttentionMultiply", + category="_for_testing/attention_experiments", + inputs=[ + io.Model.Input("model"), + io.Float.Input("self_structural", default=1.0, min=0.0, max=10.0, step=0.01), + io.Float.Input("self_temporal", default=1.0, min=0.0, max=10.0, step=0.01), + io.Float.Input("cross_structural", default=1.0, min=0.0, max=10.0, step=0.01), + io.Float.Input("cross_temporal", default=1.0, min=0.0, max=10.0, step=0.01), + ], + outputs=[io.Model.Output()], + is_experimental=True, + ) - CATEGORY = "_for_testing/attention_experiments" - - def patch(self, model, self_structural, self_temporal, cross_structural, cross_temporal): + @classmethod + def execute(cls, model, self_structural, self_temporal, cross_structural, cross_temporal) -> io.NodeOutput: m = model.clone() sd 
= model.model_state_dict() @@ -110,11 +133,18 @@ class UNetTemporalAttentionMultiply: m.add_patches({k: (None,)}, 0.0, cross_temporal) else: m.add_patches({k: (None,)}, 0.0, cross_structural) - return (m, ) + return io.NodeOutput(m) -NODE_CLASS_MAPPINGS = { - "UNetSelfAttentionMultiply": UNetSelfAttentionMultiply, - "UNetCrossAttentionMultiply": UNetCrossAttentionMultiply, - "CLIPAttentionMultiply": CLIPAttentionMultiply, - "UNetTemporalAttentionMultiply": UNetTemporalAttentionMultiply, -} + +class AttentionMultiplyExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + UNetSelfAttentionMultiply, + UNetCrossAttentionMultiply, + CLIPAttentionMultiply, + UNetTemporalAttentionMultiply, + ] + +async def comfy_entrypoint() -> AttentionMultiplyExtension: + return AttentionMultiplyExtension() From eb39019daae96128ee848d0b7837ede299518a7c Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 22 Aug 2025 05:06:13 +0300 Subject: [PATCH 174/243] [V3] convert Google Veo API node to the V3 schema (#9272) * convert Google Veo API node to the V3 schema * use own full io.Schema for Veo3VideoGenerationNode * fixed typo * use auth_kwargs instead of auth_token/comfy_api_key --- comfy_api_nodes/nodes_veo2.py | 332 +++++++++++++++++++--------------- 1 file changed, 190 insertions(+), 142 deletions(-) diff --git a/comfy_api_nodes/nodes_veo2.py b/comfy_api_nodes/nodes_veo2.py index e25dab2f5..251aecd42 100644 --- a/comfy_api_nodes/nodes_veo2.py +++ b/comfy_api_nodes/nodes_veo2.py @@ -1,17 +1,18 @@ -import io import logging import base64 import aiohttp import torch +from io import BytesIO from typing import Optional +from typing_extensions import override -from comfy.comfy_types.node_typing import IO, ComfyNodeABC +from comfy_api.latest import ComfyExtension, io as comfy_io from comfy_api.input_impl.video_types import VideoFromFile from comfy_api_nodes.apis import ( VeoGenVidRequest, VeoGenVidResponse, VeoGenVidPollRequest, - VeoGenVidPollResponse + VeoGenVidPollResponse, ) from comfy_api_nodes.apis.client import ( ApiEndpoint, @@ -22,7 +23,7 @@ from comfy_api_nodes.apis.client import ( from comfy_api_nodes.apinode_utils import ( downscale_image_tensor, - tensor_to_base64_string + tensor_to_base64_string, ) AVERAGE_DURATION_VIDEO_GEN = 32 @@ -50,7 +51,7 @@ def get_video_url_from_response(poll_response: VeoGenVidPollResponse) -> Optiona return None -class VeoVideoGenerationNode(ComfyNodeABC): +class VeoVideoGenerationNode(comfy_io.ComfyNode): """ Generates videos from text prompts using Google's Veo API. 
@@ -59,101 +60,93 @@ class VeoVideoGenerationNode(ComfyNodeABC): """ @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Text description of the video", - }, + def define_schema(cls): + return comfy_io.Schema( + node_id="VeoVideoGenerationNode", + display_name="Google Veo 2 Video Generation", + category="api node/video/Veo", + description="Generates videos from text prompts using Google's Veo 2 API", + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Text description of the video", ), - "aspect_ratio": ( - IO.COMBO, - { - "options": ["16:9", "9:16"], - "default": "16:9", - "tooltip": "Aspect ratio of the output video", - }, + comfy_io.Combo.Input( + "aspect_ratio", + options=["16:9", "9:16"], + default="16:9", + tooltip="Aspect ratio of the output video", ), - }, - "optional": { - "negative_prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Negative text prompt to guide what to avoid in the video", - }, + comfy_io.String.Input( + "negative_prompt", + multiline=True, + default="", + tooltip="Negative text prompt to guide what to avoid in the video", + optional=True, ), - "duration_seconds": ( - IO.INT, - { - "default": 5, - "min": 5, - "max": 8, - "step": 1, - "display": "number", - "tooltip": "Duration of the output video in seconds", - }, + comfy_io.Int.Input( + "duration_seconds", + default=5, + min=5, + max=8, + step=1, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Duration of the output video in seconds", + optional=True, ), - "enhance_prompt": ( - IO.BOOLEAN, - { - "default": True, - "tooltip": "Whether to enhance the prompt with AI assistance", - } + comfy_io.Boolean.Input( + "enhance_prompt", + default=True, + tooltip="Whether to enhance the prompt with AI assistance", + optional=True, ), - "person_generation": ( - IO.COMBO, - { - "options": ["ALLOW", "BLOCK"], - "default": "ALLOW", - "tooltip": "Whether to allow generating people in the video", - }, + comfy_io.Combo.Input( + "person_generation", + options=["ALLOW", "BLOCK"], + default="ALLOW", + tooltip="Whether to allow generating people in the video", + optional=True, ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFF, - "step": 1, - "display": "number", - "control_after_generate": True, - "tooltip": "Seed for video generation (0 for random)", - }, + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFF, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed for video generation (0 for random)", + optional=True, ), - "image": (IO.IMAGE, { - "default": None, - "tooltip": "Optional reference image to guide video generation", - }), - "model": ( - IO.COMBO, - { - "options": ["veo-2.0-generate-001"], - "default": "veo-2.0-generate-001", - "tooltip": "Veo 2 model to use for video generation", - }, + comfy_io.Image.Input( + "image", + tooltip="Optional reference image to guide video generation", + optional=True, ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + comfy_io.Combo.Input( + "model", + options=["veo-2.0-generate-001"], + default="veo-2.0-generate-001", + tooltip="Veo 2 model to use for video generation", + optional=True, + ), + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + 
comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) - RETURN_TYPES = (IO.VIDEO,) - FUNCTION = "generate_video" - CATEGORY = "api node/video/Veo" - DESCRIPTION = "Generates videos from text prompts using Google's Veo 2 API" - API_NODE = True - - async def generate_video( - self, + @classmethod + async def execute( + cls, prompt, aspect_ratio="16:9", negative_prompt="", @@ -164,8 +157,6 @@ class VeoVideoGenerationNode(ComfyNodeABC): image=None, model="veo-2.0-generate-001", generate_audio=False, - unique_id: Optional[str] = None, - **kwargs, ): # Prepare the instances for the request instances = [] @@ -202,6 +193,10 @@ class VeoVideoGenerationNode(ComfyNodeABC): if "veo-3.0" in model: parameters["generateAudio"] = generate_audio + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } # Initial request to start video generation initial_operation = SynchronousOperation( endpoint=ApiEndpoint( @@ -214,7 +209,7 @@ class VeoVideoGenerationNode(ComfyNodeABC): instances=instances, parameters=parameters ), - auth_kwargs=kwargs, + auth_kwargs=auth, ) initial_response = await initial_operation.execute() @@ -248,10 +243,10 @@ class VeoVideoGenerationNode(ComfyNodeABC): request=VeoGenVidPollRequest( operationName=operation_name ), - auth_kwargs=kwargs, + auth_kwargs=auth, poll_interval=5.0, result_url_extractor=get_video_url_from_response, - node_id=unique_id, + node_id=cls.hidden.unique_id, estimated_duration=AVERAGE_DURATION_VIDEO_GEN, ) @@ -304,10 +299,10 @@ class VeoVideoGenerationNode(ComfyNodeABC): logging.info("Video generation completed successfully") # Convert video data to BytesIO object - video_io = io.BytesIO(video_data) + video_io = BytesIO(video_data) # Return VideoFromFile object - return (VideoFromFile(video_io),) + return comfy_io.NodeOutput(VideoFromFile(video_io)) class Veo3VideoGenerationNode(VeoVideoGenerationNode): @@ -323,51 +318,104 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode): """ @classmethod - def INPUT_TYPES(s): - parent_input = super().INPUT_TYPES() - - # Update model options for Veo 3 - parent_input["optional"]["model"] = ( - IO.COMBO, - { - "options": ["veo-3.0-generate-001", "veo-3.0-fast-generate-001"], - "default": "veo-3.0-generate-001", - "tooltip": "Veo 3 model to use for video generation", - }, + def define_schema(cls): + return comfy_io.Schema( + node_id="Veo3VideoGenerationNode", + display_name="Google Veo 3 Video Generation", + category="api node/video/Veo", + description="Generates videos from text prompts using Google's Veo 3 API", + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Text description of the video", + ), + comfy_io.Combo.Input( + "aspect_ratio", + options=["16:9", "9:16"], + default="16:9", + tooltip="Aspect ratio of the output video", + ), + comfy_io.String.Input( + "negative_prompt", + multiline=True, + default="", + tooltip="Negative text prompt to guide what to avoid in the video", + optional=True, + ), + comfy_io.Int.Input( + "duration_seconds", + default=8, + min=8, + max=8, + step=1, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Duration of the output video in seconds (Veo 3 only supports 8 seconds)", + optional=True, + ), + comfy_io.Boolean.Input( + "enhance_prompt", + default=True, + tooltip="Whether to enhance the prompt with AI assistance", + optional=True, + ), + comfy_io.Combo.Input( + "person_generation", + options=["ALLOW", "BLOCK"], + default="ALLOW", + tooltip="Whether to allow generating people in the video", 
+ optional=True, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFF, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed for video generation (0 for random)", + optional=True, + ), + comfy_io.Image.Input( + "image", + tooltip="Optional reference image to guide video generation", + optional=True, + ), + comfy_io.Combo.Input( + "model", + options=["veo-3.0-generate-001", "veo-3.0-fast-generate-001"], + default="veo-3.0-generate-001", + tooltip="Veo 3 model to use for video generation", + optional=True, + ), + comfy_io.Boolean.Input( + "generate_audio", + default=False, + tooltip="Generate audio for the video. Supported by all Veo 3 models.", + optional=True, + ), + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, ) - # Add generateAudio parameter - parent_input["optional"]["generate_audio"] = ( - IO.BOOLEAN, - { - "default": False, - "tooltip": "Generate audio for the video. Supported by all Veo 3 models.", - } - ) - # Update duration constraints for Veo 3 (only 8 seconds supported) - parent_input["optional"]["duration_seconds"] = ( - IO.INT, - { - "default": 8, - "min": 8, - "max": 8, - "step": 1, - "display": "number", - "tooltip": "Duration of the output video in seconds (Veo 3 only supports 8 seconds)", - }, - ) +class VeoExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + return [ + VeoVideoGenerationNode, + Veo3VideoGenerationNode, + ] - return parent_input - - -# Register the nodes -NODE_CLASS_MAPPINGS = { - "VeoVideoGenerationNode": VeoVideoGenerationNode, - "Veo3VideoGenerationNode": Veo3VideoGenerationNode, -} - -NODE_DISPLAY_NAME_MAPPINGS = { - "VeoVideoGenerationNode": "Google Veo 2 Video Generation", - "Veo3VideoGenerationNode": "Google Veo 3 Video Generation", -} +async def comfy_entrypoint() -> VeoExtension: + return VeoExtension() From 7ed73d12d13c2c389e0469c46c2db635a7d74278 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 22 Aug 2025 05:06:51 +0300 Subject: [PATCH 175/243] [V3] convert Ideogram API nodes to the V3 schema (#9278) * convert Ideogram API nodes to the V3 schema * use auth_kwargs instead of auth_token/comfy_api_key --- comfy_api_nodes/nodes_ideogram.py | 536 ++++++++++++++---------------- 1 file changed, 257 insertions(+), 279 deletions(-) diff --git a/comfy_api_nodes/nodes_ideogram.py b/comfy_api_nodes/nodes_ideogram.py index db24e6da4..d28895f3e 100644 --- a/comfy_api_nodes/nodes_ideogram.py +++ b/comfy_api_nodes/nodes_ideogram.py @@ -1,8 +1,8 @@ -from comfy.comfy_types.node_typing import IO, ComfyNodeABC, InputTypeDict -from inspect import cleandoc +from io import BytesIO +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io as comfy_io from PIL import Image import numpy as np -import io import torch from comfy_api_nodes.apis import ( IdeogramGenerateRequest, @@ -246,90 +246,81 @@ def display_image_urls_on_node(image_urls, node_id): PromptServer.instance.send_progress_text(urls_text, node_id) -class IdeogramV1(ComfyNodeABC): - """ - Generates images using the Ideogram V1 model. 
- """ - - def __init__(self): - pass +class IdeogramV1(comfy_io.ComfyNode): @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation", - }, + def define_schema(cls): + return comfy_io.Schema( + node_id="IdeogramV1", + display_name="Ideogram V1", + category="api node/image/Ideogram", + description="Generates images using the Ideogram V1 model.", + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the image generation", ), - "turbo": ( - IO.BOOLEAN, - { - "default": False, - "tooltip": "Whether to use turbo mode (faster generation, potentially lower quality)", - } + comfy_io.Boolean.Input( + "turbo", + default=False, + tooltip="Whether to use turbo mode (faster generation, potentially lower quality)", ), - }, - "optional": { - "aspect_ratio": ( - IO.COMBO, - { - "options": list(V1_V2_RATIO_MAP.keys()), - "default": "1:1", - "tooltip": "The aspect ratio for image generation.", - }, + comfy_io.Combo.Input( + "aspect_ratio", + options=list(V1_V2_RATIO_MAP.keys()), + default="1:1", + tooltip="The aspect ratio for image generation.", + optional=True, ), - "magic_prompt_option": ( - IO.COMBO, - { - "options": ["AUTO", "ON", "OFF"], - "default": "AUTO", - "tooltip": "Determine if MagicPrompt should be used in generation", - }, + comfy_io.Combo.Input( + "magic_prompt_option", + options=["AUTO", "ON", "OFF"], + default="AUTO", + tooltip="Determine if MagicPrompt should be used in generation", + optional=True, ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 2147483647, - "step": 1, - "control_after_generate": True, - "display": "number", - }, + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + control_after_generate=True, + display_mode=comfy_io.NumberDisplay.number, + optional=True, ), - "negative_prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Description of what to exclude from the image", - }, + comfy_io.String.Input( + "negative_prompt", + multiline=True, + default="", + tooltip="Description of what to exclude from the image", + optional=True, ), - "num_images": ( - IO.INT, - {"default": 1, "min": 1, "max": 8, "step": 1, "display": "number"}, + comfy_io.Int.Input( + "num_images", + default=1, + min=1, + max=8, + step=1, + display_mode=comfy_io.NumberDisplay.number, + optional=True, ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + ], + outputs=[ + comfy_io.Image.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + ) - RETURN_TYPES = (IO.IMAGE,) - FUNCTION = "api_call" - CATEGORY = "api node/image/Ideogram" - DESCRIPTION = cleandoc(__doc__ or "") - API_NODE = True - - async def api_call( - self, + @classmethod + async def execute( + cls, prompt, turbo=False, aspect_ratio="1:1", @@ -337,13 +328,15 @@ class IdeogramV1(ComfyNodeABC): seed=0, negative_prompt="", num_images=1, - unique_id=None, - **kwargs, ): # Determine the model based on turbo setting aspect_ratio = V1_V2_RATIO_MAP.get(aspect_ratio, None) model = "V_1_TURBO" if turbo else "V_1" + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } operation = SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/ideogram/generate", @@ -364,7 +357,7 
@@ class IdeogramV1(ComfyNodeABC): negative_prompt=negative_prompt if negative_prompt else None, ) ), - auth_kwargs=kwargs, + auth_kwargs=auth, ) response = await operation.execute() @@ -377,93 +370,85 @@ class IdeogramV1(ComfyNodeABC): if not image_urls: raise Exception("No image URLs were generated in the response") - display_image_urls_on_node(image_urls, unique_id) - return (await download_and_process_images(image_urls),) + display_image_urls_on_node(image_urls, cls.hidden.unique_id) + return comfy_io.NodeOutput(await download_and_process_images(image_urls)) -class IdeogramV2(ComfyNodeABC): - """ - Generates images using the Ideogram V2 model. - """ - - def __init__(self): - pass +class IdeogramV2(comfy_io.ComfyNode): @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation", - }, + def define_schema(cls): + return comfy_io.Schema( + node_id="IdeogramV2", + display_name="Ideogram V2", + category="api node/image/Ideogram", + description="Generates images using the Ideogram V2 model.", + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the image generation", ), - "turbo": ( - IO.BOOLEAN, - { - "default": False, - "tooltip": "Whether to use turbo mode (faster generation, potentially lower quality)", - } + comfy_io.Boolean.Input( + "turbo", + default=False, + tooltip="Whether to use turbo mode (faster generation, potentially lower quality)", ), - }, - "optional": { - "aspect_ratio": ( - IO.COMBO, - { - "options": list(V1_V2_RATIO_MAP.keys()), - "default": "1:1", - "tooltip": "The aspect ratio for image generation. Ignored if resolution is not set to AUTO.", - }, + comfy_io.Combo.Input( + "aspect_ratio", + options=list(V1_V2_RATIO_MAP.keys()), + default="1:1", + tooltip="The aspect ratio for image generation. Ignored if resolution is not set to AUTO.", + optional=True, ), - "resolution": ( - IO.COMBO, - { - "options": list(V1_V1_RES_MAP.keys()), - "default": "Auto", - "tooltip": "The resolution for image generation. If not set to AUTO, this overrides the aspect_ratio setting.", - }, + comfy_io.Combo.Input( + "resolution", + options=list(V1_V1_RES_MAP.keys()), + default="Auto", + tooltip="The resolution for image generation. 
" + "If not set to AUTO, this overrides the aspect_ratio setting.", + optional=True, ), - "magic_prompt_option": ( - IO.COMBO, - { - "options": ["AUTO", "ON", "OFF"], - "default": "AUTO", - "tooltip": "Determine if MagicPrompt should be used in generation", - }, + comfy_io.Combo.Input( + "magic_prompt_option", + options=["AUTO", "ON", "OFF"], + default="AUTO", + tooltip="Determine if MagicPrompt should be used in generation", + optional=True, ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 2147483647, - "step": 1, - "control_after_generate": True, - "display": "number", - }, + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + control_after_generate=True, + display_mode=comfy_io.NumberDisplay.number, + optional=True, ), - "style_type": ( - IO.COMBO, - { - "options": ["AUTO", "GENERAL", "REALISTIC", "DESIGN", "RENDER_3D", "ANIME"], - "default": "NONE", - "tooltip": "Style type for generation (V2 only)", - }, + comfy_io.Combo.Input( + "style_type", + options=["AUTO", "GENERAL", "REALISTIC", "DESIGN", "RENDER_3D", "ANIME"], + default="NONE", + tooltip="Style type for generation (V2 only)", + optional=True, ), - "negative_prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Description of what to exclude from the image", - }, + comfy_io.String.Input( + "negative_prompt", + multiline=True, + default="", + tooltip="Description of what to exclude from the image", + optional=True, ), - "num_images": ( - IO.INT, - {"default": 1, "min": 1, "max": 8, "step": 1, "display": "number"}, + comfy_io.Int.Input( + "num_images", + default=1, + min=1, + max=8, + step=1, + display_mode=comfy_io.NumberDisplay.number, + optional=True, ), #"color_palette": ( # IO.STRING, @@ -473,22 +458,20 @@ class IdeogramV2(ComfyNodeABC): # "tooltip": "Color palette preset name or hex colors with weights", # }, #), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + ], + outputs=[ + comfy_io.Image.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + ) - RETURN_TYPES = (IO.IMAGE,) - FUNCTION = "api_call" - CATEGORY = "api node/image/Ideogram" - DESCRIPTION = cleandoc(__doc__ or "") - API_NODE = True - - async def api_call( - self, + @classmethod + async def execute( + cls, prompt, turbo=False, aspect_ratio="1:1", @@ -499,8 +482,6 @@ class IdeogramV2(ComfyNodeABC): negative_prompt="", num_images=1, color_palette="", - unique_id=None, - **kwargs, ): aspect_ratio = V1_V2_RATIO_MAP.get(aspect_ratio, None) resolution = V1_V1_RES_MAP.get(resolution, None) @@ -517,6 +498,10 @@ class IdeogramV2(ComfyNodeABC): else: final_aspect_ratio = aspect_ratio if aspect_ratio != "ASPECT_1_1" else None + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } operation = SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/ideogram/generate", @@ -540,7 +525,7 @@ class IdeogramV2(ComfyNodeABC): color_palette=color_palette if color_palette else None, ) ), - auth_kwargs=kwargs, + auth_kwargs=auth, ) response = await operation.execute() @@ -553,108 +538,99 @@ class IdeogramV2(ComfyNodeABC): if not image_urls: raise Exception("No image URLs were generated in the response") - display_image_urls_on_node(image_urls, unique_id) - return (await download_and_process_images(image_urls),) + display_image_urls_on_node(image_urls, cls.hidden.unique_id) + return 
comfy_io.NodeOutput(await download_and_process_images(image_urls)) -class IdeogramV3(ComfyNodeABC): - """ - Generates images using the Ideogram V3 model. Supports both regular image generation from text prompts and image editing with mask. - """ - def __init__(self): - pass +class IdeogramV3(comfy_io.ComfyNode): @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation or editing", - }, + def define_schema(cls): + return comfy_io.Schema( + node_id="IdeogramV3", + display_name="Ideogram V3", + category="api node/image/Ideogram", + description="Generates images using the Ideogram V3 model. " + "Supports both regular image generation from text prompts and image editing with mask.", + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the image generation or editing", ), - }, - "optional": { - "image": ( - IO.IMAGE, - { - "default": None, - "tooltip": "Optional reference image for image editing.", - }, + comfy_io.Image.Input( + "image", + tooltip="Optional reference image for image editing.", + optional=True, ), - "mask": ( - IO.MASK, - { - "default": None, - "tooltip": "Optional mask for inpainting (white areas will be replaced)", - }, + comfy_io.Mask.Input( + "mask", + tooltip="Optional mask for inpainting (white areas will be replaced)", + optional=True, ), - "aspect_ratio": ( - IO.COMBO, - { - "options": list(V3_RATIO_MAP.keys()), - "default": "1:1", - "tooltip": "The aspect ratio for image generation. Ignored if resolution is not set to Auto.", - }, + comfy_io.Combo.Input( + "aspect_ratio", + options=list(V3_RATIO_MAP.keys()), + default="1:1", + tooltip="The aspect ratio for image generation. Ignored if resolution is not set to Auto.", + optional=True, ), - "resolution": ( - IO.COMBO, - { - "options": V3_RESOLUTIONS, - "default": "Auto", - "tooltip": "The resolution for image generation. If not set to Auto, this overrides the aspect_ratio setting.", - }, + comfy_io.Combo.Input( + "resolution", + options=V3_RESOLUTIONS, + default="Auto", + tooltip="The resolution for image generation. 
" + "If not set to Auto, this overrides the aspect_ratio setting.", + optional=True, ), - "magic_prompt_option": ( - IO.COMBO, - { - "options": ["AUTO", "ON", "OFF"], - "default": "AUTO", - "tooltip": "Determine if MagicPrompt should be used in generation", - }, + comfy_io.Combo.Input( + "magic_prompt_option", + options=["AUTO", "ON", "OFF"], + default="AUTO", + tooltip="Determine if MagicPrompt should be used in generation", + optional=True, ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 2147483647, - "step": 1, - "control_after_generate": True, - "display": "number", - }, + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + control_after_generate=True, + display_mode=comfy_io.NumberDisplay.number, + optional=True, ), - "num_images": ( - IO.INT, - {"default": 1, "min": 1, "max": 8, "step": 1, "display": "number"}, + comfy_io.Int.Input( + "num_images", + default=1, + min=1, + max=8, + step=1, + display_mode=comfy_io.NumberDisplay.number, + optional=True, ), - "rendering_speed": ( - IO.COMBO, - { - "options": ["BALANCED", "TURBO", "QUALITY"], - "default": "BALANCED", - "tooltip": "Controls the trade-off between generation speed and quality", - }, + comfy_io.Combo.Input( + "rendering_speed", + options=["BALANCED", "TURBO", "QUALITY"], + default="BALANCED", + tooltip="Controls the trade-off between generation speed and quality", + optional=True, ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + ], + outputs=[ + comfy_io.Image.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + ) - RETURN_TYPES = (IO.IMAGE,) - FUNCTION = "api_call" - CATEGORY = "api node/image/Ideogram" - DESCRIPTION = cleandoc(__doc__ or "") - API_NODE = True - - async def api_call( - self, + @classmethod + async def execute( + cls, prompt, image=None, mask=None, @@ -664,9 +640,11 @@ class IdeogramV3(ComfyNodeABC): seed=0, num_images=1, rendering_speed="BALANCED", - unique_id=None, - **kwargs, ): + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } # Check if both image and mask are provided for editing mode if image is not None and mask is not None: # Edit mode @@ -686,7 +664,7 @@ class IdeogramV3(ComfyNodeABC): # Process image img_np = (input_tensor.numpy() * 255).astype(np.uint8) img = Image.fromarray(img_np) - img_byte_arr = io.BytesIO() + img_byte_arr = BytesIO() img.save(img_byte_arr, format="PNG") img_byte_arr.seek(0) img_binary = img_byte_arr @@ -695,7 +673,7 @@ class IdeogramV3(ComfyNodeABC): # Process mask - white areas will be replaced mask_np = (mask.squeeze().cpu().numpy() * 255).astype(np.uint8) mask_img = Image.fromarray(mask_np) - mask_byte_arr = io.BytesIO() + mask_byte_arr = BytesIO() mask_img.save(mask_byte_arr, format="PNG") mask_byte_arr.seek(0) mask_binary = mask_byte_arr @@ -729,7 +707,7 @@ class IdeogramV3(ComfyNodeABC): "mask": mask_binary, }, content_type="multipart/form-data", - auth_kwargs=kwargs, + auth_kwargs=auth, ) elif image is not None or mask is not None: @@ -770,7 +748,7 @@ class IdeogramV3(ComfyNodeABC): response_model=IdeogramGenerateResponse, ), request=gen_request, - auth_kwargs=kwargs, + auth_kwargs=auth, ) # Execute the operation and process response @@ -784,18 +762,18 @@ class IdeogramV3(ComfyNodeABC): if not image_urls: raise Exception("No image URLs were generated in the response") - 
display_image_urls_on_node(image_urls, unique_id) - return (await download_and_process_images(image_urls),) + display_image_urls_on_node(image_urls, cls.hidden.unique_id) + return comfy_io.NodeOutput(await download_and_process_images(image_urls)) -NODE_CLASS_MAPPINGS = { - "IdeogramV1": IdeogramV1, - "IdeogramV2": IdeogramV2, - "IdeogramV3": IdeogramV3, -} +class IdeogramExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + return [ + IdeogramV1, + IdeogramV2, + IdeogramV3, + ] -NODE_DISPLAY_NAME_MAPPINGS = { - "IdeogramV1": "Ideogram V1", - "IdeogramV2": "Ideogram V2", - "IdeogramV3": "Ideogram V3", -} +async def comfy_entrypoint() -> IdeogramExtension: + return IdeogramExtension() From f7bd5e58dd03e799e02f6851b84b51e14ad0da7b Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 21 Aug 2025 20:18:04 -0700 Subject: [PATCH 176/243] Make it easier to implement future qwen controlnets. (#9485) --- comfy/controlnet.py | 4 ++-- comfy/ldm/qwen_image/model.py | 16 +++++++++++++--- comfy/model_detection.py | 2 ++ 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/comfy/controlnet.py b/comfy/controlnet.py index 988acdb57..6cb69dcdf 100644 --- a/comfy/controlnet.py +++ b/comfy/controlnet.py @@ -236,11 +236,11 @@ class ControlNet(ControlBase): self.cond_hint = None compression_ratio = self.compression_ratio if self.vae is not None: - compression_ratio *= self.vae.downscale_ratio + compression_ratio *= self.vae.spacial_compression_encode() else: if self.latent_format is not None: raise ValueError("This Controlnet needs a VAE but none was provided, please use a ControlNetApply node with a VAE input and connect it.") - self.cond_hint = comfy.utils.common_upscale(self.cond_hint_original, x_noisy.shape[3] * compression_ratio, x_noisy.shape[2] * compression_ratio, self.upscale_algorithm, "center") + self.cond_hint = comfy.utils.common_upscale(self.cond_hint_original, x_noisy.shape[-1] * compression_ratio, x_noisy.shape[-2] * compression_ratio, self.upscale_algorithm, "center") self.cond_hint = self.preprocess_image(self.cond_hint) if self.vae is not None: loaded_models = comfy.model_management.loaded_models(only_currently_used=True) diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index 2503583cb..d0e39833a 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -293,6 +293,7 @@ class QwenImageTransformer2DModel(nn.Module): guidance_embeds: bool = False, axes_dims_rope: Tuple[int, int, int] = (16, 56, 56), image_model=None, + final_layer=True, dtype=None, device=None, operations=None, @@ -300,6 +301,7 @@ class QwenImageTransformer2DModel(nn.Module): super().__init__() self.dtype = dtype self.patch_size = patch_size + self.in_channels = in_channels self.out_channels = out_channels or in_channels self.inner_dim = num_attention_heads * attention_head_dim @@ -329,9 +331,9 @@ class QwenImageTransformer2DModel(nn.Module): for _ in range(num_layers) ]) - self.norm_out = LastLayer(self.inner_dim, self.inner_dim, dtype=dtype, device=device, operations=operations) - self.proj_out = operations.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True, dtype=dtype, device=device) - self.gradient_checkpointing = False + if final_layer: + self.norm_out = LastLayer(self.inner_dim, self.inner_dim, dtype=dtype, device=device, operations=operations) + self.proj_out = operations.Linear(self.inner_dim, patch_size * patch_size * 
self.out_channels, bias=True, dtype=dtype, device=device) def process_img(self, x, index=0, h_offset=0, w_offset=0): bs, c, t, h, w = x.shape @@ -362,6 +364,7 @@ class QwenImageTransformer2DModel(nn.Module): guidance: torch.Tensor = None, ref_latents=None, transformer_options={}, + control=None, **kwargs ): timestep = timesteps @@ -443,6 +446,13 @@ class QwenImageTransformer2DModel(nn.Module): hidden_states = out["img"] encoder_hidden_states = out["txt"] + if control is not None: # Controlnet + control_i = control.get("input") + if i < len(control_i): + add = control_i[i] + if add is not None: + hidden_states += add + hidden_states = self.norm_out(hidden_states, temb) hidden_states = self.proj_out(hidden_states) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 2bec0541e..0caff53e0 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -492,6 +492,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): if '{}txt_norm.weight'.format(key_prefix) in state_dict_keys: # Qwen Image dit_config = {} dit_config["image_model"] = "qwen_image" + dit_config["in_channels"] = state_dict['{}img_in.weight'.format(key_prefix)].shape[1] + dit_config["num_layers"] = count_blocks(state_dict_keys, '{}transformer_blocks.'.format(key_prefix) + '{}.') return dit_config if '{}input_blocks.0.0.weight'.format(key_prefix) not in state_dict_keys: From ff57793659702d502506047445f0972b10b6b9fe Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 21 Aug 2025 21:53:11 -0700 Subject: [PATCH 177/243] Support InstantX Qwen controlnet. (#9488) --- comfy/controlnet.py | 13 +++++ comfy/ldm/qwen_image/controlnet.py | 77 ++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 comfy/ldm/qwen_image/controlnet.py diff --git a/comfy/controlnet.py b/comfy/controlnet.py index 6cb69dcdf..e3dfedf55 100644 --- a/comfy/controlnet.py +++ b/comfy/controlnet.py @@ -36,6 +36,7 @@ import comfy.ldm.cascade.controlnet import comfy.cldm.mmdit import comfy.ldm.hydit.controlnet import comfy.ldm.flux.controlnet +import comfy.ldm.qwen_image.controlnet import comfy.cldm.dit_embedder from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -582,6 +583,15 @@ def load_controlnet_flux_instantx(sd, model_options={}): control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, concat_mask=concat_mask, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds) return control +def load_controlnet_qwen_instantx(sd, model_options={}): + model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(sd, model_options=model_options) + control_model = comfy.ldm.qwen_image.controlnet.QwenImageControlNetModel(operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config) + control_model = controlnet_load_state_dict(control_model, sd) + latent_format = comfy.latent_formats.Wan21() + extra_conds = [] + control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds) + return control + def convert_mistoline(sd): return comfy.utils.state_dict_prefix_replace(sd, {"single_controlnet_blocks.": "controlnet_single_blocks."}) @@ -655,8 +665,11 @@ def load_controlnet_state_dict(state_dict, model=None, model_options={}): return load_controlnet_sd35(controlnet_data, model_options=model_options) #Stability sd3.5 format else: 
return load_controlnet_mmdit(controlnet_data, model_options=model_options) #SD3 diffusers controlnet + elif "transformer_blocks.0.img_mlp.net.0.proj.weight" in controlnet_data: + return load_controlnet_qwen_instantx(controlnet_data, model_options=model_options) elif "controlnet_x_embedder.weight" in controlnet_data: return load_controlnet_flux_instantx(controlnet_data, model_options=model_options) + elif "controlnet_blocks.0.linear.weight" in controlnet_data: #mistoline flux return load_controlnet_flux_xlabs_mistoline(convert_mistoline(controlnet_data), mistoline=True, model_options=model_options) diff --git a/comfy/ldm/qwen_image/controlnet.py b/comfy/ldm/qwen_image/controlnet.py new file mode 100644 index 000000000..92ac3cf0a --- /dev/null +++ b/comfy/ldm/qwen_image/controlnet.py @@ -0,0 +1,77 @@ +import torch +import math + +from .model import QwenImageTransformer2DModel + + +class QwenImageControlNetModel(QwenImageTransformer2DModel): + def __init__( + self, + extra_condition_channels=0, + dtype=None, + device=None, + operations=None, + **kwargs + ): + super().__init__(final_layer=False, dtype=dtype, device=device, operations=operations, **kwargs) + self.main_model_double = 60 + + # controlnet_blocks + self.controlnet_blocks = torch.nn.ModuleList([]) + for _ in range(len(self.transformer_blocks)): + self.controlnet_blocks.append(operations.Linear(self.inner_dim, self.inner_dim, device=device, dtype=dtype)) + self.controlnet_x_embedder = operations.Linear(self.in_channels + extra_condition_channels, self.inner_dim, device=device, dtype=dtype) + + def forward( + self, + x, + timesteps, + context, + attention_mask=None, + guidance: torch.Tensor = None, + ref_latents=None, + hint=None, + transformer_options={}, + **kwargs + ): + timestep = timesteps + encoder_hidden_states = context + encoder_hidden_states_mask = attention_mask + + hidden_states, img_ids, orig_shape = self.process_img(x) + hint, _, _ = self.process_img(hint) + + txt_start = round(max(((x.shape[-1] + (self.patch_size // 2)) // self.patch_size) // 2, ((x.shape[-2] + (self.patch_size // 2)) // self.patch_size) // 2)) + txt_ids = torch.arange(txt_start, txt_start + context.shape[1], device=x.device).reshape(1, -1, 1).repeat(x.shape[0], 1, 3) + ids = torch.cat((txt_ids, img_ids), dim=1) + image_rotary_emb = self.pe_embedder(ids).squeeze(1).unsqueeze(2).to(x.dtype) + del ids, txt_ids, img_ids + + hidden_states = self.img_in(hidden_states) + self.controlnet_x_embedder(hint) + encoder_hidden_states = self.txt_norm(encoder_hidden_states) + encoder_hidden_states = self.txt_in(encoder_hidden_states) + + if guidance is not None: + guidance = guidance * 1000 + + temb = ( + self.time_text_embed(timestep, hidden_states) + if guidance is None + else self.time_text_embed(timestep, guidance, hidden_states) + ) + + repeat = math.ceil(self.main_model_double / len(self.controlnet_blocks)) + + controlnet_block_samples = () + for i, block in enumerate(self.transformer_blocks): + encoder_hidden_states, hidden_states = block( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + encoder_hidden_states_mask=encoder_hidden_states_mask, + temb=temb, + image_rotary_emb=image_rotary_emb, + ) + + controlnet_block_samples = controlnet_block_samples + (self.controlnet_blocks[i](hidden_states),) * repeat + + return {"input": controlnet_block_samples[:self.main_model_double]} From 497d41fb500668635aa4a782549e9a0caa24375e Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 22 Aug 
2025 20:50:35 +0300 Subject: [PATCH 178/243] feat(api-nodes): change "OpenAI Chat" display name to "OpenAI ChatGPT" (#9443) --- comfy_api_nodes/nodes_openai.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/comfy_api_nodes/nodes_openai.py b/comfy_api_nodes/nodes_openai.py index 674c9ede0..e3b81de75 100644 --- a/comfy_api_nodes/nodes_openai.py +++ b/comfy_api_nodes/nodes_openai.py @@ -998,7 +998,7 @@ NODE_DISPLAY_NAME_MAPPINGS = { "OpenAIDalle2": "OpenAI DALL·E 2", "OpenAIDalle3": "OpenAI DALL·E 3", "OpenAIGPTImage1": "OpenAI GPT Image 1", - "OpenAIChatNode": "OpenAI Chat", - "OpenAIInputFiles": "OpenAI Chat Input Files", - "OpenAIChatConfig": "OpenAI Chat Advanced Options", + "OpenAIChatNode": "OpenAI ChatGPT", + "OpenAIInputFiles": "OpenAI ChatGPT Input Files", + "OpenAIChatConfig": "OpenAI ChatGPT Advanced Options", } From 050c67323c33f6543309d4f09df706ec8c9a1389 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 22 Aug 2025 20:51:14 +0300 Subject: [PATCH 179/243] feat(api-nodes): add copy button to Gemini Chat node (#9440) --- comfy_api_nodes/nodes_gemini.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index ba4167a50..78c402a7a 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -5,7 +5,10 @@ See: https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/infer from __future__ import annotations +import json +import time import os +import uuid from enum import Enum from typing import Optional, Literal @@ -350,7 +353,27 @@ class GeminiNode(ComfyNodeABC): # Get result output output_text = self.get_text_from_response(response) if unique_id and output_text: - PromptServer.instance.send_progress_text(output_text, node_id=unique_id) + # Not a true chat history like the OpenAI Chat node. It is emulated so the frontend can show a copy button. 
+ render_spec = { + "node_id": unique_id, + "component": "ChatHistoryWidget", + "props": { + "history": json.dumps( + [ + { + "prompt": prompt, + "response": output_text, + "response_id": str(uuid.uuid4()), + "timestamp": time.time(), + } + ] + ), + }, + } + PromptServer.instance.send_sync( + "display_component", + render_spec, + ) return (output_text or "Empty response from Gemini model...",) From ca4e96a8ae6c9ee8d40fe35100ed9b2247e71e40 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Sat, 23 Aug 2025 05:40:18 +0800 Subject: [PATCH 180/243] Update template to 0.1.65 (#9501) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8d928d826..6b53fabc1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.25.9 -comfyui-workflow-templates==0.1.62 +comfyui-workflow-templates==0.1.65 comfyui-embedded-docs==0.2.6 torch torchsde From fe31ad02768c66c61b3dc12f5d4bdfe8990ce25c Mon Sep 17 00:00:00 2001 From: contentis Date: Sat, 23 Aug 2025 01:39:15 +0200 Subject: [PATCH 181/243] Add elementwise fusions (#9495) * Add elementwise fusions * Add addcmul pattern to Qwen --- comfy/ldm/modules/diffusionmodules/mmdit.py | 12 +++++++----- comfy/ldm/qwen_image/model.py | 12 ++++++------ comfy/ldm/wan/model.py | 14 +++++++------- 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/comfy/ldm/modules/diffusionmodules/mmdit.py b/comfy/ldm/modules/diffusionmodules/mmdit.py index eaf3e73a4..4d6beba2d 100644 --- a/comfy/ldm/modules/diffusionmodules/mmdit.py +++ b/comfy/ldm/modules/diffusionmodules/mmdit.py @@ -109,7 +109,7 @@ class PatchEmbed(nn.Module): def modulate(x, shift, scale): if shift is None: shift = torch.zeros_like(scale) - return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1) + return torch.addcmul(shift.unsqueeze(1), x, 1+ scale.unsqueeze(1)) ################################################################################# @@ -564,10 +564,7 @@ class DismantledBlock(nn.Module): assert not self.pre_only attn1 = self.attn.post_attention(attn) attn2 = self.attn2.post_attention(attn2) - out1 = gate_msa.unsqueeze(1) * attn1 - out2 = gate_msa2.unsqueeze(1) * attn2 - x = x + out1 - x = x + out2 + x = gate_cat(x, gate_msa, gate_msa2, attn1, attn2) x = x + gate_mlp.unsqueeze(1) * self.mlp( modulate(self.norm2(x), shift_mlp, scale_mlp) ) @@ -594,6 +591,11 @@ class DismantledBlock(nn.Module): ) return self.post_attention(attn, *intermediates) +def gate_cat(x, gate_msa, gate_msa2, attn1, attn2): + out1 = gate_msa.unsqueeze(1) * attn1 + out2 = gate_msa2.unsqueeze(1) * attn2 + x = torch.stack([x, out1, out2], dim=0).sum(dim=0) + return x def block_mixing(*args, use_checkpoint=True, **kwargs): if use_checkpoint: diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index d0e39833a..af00ff119 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -214,9 +214,9 @@ class QwenImageTransformerBlock(nn.Module): operations=operations, ) - def _modulate(self, x, mod_params): - shift, scale, gate = mod_params.chunk(3, dim=-1) - return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1), gate.unsqueeze(1) + def _modulate(self, x: torch.Tensor, mod_params: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + shift, scale, gate = torch.chunk(mod_params, 3, dim=-1) + return torch.addcmul(shift.unsqueeze(1), x, 1 + scale.unsqueeze(1)), gate.unsqueeze(1) def forward( self, @@ -248,11 +248,11 @@ class QwenImageTransformerBlock(nn.Module): img_normed2 = 
self.img_norm2(hidden_states) img_modulated2, img_gate2 = self._modulate(img_normed2, img_mod2) - hidden_states = hidden_states + img_gate2 * self.img_mlp(img_modulated2) + hidden_states = torch.addcmul(hidden_states, img_gate2, self.img_mlp(img_modulated2)) txt_normed2 = self.txt_norm2(encoder_hidden_states) txt_modulated2, txt_gate2 = self._modulate(txt_normed2, txt_mod2) - encoder_hidden_states = encoder_hidden_states + txt_gate2 * self.txt_mlp(txt_modulated2) + encoder_hidden_states = torch.addcmul(encoder_hidden_states, txt_gate2, self.txt_mlp(txt_modulated2)) return encoder_hidden_states, hidden_states @@ -275,7 +275,7 @@ class LastLayer(nn.Module): def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torch.Tensor: emb = self.linear(self.silu(conditioning_embedding)) scale, shift = torch.chunk(emb, 2, dim=1) - x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :] + x = torch.addcmul(shift[:, None, :], self.norm(x), (1 + scale)[:, None, :]) return x diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index 9d3741be3..0726b8e1b 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -148,8 +148,8 @@ WAN_CROSSATTENTION_CLASSES = { def repeat_e(e, x): repeats = 1 - if e.shape[1] > 1: - repeats = x.shape[1] // e.shape[1] + if e.size(1) > 1: + repeats = x.size(1) // e.size(1) if repeats == 1: return e return torch.repeat_interleave(e, repeats, dim=1) @@ -219,15 +219,15 @@ class WanAttentionBlock(nn.Module): # self-attention y = self.self_attn( - self.norm1(x) * (1 + repeat_e(e[1], x)) + repeat_e(e[0], x), + torch.addcmul(repeat_e(e[0], x), self.norm1(x), 1 + repeat_e(e[1], x)), freqs) - x = x + y * repeat_e(e[2], x) + x = torch.addcmul(x, y, repeat_e(e[2], x)) # cross-attention & ffn x = x + self.cross_attn(self.norm3(x), context, context_img_len=context_img_len) - y = self.ffn(self.norm2(x) * (1 + repeat_e(e[4], x)) + repeat_e(e[3], x)) - x = x + y * repeat_e(e[5], x) + y = self.ffn(torch.addcmul(repeat_e(e[3], x), self.norm2(x), 1 + repeat_e(e[4], x))) + x = torch.addcmul(x, y, repeat_e(e[5], x)) return x @@ -342,7 +342,7 @@ class Head(nn.Module): else: e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device).unsqueeze(0) + e.unsqueeze(2)).unbind(2) - x = (self.head(self.norm(x) * (1 + repeat_e(e[1], x)) + repeat_e(e[0], x))) + x = (self.head(torch.addcmul(repeat_e(e[0], x), self.norm(x), 1 + repeat_e(e[1], x)))) return x From fc247150fec502b1834390516b556a87003f1d79 Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Fri, 22 Aug 2025 19:41:08 -0700 Subject: [PATCH 182/243] Implement EasyCache and Invent LazyCache (#9496) * Attempting a universal implementation of EasyCache, starting with flux as test; I screwed up the math a bit, but when I set it just right it works. 
* Fixed math to make threshold work as expected, refactored code to use EasyCacheHolder instead of a dict wrapped by object * Use sigmas from transformer_options instead of timesteps to be compatible with a greater amount of models, make end_percent work * Make log statement when not skipping useful, preparing for per-cond caching * Added DIFFUSION_MODEL wrapper around forward function for wan model * Add subsampling for heuristic inputs * Add subsampling to output_prev (output_prev_subsampled now) * Properly consider conds in EasyCache logic * Created SuperEasyCache to test what happens if caching and reuse is moved outside the scope of conds, added PREDICT_NOISE wrapper to facilitate this test * Change max reuse_threshold to 3.0 * Mark EasyCache/SuperEasyCache as experimental (beta) * Make Lumina2 compatible with EasyCache * Add EasyCache support for Qwen Image * Fix missing comma, curse you Cursor * Add EasyCache support to AceStep * Add EasyCache support to Chroma * Added EasyCache support to Cosmos Predict t2i * Make EasyCache not crash with Cosmos Predict ImagToVideo latents, but does not work well at all * Add EasyCache support to hidream * Added EasyCache support to hunyuan video * Added EasyCache support to hunyuan3d * Added EasyCache support to LTXV (not very good, but does not crash) * Implemented EasyCache for aura_flow * Renamed SuperEasyCache to LazyCache, hardcoded subsample_factor to 8 on nodes * Eatra logging when verbose is true for EasyCache --- comfy/ldm/ace/model.py | 24 +- comfy/ldm/aura/mmdit.py | 8 + comfy/ldm/chroma/model.py | 8 + comfy/ldm/cosmos/model.py | 38 +++ comfy/ldm/cosmos/predict2.py | 17 +- comfy/ldm/flux/model.py | 8 + comfy/ldm/hidream/model.py | 19 +- comfy/ldm/hunyuan3d/model.py | 8 + comfy/ldm/hunyuan_video/model.py | 8 + comfy/ldm/lightricks/model.py | 8 + comfy/ldm/lumina/model.py | 10 +- comfy/ldm/qwen_image/model.py | 10 +- comfy/ldm/wan/model.py | 8 + comfy/patcher_extension.py | 1 + comfy/samplers.py | 9 +- comfy_extras/nodes_easycache.py | 459 +++++++++++++++++++++++++++++++ nodes.py | 3 +- 17 files changed, 639 insertions(+), 7 deletions(-) create mode 100644 comfy_extras/nodes_easycache.py diff --git a/comfy/ldm/ace/model.py b/comfy/ldm/ace/model.py index 12c524701..41d85eeb5 100644 --- a/comfy/ldm/ace/model.py +++ b/comfy/ldm/ace/model.py @@ -19,6 +19,7 @@ import torch from torch import nn import comfy.model_management +import comfy.patcher_extension from comfy.ldm.lightricks.model import TimestepEmbedding, Timesteps from .attention import LinearTransformerBlock, t2i_modulate @@ -343,7 +344,28 @@ class ACEStepTransformer2DModel(nn.Module): output = self.final_layer(hidden_states, embedded_timestep, output_length) return output - def forward( + def forward(self, + x, + timestep, + attention_mask=None, + context: Optional[torch.Tensor] = None, + text_attention_mask: Optional[torch.LongTensor] = None, + speaker_embeds: Optional[torch.FloatTensor] = None, + lyric_token_idx: Optional[torch.LongTensor] = None, + lyric_mask: Optional[torch.LongTensor] = None, + block_controlnet_hidden_states: Optional[Union[List[torch.Tensor], torch.Tensor]] = None, + controlnet_scale: Union[float, torch.Tensor] = 1.0, + lyrics_strength=1.0, + **kwargs + ): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, + self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, kwargs.get("transformer_options", {})) + ).execute(x, timestep, attention_mask, context, text_attention_mask, 
speaker_embeds, lyric_token_idx, lyric_mask, block_controlnet_hidden_states, + controlnet_scale, lyrics_strength, **kwargs) + + def _forward( self, x, timestep, diff --git a/comfy/ldm/aura/mmdit.py b/comfy/ldm/aura/mmdit.py index 1258ae11f..d7f32b5e8 100644 --- a/comfy/ldm/aura/mmdit.py +++ b/comfy/ldm/aura/mmdit.py @@ -9,6 +9,7 @@ import torch.nn.functional as F from comfy.ldm.modules.attention import optimized_attention import comfy.ops +import comfy.patcher_extension import comfy.ldm.common_dit def modulate(x, shift, scale): @@ -436,6 +437,13 @@ class MMDiT(nn.Module): return x + pos_encoding.reshape(1, -1, self.positional_encoding.shape[-1]) def forward(self, x, timestep, context, transformer_options={}, **kwargs): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, + self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options) + ).execute(x, timestep, context, transformer_options, **kwargs) + + def _forward(self, x, timestep, context, transformer_options={}, **kwargs): patches_replace = transformer_options.get("patches_replace", {}) # patchify x, add PE b, c, h, w = x.shape diff --git a/comfy/ldm/chroma/model.py b/comfy/ldm/chroma/model.py index 06021d4f2..5cff44dc8 100644 --- a/comfy/ldm/chroma/model.py +++ b/comfy/ldm/chroma/model.py @@ -5,6 +5,7 @@ from dataclasses import dataclass import torch from torch import Tensor, nn from einops import rearrange, repeat +import comfy.patcher_extension import comfy.ldm.common_dit from comfy.ldm.flux.layers import ( @@ -253,6 +254,13 @@ class Chroma(nn.Module): return img def forward(self, x, timestep, context, guidance, control=None, transformer_options={}, **kwargs): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, + self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options) + ).execute(x, timestep, context, guidance, control, transformer_options, **kwargs) + + def _forward(self, x, timestep, context, guidance, control=None, transformer_options={}, **kwargs): bs, c, h, w = x.shape x = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size)) diff --git a/comfy/ldm/cosmos/model.py b/comfy/ldm/cosmos/model.py index 4836e0b69..53698b758 100644 --- a/comfy/ldm/cosmos/model.py +++ b/comfy/ldm/cosmos/model.py @@ -27,6 +27,8 @@ from torchvision import transforms from enum import Enum import logging +import comfy.patcher_extension + from .blocks import ( FinalLayer, GeneralDITTransformerBlock, @@ -435,6 +437,42 @@ class GeneralDIT(nn.Module): latent_condition_sigma: Optional[torch.Tensor] = None, condition_video_augment_sigma: Optional[torch.Tensor] = None, **kwargs, + ): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, + self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, kwargs.get("transformer_options", {})) + ).execute(x, + timesteps, + context, + attention_mask, + fps, + image_size, + padding_mask, + scalar_feature, + data_type, + latent_condition, + latent_condition_sigma, + condition_video_augment_sigma, + **kwargs) + + def _forward( + self, + x: torch.Tensor, + timesteps: torch.Tensor, + context: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + # crossattn_emb: torch.Tensor, + # crossattn_mask: Optional[torch.Tensor] = None, + fps: Optional[torch.Tensor] = None, + image_size: Optional[torch.Tensor] = None, + padding_mask: 
Optional[torch.Tensor] = None, + scalar_feature: Optional[torch.Tensor] = None, + data_type: Optional[DataType] = DataType.VIDEO, + latent_condition: Optional[torch.Tensor] = None, + latent_condition_sigma: Optional[torch.Tensor] = None, + condition_video_augment_sigma: Optional[torch.Tensor] = None, + **kwargs, ): """ Args: diff --git a/comfy/ldm/cosmos/predict2.py b/comfy/ldm/cosmos/predict2.py index 316117f77..fcc83ba76 100644 --- a/comfy/ldm/cosmos/predict2.py +++ b/comfy/ldm/cosmos/predict2.py @@ -11,6 +11,7 @@ import math from .position_embedding import VideoRopePosition3DEmb, LearnablePosEmbAxis from torchvision import transforms +import comfy.patcher_extension from comfy.ldm.modules.attention import optimized_attention def apply_rotary_pos_emb( @@ -805,7 +806,21 @@ class MiniTrainDIT(nn.Module): ) return x_B_C_Tt_Hp_Wp - def forward( + def forward(self, + x: torch.Tensor, + timesteps: torch.Tensor, + context: torch.Tensor, + fps: Optional[torch.Tensor] = None, + padding_mask: Optional[torch.Tensor] = None, + **kwargs, + ): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, + self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, kwargs.get("transformer_options", {})) + ).execute(x, timesteps, context, fps, padding_mask, **kwargs) + + def _forward( self, x: torch.Tensor, timesteps: torch.Tensor, diff --git a/comfy/ldm/flux/model.py b/comfy/ldm/flux/model.py index c4de82795..0a77fa097 100644 --- a/comfy/ldm/flux/model.py +++ b/comfy/ldm/flux/model.py @@ -6,6 +6,7 @@ import torch from torch import Tensor, nn from einops import rearrange, repeat import comfy.ldm.common_dit +import comfy.patcher_extension from .layers import ( DoubleStreamBlock, @@ -214,6 +215,13 @@ class Flux(nn.Module): return img, repeat(img_ids, "h w c -> b (h w) c", b=bs) def forward(self, x, timestep, context, y=None, guidance=None, ref_latents=None, control=None, transformer_options={}, **kwargs): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, + self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options) + ).execute(x, timestep, context, y, guidance, ref_latents, control, transformer_options, **kwargs) + + def _forward(self, x, timestep, context, y=None, guidance=None, ref_latents=None, control=None, transformer_options={}, **kwargs): bs, c, h_orig, w_orig = x.shape patch_size = self.patch_size diff --git a/comfy/ldm/hidream/model.py b/comfy/ldm/hidream/model.py index 0305747bf..ae49cf945 100644 --- a/comfy/ldm/hidream/model.py +++ b/comfy/ldm/hidream/model.py @@ -13,6 +13,7 @@ from comfy.ldm.flux.layers import LastLayer from comfy.ldm.modules.attention import optimized_attention import comfy.model_management +import comfy.patcher_extension import comfy.ldm.common_dit @@ -692,7 +693,23 @@ class HiDreamImageTransformer2DModel(nn.Module): raise NotImplementedError return x, x_masks, img_sizes - def forward( + def forward(self, + x: torch.Tensor, + t: torch.Tensor, + y: Optional[torch.Tensor] = None, + context: Optional[torch.Tensor] = None, + encoder_hidden_states_llama3=None, + image_cond=None, + control = None, + transformer_options = {}, + ): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, + self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options) + ).execute(x, t, y, context, encoder_hidden_states_llama3, image_cond, control, 
transformer_options) + + def _forward( self, x: torch.Tensor, t: torch.Tensor, diff --git a/comfy/ldm/hunyuan3d/model.py b/comfy/ldm/hunyuan3d/model.py index 4e18358f0..0fa5e78c1 100644 --- a/comfy/ldm/hunyuan3d/model.py +++ b/comfy/ldm/hunyuan3d/model.py @@ -7,6 +7,7 @@ from comfy.ldm.flux.layers import ( SingleStreamBlock, timestep_embedding, ) +import comfy.patcher_extension class Hunyuan3Dv2(nn.Module): @@ -67,6 +68,13 @@ class Hunyuan3Dv2(nn.Module): self.final_layer = LastLayer(hidden_size, 1, in_channels, dtype=dtype, device=device, operations=operations) def forward(self, x, timestep, context, guidance=None, transformer_options={}, **kwargs): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, + self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options) + ).execute(x, timestep, context, guidance, transformer_options, **kwargs) + + def _forward(self, x, timestep, context, guidance=None, transformer_options={}, **kwargs): x = x.movedim(-1, -2) timestep = 1.0 - timestep txt = context diff --git a/comfy/ldm/hunyuan_video/model.py b/comfy/ldm/hunyuan_video/model.py index fbd8d4196..da1011596 100644 --- a/comfy/ldm/hunyuan_video/model.py +++ b/comfy/ldm/hunyuan_video/model.py @@ -1,6 +1,7 @@ #Based on Flux code because of weird hunyuan video code license. import torch +import comfy.patcher_extension import comfy.ldm.flux.layers import comfy.ldm.modules.diffusionmodules.mmdit from comfy.ldm.modules.attention import optimized_attention @@ -348,6 +349,13 @@ class HunyuanVideo(nn.Module): return repeat(img_ids, "t h w c -> b (t h w) c", b=bs) def forward(self, x, timestep, context, y, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, control=None, transformer_options={}, **kwargs): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, + self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options) + ).execute(x, timestep, context, y, guidance, attention_mask, guiding_frame_index, ref_latent, control, transformer_options, **kwargs) + + def _forward(self, x, timestep, context, y, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, control=None, transformer_options={}, **kwargs): bs, c, t, h, w = x.shape img_ids = self.img_ids(x) txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype) diff --git a/comfy/ldm/lightricks/model.py b/comfy/ldm/lightricks/model.py index ad9a7daea..aa2ea62b1 100644 --- a/comfy/ldm/lightricks/model.py +++ b/comfy/ldm/lightricks/model.py @@ -1,5 +1,6 @@ import torch from torch import nn +import comfy.patcher_extension import comfy.ldm.modules.attention import comfy.ldm.common_dit from einops import rearrange @@ -420,6 +421,13 @@ class LTXVModel(torch.nn.Module): self.patchifier = SymmetricPatchifier(1) def forward(self, x, timestep, context, attention_mask, frame_rate=25, transformer_options={}, keyframe_idxs=None, **kwargs): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, + self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options) + ).execute(x, timestep, context, attention_mask, frame_rate, transformer_options, keyframe_idxs, **kwargs) + + def _forward(self, x, timestep, context, attention_mask, frame_rate=25, transformer_options={}, keyframe_idxs=None, **kwargs): patches_replace = 
transformer_options.get("patches_replace", {}) orig_shape = list(x.shape) diff --git a/comfy/ldm/lumina/model.py b/comfy/ldm/lumina/model.py index f8dc4d7db..e08ed817d 100644 --- a/comfy/ldm/lumina/model.py +++ b/comfy/ldm/lumina/model.py @@ -11,6 +11,7 @@ import comfy.ldm.common_dit from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder from comfy.ldm.modules.attention import optimized_attention_masked from comfy.ldm.flux.layers import EmbedND +import comfy.patcher_extension def modulate(x, scale): @@ -590,8 +591,15 @@ class NextDiT(nn.Module): return padded_full_embed, mask, img_sizes, l_effective_cap_len, freqs_cis - # def forward(self, x, t, cap_feats, cap_mask): def forward(self, x, timesteps, context, num_tokens, attention_mask=None, **kwargs): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, + self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, kwargs.get("transformer_options", {})) + ).execute(x, timesteps, context, num_tokens, attention_mask, **kwargs) + + # def forward(self, x, t, cap_feats, cap_mask): + def _forward(self, x, timesteps, context, num_tokens, attention_mask=None, **kwargs): t = 1.0 - timesteps cap_feats = context cap_mask = attention_mask diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index af00ff119..57a458210 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -9,6 +9,7 @@ from comfy.ldm.lightricks.model import TimestepEmbedding, Timesteps from comfy.ldm.modules.attention import optimized_attention_masked from comfy.ldm.flux.layers import EmbedND import comfy.ldm.common_dit +import comfy.patcher_extension class GELU(nn.Module): def __init__(self, dim_in: int, dim_out: int, approximate: str = "none", bias: bool = True, dtype=None, device=None, operations=None): @@ -355,7 +356,14 @@ class QwenImageTransformer2DModel(nn.Module): img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) - (w_len // 2) return hidden_states, repeat(img_ids, "h w c -> b (h w) c", b=bs), orig_shape - def forward( + def forward(self, x, timestep, context, attention_mask=None, guidance=None, ref_latents=None, transformer_options={}, **kwargs): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, + self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options) + ).execute(x, timestep, context, attention_mask, guidance, ref_latents, transformer_options, **kwargs) + + def _forward( self, x, timesteps, diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index 0726b8e1b..1885d9730 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -11,6 +11,7 @@ from comfy.ldm.flux.layers import EmbedND from comfy.ldm.flux.math import apply_rope import comfy.ldm.common_dit import comfy.model_management +import comfy.patcher_extension def sinusoidal_embedding_1d(dim, position): @@ -573,6 +574,13 @@ class WanModel(torch.nn.Module): return x def forward(self, x, timestep, context, clip_fea=None, time_dim_concat=None, transformer_options={}, **kwargs): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, + self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options) + ).execute(x, timestep, context, clip_fea, time_dim_concat, transformer_options, **kwargs) + + def 
_forward(self, x, timestep, context, clip_fea=None, time_dim_concat=None, transformer_options={}, **kwargs): bs, c, t, h, w = x.shape x = comfy.ldm.common_dit.pad_to_patch_size(x, self.patch_size) diff --git a/comfy/patcher_extension.py b/comfy/patcher_extension.py index 965958f4c..46cc7b2a8 100644 --- a/comfy/patcher_extension.py +++ b/comfy/patcher_extension.py @@ -50,6 +50,7 @@ class WrappersMP: OUTER_SAMPLE = "outer_sample" PREPARE_SAMPLING = "prepare_sampling" SAMPLER_SAMPLE = "sampler_sample" + PREDICT_NOISE = "predict_noise" CALC_COND_BATCH = "calc_cond_batch" APPLY_MODEL = "apply_model" DIFFUSION_MODEL = "diffusion_model" diff --git a/comfy/samplers.py b/comfy/samplers.py index d5390d64e..ec7e0b350 100644 --- a/comfy/samplers.py +++ b/comfy/samplers.py @@ -953,7 +953,14 @@ class CFGGuider: self.original_conds[k] = comfy.sampler_helpers.convert_cond(conds[k]) def __call__(self, *args, **kwargs): - return self.predict_noise(*args, **kwargs) + return self.outer_predict_noise(*args, **kwargs) + + def outer_predict_noise(self, x, timestep, model_options={}, seed=None): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self.predict_noise, + self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.PREDICT_NOISE, self.model_options, is_model_options=True) + ).execute(x, timestep, model_options, seed) def predict_noise(self, x, timestep, model_options={}, seed=None): return sampling_function(self.inner_model, x, timestep, self.conds.get("negative", None), self.conds.get("positive", None), self.cfg, model_options=model_options, seed=seed) diff --git a/comfy_extras/nodes_easycache.py b/comfy_extras/nodes_easycache.py new file mode 100644 index 000000000..e2b2efcd9 --- /dev/null +++ b/comfy_extras/nodes_easycache.py @@ -0,0 +1,459 @@ +from __future__ import annotations +from typing import TYPE_CHECKING, Union +from comfy_api.latest import io, ComfyExtension +import comfy.patcher_extension +import logging +import torch +import comfy.model_patcher +if TYPE_CHECKING: + from uuid import UUID + + +def easycache_forward_wrapper(executor, *args, **kwargs): + # get values from args + x: torch.Tensor = args[0] + transformer_options: dict[str] = args[-1] + if not isinstance(transformer_options, dict): + transformer_options = kwargs.get("transformer_options") + if not transformer_options: + transformer_options = args[-2] + easycache: EasyCacheHolder = transformer_options["easycache"] + sigmas = transformer_options["sigmas"] + uuids = transformer_options["uuids"] + if sigmas is not None and easycache.is_past_end_timestep(sigmas): + return executor(*args, **kwargs) + # prepare next x_prev + has_first_cond_uuid = easycache.has_first_cond_uuid(uuids) + next_x_prev = x + input_change = None + do_easycache = easycache.should_do_easycache(sigmas) + if do_easycache: + # if first cond marked this step for skipping, skip it and use appropriate cached values + if easycache.skip_current_step: + if easycache.verbose: + logging.info(f"EasyCache [verbose] - was marked to skip this step by {easycache.first_cond_uuid}. 
Present uuids: {uuids}") + return easycache.apply_cache_diff(x, uuids) + if easycache.initial_step: + easycache.first_cond_uuid = uuids[0] + has_first_cond_uuid = easycache.has_first_cond_uuid(uuids) + easycache.initial_step = False + if has_first_cond_uuid: + if easycache.has_x_prev_subsampled(): + input_change = (easycache.subsample(x, uuids, clone=False) - easycache.x_prev_subsampled).flatten().abs().mean() + if easycache.has_output_prev_norm() and easycache.has_relative_transformation_rate(): + approx_output_change_rate = (easycache.relative_transformation_rate * input_change) / easycache.output_prev_norm + easycache.cumulative_change_rate += approx_output_change_rate + if easycache.cumulative_change_rate < easycache.reuse_threshold: + if easycache.verbose: + logging.info(f"EasyCache [verbose] - skipping step; cumulative_change_rate: {easycache.cumulative_change_rate}, reuse_threshold: {easycache.reuse_threshold}") + # other conds should also skip this step, and instead use their cached values + easycache.skip_current_step = True + return easycache.apply_cache_diff(x, uuids) + else: + if easycache.verbose: + logging.info(f"EasyCache [verbose] - NOT skipping step; cumulative_change_rate: {easycache.cumulative_change_rate}, reuse_threshold: {easycache.reuse_threshold}") + easycache.cumulative_change_rate = 0.0 + + output: torch.Tensor = executor(*args, **kwargs) + if has_first_cond_uuid and easycache.has_output_prev_norm(): + output_change = (easycache.subsample(output, uuids, clone=False) - easycache.output_prev_subsampled).flatten().abs().mean() + if easycache.verbose: + output_change_rate = output_change / easycache.output_prev_norm + easycache.output_change_rates.append(output_change_rate.item()) + if easycache.has_relative_transformation_rate(): + approx_output_change_rate = (easycache.relative_transformation_rate * input_change) / easycache.output_prev_norm + easycache.approx_output_change_rates.append(approx_output_change_rate.item()) + if easycache.verbose: + logging.info(f"EasyCache [verbose] - approx_output_change_rate: {approx_output_change_rate}") + if input_change is not None: + easycache.relative_transformation_rate = output_change / input_change + if easycache.verbose: + logging.info(f"EasyCache [verbose] - output_change_rate: {output_change_rate}") + # TODO: allow cache_diff to be offloaded + easycache.update_cache_diff(output, next_x_prev, uuids) + if has_first_cond_uuid: + easycache.x_prev_subsampled = easycache.subsample(next_x_prev, uuids) + easycache.output_prev_subsampled = easycache.subsample(output, uuids) + easycache.output_prev_norm = output.flatten().abs().mean() + if easycache.verbose: + logging.info(f"EasyCache [verbose] - x_prev_subsampled: {easycache.x_prev_subsampled.shape}") + return output + +def lazycache_predict_noise_wrapper(executor, *args, **kwargs): + # get values from args + x: torch.Tensor = args[0] + timestep: float = args[1] + model_options: dict[str] = args[2] + easycache: LazyCacheHolder = model_options["transformer_options"]["easycache"] + if easycache.is_past_end_timestep(timestep): + return executor(*args, **kwargs) + # prepare next x_prev + next_x_prev = x + input_change = None + do_easycache = easycache.should_do_easycache(timestep) + if do_easycache: + if easycache.has_x_prev_subsampled(): + if easycache.has_x_prev_subsampled(): + input_change = (easycache.subsample(x, clone=False) - easycache.x_prev_subsampled).flatten().abs().mean() + if easycache.has_output_prev_norm() and easycache.has_relative_transformation_rate(): + 
approx_output_change_rate = (easycache.relative_transformation_rate * input_change) / easycache.output_prev_norm + easycache.cumulative_change_rate += approx_output_change_rate + if easycache.cumulative_change_rate < easycache.reuse_threshold: + if easycache.verbose: + logging.info(f"LazyCache [verbose] - skipping step; cumulative_change_rate: {easycache.cumulative_change_rate}, reuse_threshold: {easycache.reuse_threshold}") + # other conds should also skip this step, and instead use their cached values + easycache.skip_current_step = True + return easycache.apply_cache_diff(x) + else: + if easycache.verbose: + logging.info(f"LazyCache [verbose] - NOT skipping step; cumulative_change_rate: {easycache.cumulative_change_rate}, reuse_threshold: {easycache.reuse_threshold}") + easycache.cumulative_change_rate = 0.0 + output: torch.Tensor = executor(*args, **kwargs) + if easycache.has_output_prev_norm(): + output_change = (easycache.subsample(output, clone=False) - easycache.output_prev_subsampled).flatten().abs().mean() + if easycache.verbose: + output_change_rate = output_change / easycache.output_prev_norm + easycache.output_change_rates.append(output_change_rate.item()) + if easycache.has_relative_transformation_rate(): + approx_output_change_rate = (easycache.relative_transformation_rate * input_change) / easycache.output_prev_norm + easycache.approx_output_change_rates.append(approx_output_change_rate.item()) + if easycache.verbose: + logging.info(f"LazyCache [verbose] - approx_output_change_rate: {approx_output_change_rate}") + if input_change is not None: + easycache.relative_transformation_rate = output_change / input_change + if easycache.verbose: + logging.info(f"LazyCache [verbose] - output_change_rate: {output_change_rate}") + # TODO: allow cache_diff to be offloaded + easycache.update_cache_diff(output, next_x_prev) + easycache.x_prev_subsampled = easycache.subsample(next_x_prev) + easycache.output_prev_subsampled = easycache.subsample(output) + easycache.output_prev_norm = output.flatten().abs().mean() + if easycache.verbose: + logging.info(f"LazyCache [verbose] - x_prev_subsampled: {easycache.x_prev_subsampled.shape}") + return output + +def easycache_calc_cond_batch_wrapper(executor, *args, **kwargs): + model_options = args[-1] + easycache: EasyCacheHolder = model_options["transformer_options"]["easycache"] + easycache.skip_current_step = False + # TODO: check if first_cond_uuid is active at this timestep; otherwise, EasyCache needs to be partially reset + return executor(*args, **kwargs) + +def easycache_sample_wrapper(executor, *args, **kwargs): + """ + This OUTER_SAMPLE wrapper makes sure easycache is prepped for current run, and all memory usage is cleared at the end. 
+ """ + try: + guider = executor.class_obj + orig_model_options = guider.model_options + guider.model_options = comfy.model_patcher.create_model_options_clone(orig_model_options) + # clone and prepare timesteps + guider.model_options["transformer_options"]["easycache"] = guider.model_options["transformer_options"]["easycache"].clone().prepare_timesteps(guider.model_patcher.model.model_sampling) + easycache: Union[EasyCacheHolder, LazyCacheHolder] = guider.model_options['transformer_options']['easycache'] + logging.info(f"{easycache.name} enabled - threshold: {easycache.reuse_threshold}, start_percent: {easycache.start_percent}, end_percent: {easycache.end_percent}") + return executor(*args, **kwargs) + finally: + easycache = guider.model_options['transformer_options']['easycache'] + output_change_rates = easycache.output_change_rates + approx_output_change_rates = easycache.approx_output_change_rates + if easycache.verbose: + logging.info(f"{easycache.name} [verbose] - output_change_rates {len(output_change_rates)}: {output_change_rates}") + logging.info(f"{easycache.name} [verbose] - approx_output_change_rates {len(approx_output_change_rates)}: {approx_output_change_rates}") + total_steps = len(args[3])-1 + logging.info(f"{easycache.name} - skipped {easycache.total_steps_skipped}/{total_steps} steps ({total_steps/(total_steps-easycache.total_steps_skipped):.2f}x speedup).") + easycache.reset() + guider.model_options = orig_model_options + + +class EasyCacheHolder: + def __init__(self, reuse_threshold: float, start_percent: float, end_percent: float, subsample_factor: int, offload_cache_diff: bool, verbose: bool=False): + self.name = "EasyCache" + self.reuse_threshold = reuse_threshold + self.start_percent = start_percent + self.end_percent = end_percent + self.subsample_factor = subsample_factor + self.offload_cache_diff = offload_cache_diff + self.verbose = verbose + # timestep values + self.start_t = 0.0 + self.end_t = 0.0 + # control values + self.relative_transformation_rate: float = None + self.cumulative_change_rate = 0.0 + self.initial_step = True + self.skip_current_step = False + # cache values + self.first_cond_uuid = None + self.x_prev_subsampled: torch.Tensor = None + self.output_prev_subsampled: torch.Tensor = None + self.output_prev_norm: torch.Tensor = None + self.uuid_cache_diffs: dict[UUID, torch.Tensor] = {} + self.output_change_rates = [] + self.approx_output_change_rates = [] + self.total_steps_skipped = 0 + # how to deal with mismatched dims + self.allow_mismatch = True + self.cut_from_start = True + + def is_past_end_timestep(self, timestep: float) -> bool: + return not (timestep[0] > self.end_t).item() + + def should_do_easycache(self, timestep: float) -> bool: + return (timestep[0] <= self.start_t).item() + + def has_x_prev_subsampled(self) -> bool: + return self.x_prev_subsampled is not None + + def has_output_prev_subsampled(self) -> bool: + return self.output_prev_subsampled is not None + + def has_output_prev_norm(self) -> bool: + return self.output_prev_norm is not None + + def has_relative_transformation_rate(self) -> bool: + return self.relative_transformation_rate is not None + + def prepare_timesteps(self, model_sampling): + self.start_t = model_sampling.percent_to_sigma(self.start_percent) + self.end_t = model_sampling.percent_to_sigma(self.end_percent) + return self + + def subsample(self, x: torch.Tensor, uuids: list[UUID], clone: bool = True) -> torch.Tensor: + batch_offset = x.shape[0] // len(uuids) + uuid_idx = uuids.index(self.first_cond_uuid) + if 
self.subsample_factor > 1: + to_return = x[uuid_idx*batch_offset:(uuid_idx+1)*batch_offset, ..., ::self.subsample_factor, ::self.subsample_factor] + if clone: + return to_return.clone() + return to_return + to_return = x[uuid_idx*batch_offset:(uuid_idx+1)*batch_offset, ...] + if clone: + return to_return.clone() + return to_return + + def apply_cache_diff(self, x: torch.Tensor, uuids: list[UUID]): + if self.first_cond_uuid in uuids: + self.total_steps_skipped += 1 + batch_offset = x.shape[0] // len(uuids) + for i, uuid in enumerate(uuids): + # if cached dims don't match x dims, cut off excess and hope for the best (cosmos world2video) + if x.shape[1:] != self.uuid_cache_diffs[uuid].shape[1:]: + if not self.allow_mismatch: + raise ValueError(f"Cached dims {self.uuid_cache_diffs[uuid].shape} don't match x dims {x.shape} - this is no good") + slicing = [] + skip_this_dim = True + for dim_u, dim_x in zip(self.uuid_cache_diffs[uuid].shape, x.shape): + if skip_this_dim: + skip_this_dim = False + continue + if dim_u != dim_x: + if self.cut_from_start: + slicing.append(slice(dim_x-dim_u, None)) + else: + slicing.append(slice(None, dim_u)) + else: + slicing.append(slice(None)) + slicing = [slice(i*batch_offset,(i+1)*batch_offset)] + slicing + x = x[slicing] + x += self.uuid_cache_diffs[uuid].to(x.device) + return x + + def update_cache_diff(self, output: torch.Tensor, x: torch.Tensor, uuids: list[UUID]): + # if output dims don't match x dims, cut off excess and hope for the best (cosmos world2video) + if output.shape[1:] != x.shape[1:]: + if not self.allow_mismatch: + raise ValueError(f"Output dims {output.shape} don't match x dims {x.shape} - this is no good") + slicing = [] + skip_dim = True + for dim_o, dim_x in zip(output.shape, x.shape): + if not skip_dim and dim_o != dim_x: + if self.cut_from_start: + slicing.append(slice(dim_x-dim_o, None)) + else: + slicing.append(slice(None, dim_o)) + else: + slicing.append(slice(None)) + skip_dim = False + x = x[slicing] + diff = output - x + batch_offset = diff.shape[0] // len(uuids) + for i, uuid in enumerate(uuids): + self.uuid_cache_diffs[uuid] = diff[i*batch_offset:(i+1)*batch_offset, ...] 
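The pair of methods above is the core of the reuse scheme: apply_cache_diff() replays a stored residual onto the current latent, while update_cache_diff() refreshes that residual as output - x after a real forward pass. A minimal sketch of the same rule, with hypothetical names (cached_diff, run_model, change_rate) standing in for the per-UUID bookkeeping and the cumulative change-rate threshold test that the real class performs:

    import torch

    cached_diff = None

    def maybe_reuse(x: torch.Tensor, run_model, change_rate: float, threshold: float) -> torch.Tensor:
        # Skip the model when the input has barely changed; otherwise run it
        # and refresh the cached residual, mirroring update_cache_diff().
        global cached_diff
        if cached_diff is not None and change_rate < threshold:
            return x + cached_diff      # reuse path, as in apply_cache_diff()
        out = run_model(x)              # real forward pass
        cached_diff = out - x
        return out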
+ + def has_first_cond_uuid(self, uuids: list[UUID]) -> bool: + return self.first_cond_uuid in uuids + + def reset(self): + self.relative_transformation_rate = 0.0 + self.cumulative_change_rate = 0.0 + self.initial_step = True + self.skip_current_step = False + self.output_change_rates = [] + self.first_cond_uuid = None + del self.x_prev_subsampled + self.x_prev_subsampled = None + del self.output_prev_subsampled + self.output_prev_subsampled = None + del self.output_prev_norm + self.output_prev_norm = None + del self.uuid_cache_diffs + self.uuid_cache_diffs = {} + self.total_steps_skipped = 0 + return self + + def clone(self): + return EasyCacheHolder(self.reuse_threshold, self.start_percent, self.end_percent, self.subsample_factor, self.offload_cache_diff, self.verbose) + + +class EasyCacheNode(io.ComfyNode): + @classmethod + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="EasyCache", + display_name="EasyCache", + description="Native EasyCache implementation.", + category="advanced/debug/model", + is_experimental=True, + inputs=[ + io.Model.Input("model", tooltip="The model to add EasyCache to."), + io.Float.Input("reuse_threshold", min=0.0, default=0.2, max=3.0, step=0.01, tooltip="The threshold for reusing cached steps."), + io.Float.Input("start_percent", min=0.0, default=0.15, max=1.0, step=0.01, tooltip="The relative sampling step to begin use of EasyCache."), + io.Float.Input("end_percent", min=0.0, default=0.95, max=1.0, step=0.01, tooltip="The relative sampling step to end use of EasyCache."), + io.Boolean.Input("verbose", default=False, tooltip="Whether to log verbose information."), + ], + outputs=[ + io.Model.Output(tooltip="The model with EasyCache."), + ], + ) + + @classmethod + def execute(cls, model: io.Model.Type, reuse_threshold: float, start_percent: float, end_percent: float, verbose: bool) -> io.NodeOutput: + model = model.clone() + model.model_options["transformer_options"]["easycache"] = EasyCacheHolder(reuse_threshold, start_percent, end_percent, subsample_factor=8, offload_cache_diff=False, verbose=verbose) + model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.OUTER_SAMPLE, "easycache", easycache_sample_wrapper) + model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.CALC_COND_BATCH, "easycache", easycache_calc_cond_batch_wrapper) + model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, "easycache", easycache_forward_wrapper) + return io.NodeOutput(model) + + +class LazyCacheHolder: + def __init__(self, reuse_threshold: float, start_percent: float, end_percent: float, subsample_factor: int, offload_cache_diff: bool, verbose: bool=False): + self.name = "LazyCache" + self.reuse_threshold = reuse_threshold + self.start_percent = start_percent + self.end_percent = end_percent + self.subsample_factor = subsample_factor + self.offload_cache_diff = offload_cache_diff + self.verbose = verbose + # timestep values + self.start_t = 0.0 + self.end_t = 0.0 + # control values + self.relative_transformation_rate: float = None + self.cumulative_change_rate = 0.0 + self.initial_step = True + # cache values + self.x_prev_subsampled: torch.Tensor = None + self.output_prev_subsampled: torch.Tensor = None + self.output_prev_norm: torch.Tensor = None + self.cache_diff: torch.Tensor = None + self.output_change_rates = [] + self.approx_output_change_rates = [] + self.total_steps_skipped = 0 + + def has_cache_diff(self) -> bool: + return self.cache_diff is not None + + def is_past_end_timestep(self, timestep: float) -> bool: + 
return not (timestep[0] > self.end_t).item() + + def should_do_easycache(self, timestep: float) -> bool: + return (timestep[0] <= self.start_t).item() + + def has_x_prev_subsampled(self) -> bool: + return self.x_prev_subsampled is not None + + def has_output_prev_subsampled(self) -> bool: + return self.output_prev_subsampled is not None + + def has_output_prev_norm(self) -> bool: + return self.output_prev_norm is not None + + def has_relative_transformation_rate(self) -> bool: + return self.relative_transformation_rate is not None + + def prepare_timesteps(self, model_sampling): + self.start_t = model_sampling.percent_to_sigma(self.start_percent) + self.end_t = model_sampling.percent_to_sigma(self.end_percent) + return self + + def subsample(self, x: torch.Tensor, clone: bool = True) -> torch.Tensor: + if self.subsample_factor > 1: + to_return = x[..., ::self.subsample_factor, ::self.subsample_factor] + if clone: + return to_return.clone() + return to_return + if clone: + return x.clone() + return x + + def apply_cache_diff(self, x: torch.Tensor): + self.total_steps_skipped += 1 + return x + self.cache_diff.to(x.device) + + def update_cache_diff(self, output: torch.Tensor, x: torch.Tensor): + self.cache_diff = output - x + + def reset(self): + self.relative_transformation_rate = 0.0 + self.cumulative_change_rate = 0.0 + self.initial_step = True + self.output_change_rates = [] + self.approx_output_change_rates = [] + del self.cache_diff + self.cache_diff = None + self.total_steps_skipped = 0 + return self + + def clone(self): + return LazyCacheHolder(self.reuse_threshold, self.start_percent, self.end_percent, self.subsample_factor, self.offload_cache_diff, self.verbose) + +class LazyCacheNode(io.ComfyNode): + @classmethod + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="LazyCache", + display_name="LazyCache", + description="A homebrew version of EasyCache - even 'easier' version of EasyCache to implement. 
Overall works worse than EasyCache, but better in some rare cases AND universal compatibility with everything in ComfyUI.", + category="advanced/debug/model", + is_experimental=True, + inputs=[ + io.Model.Input("model", tooltip="The model to add LazyCache to."), + io.Float.Input("reuse_threshold", min=0.0, default=0.2, max=3.0, step=0.01, tooltip="The threshold for reusing cached steps."), + io.Float.Input("start_percent", min=0.0, default=0.15, max=1.0, step=0.01, tooltip="The relative sampling step to begin use of LazyCache."), + io.Float.Input("end_percent", min=0.0, default=0.95, max=1.0, step=0.01, tooltip="The relative sampling step to end use of LazyCache."), + io.Boolean.Input("verbose", default=False, tooltip="Whether to log verbose information."), + ], + outputs=[ + io.Model.Output(tooltip="The model with LazyCache."), + ], + ) + + @classmethod + def execute(cls, model: io.Model.Type, reuse_threshold: float, start_percent: float, end_percent: float, verbose: bool) -> io.NodeOutput: + model = model.clone() + model.model_options["transformer_options"]["easycache"] = LazyCacheHolder(reuse_threshold, start_percent, end_percent, subsample_factor=8, offload_cache_diff=False, verbose=verbose) + model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.OUTER_SAMPLE, "lazycache", easycache_sample_wrapper) + model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.PREDICT_NOISE, "lazycache", lazycache_predict_noise_wrapper) + return io.NodeOutput(model) + + +class EasyCacheExtension(ComfyExtension): + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + EasyCacheNode, + LazyCacheNode, + ] + +def comfy_entrypoint(): + return EasyCacheExtension() diff --git a/nodes.py b/nodes.py index 9681750d3..723ce3384 100644 --- a/nodes.py +++ b/nodes.py @@ -2322,7 +2322,8 @@ async def init_builtin_extra_nodes(): "nodes_tcfg.py", "nodes_context_windows.py", "nodes_qwen.py", - "nodes_model_patch.py" + "nodes_model_patch.py", + "nodes_easycache.py", ] import_failed = [] From 41048c69b4ccf63f876213a95a51cdde1cb0ab84 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 22 Aug 2025 20:15:44 -0700 Subject: [PATCH 183/243] Fix Conditioning masks on 3d latents. 
(#9506) --- comfy/samplers.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/comfy/samplers.py b/comfy/samplers.py index ec7e0b350..c7dfef4ea 100644 --- a/comfy/samplers.py +++ b/comfy/samplers.py @@ -17,6 +17,7 @@ import comfy.model_patcher import comfy.patcher_extension import comfy.hooks import comfy.context_windows +import comfy.utils import scipy.stats import numpy @@ -61,7 +62,7 @@ def get_area_and_mult(conds, x_in, timestep_in): if "mask_strength" in conds: mask_strength = conds["mask_strength"] mask = conds['mask'] - assert (mask.shape[1:] == x_in.shape[2:]) + # assert (mask.shape[1:] == x_in.shape[2:]) mask = mask[:input_x.shape[0]] if area is not None: @@ -69,7 +70,7 @@ def get_area_and_mult(conds, x_in, timestep_in): mask = mask.narrow(i + 1, area[len(dims) + i], area[i]) mask = mask * mask_strength - mask = mask.unsqueeze(1).repeat(input_x.shape[0] // mask.shape[0], input_x.shape[1], 1, 1) + mask = mask.unsqueeze(1).repeat((input_x.shape[0] // mask.shape[0], input_x.shape[1]) + (1, ) * (mask.ndim - 1)) else: mask = torch.ones_like(input_x) mult = mask * strength @@ -553,7 +554,10 @@ def resolve_areas_and_cond_masks_multidim(conditions, dims, device): if len(mask.shape) == len(dims): mask = mask.unsqueeze(0) if mask.shape[1:] != dims: - mask = torch.nn.functional.interpolate(mask.unsqueeze(1), size=dims, mode='bilinear', align_corners=False).squeeze(1) + if mask.ndim < 4: + mask = comfy.utils.common_upscale(mask.unsqueeze(1), dims[-1], dims[-2], 'bilinear', 'none').squeeze(1) + else: + mask = comfy.utils.common_upscale(mask, dims[-1], dims[-2], 'bilinear', 'none') if modified.get("set_area_to_bounds", False): #TODO: handle dim != 2 bounds = torch.max(torch.abs(mask),dim=0).values.unsqueeze(0) From 59eddda90030b61f172e155bc1e2526a51a27dff Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 22 Aug 2025 22:36:44 -0700 Subject: [PATCH 184/243] Python 3.13 is well supported. (#9511) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 79a8a8c79..99a50571b 100644 --- a/README.md +++ b/README.md @@ -192,7 +192,7 @@ comfy install ## Manual Install (Windows, Linux) -python 3.13 is supported but using 3.12 is recommended because some custom nodes and their dependencies might not support it yet. +Python 3.13 is very well supported. If you have trouble with some custom node dependencies you can try 3.12 Git clone this repo. From 8be0d22ab76a3d548c9c376fd816b39d4c028c12 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 23 Aug 2025 10:56:17 -0700 Subject: [PATCH 185/243] Don't use the annoying new navigation mode by default. 
(#9518) --- app/app_settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/app_settings.py b/app/app_settings.py index c7ac73bf6..eb69133a3 100644 --- a/app/app_settings.py +++ b/app/app_settings.py @@ -25,7 +25,7 @@ class AppSettings(): logging.error(f"The user settings file is corrupted: {file}") return {} else: - return {} + return {"Comfy.Canvas.NavigationMode": "legacy"} def save_settings(self, request, settings): file = self.user_manager.get_request_user_filepath( From 3e316c6338503a535801db3ddac9572a38a607ef Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Sat, 23 Aug 2025 14:54:01 -0700 Subject: [PATCH 186/243] Update frontend to v1.25.10 and revert navigation mode override (#9522) - Update comfyui-frontend-package from 1.25.9 to 1.25.10 - Revert forced legacy navigation mode from PR #9518 - Frontend v1.25.10 includes proper navigation mode fixes and improved display text --- app/app_settings.py | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/app/app_settings.py b/app/app_settings.py index eb69133a3..c7ac73bf6 100644 --- a/app/app_settings.py +++ b/app/app_settings.py @@ -25,7 +25,7 @@ class AppSettings(): logging.error(f"The user settings file is corrupted: {file}") return {} else: - return {"Comfy.Canvas.NavigationMode": "legacy"} + return {} def save_settings(self, request, settings): file = self.user_manager.get_request_user_filepath( diff --git a/requirements.txt b/requirements.txt index 6b53fabc1..131484ce8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.25.9 +comfyui-frontend-package==1.25.10 comfyui-workflow-templates==0.1.65 comfyui-embedded-docs==0.2.6 torch From 71ed4a399ec76a75aa2870b772d2022e4b9a69a3 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Sat, 23 Aug 2025 18:57:09 -0400 Subject: [PATCH 187/243] ComfyUI version 0.3.52 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 65f06cf37..834c3e8c2 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. 
-__version__ = "0.3.51" +__version__ = "0.3.52" diff --git a/pyproject.toml b/pyproject.toml index ecbf04303..f6e765a81 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.51" +version = "0.3.52" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From 95ac7794b7c735de8e5426442507d08edd29bec5 Mon Sep 17 00:00:00 2001 From: blepping <157360029+blepping@users.noreply.github.com> Date: Sun, 24 Aug 2025 13:29:49 -0600 Subject: [PATCH 188/243] Fix EasyCache/LazyCache crash when tensor shape/dtype/device changes during sampling (#9528) * Fix EasyCache/LazyCache crash when tensor shape/dtype/device changes during sampling * Fix missing LazyCache check_metadata method Ensure LazyCache reset method resets all the tensor state values --- comfy_extras/nodes_easycache.py | 34 +++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/comfy_extras/nodes_easycache.py b/comfy_extras/nodes_easycache.py index e2b2efcd9..9d2988f5f 100644 --- a/comfy_extras/nodes_easycache.py +++ b/comfy_extras/nodes_easycache.py @@ -28,6 +28,7 @@ def easycache_forward_wrapper(executor, *args, **kwargs): input_change = None do_easycache = easycache.should_do_easycache(sigmas) if do_easycache: + easycache.check_metadata(x) # if first cond marked this step for skipping, skip it and use appropriate cached values if easycache.skip_current_step: if easycache.verbose: @@ -92,6 +93,7 @@ def lazycache_predict_noise_wrapper(executor, *args, **kwargs): input_change = None do_easycache = easycache.should_do_easycache(timestep) if do_easycache: + easycache.check_metadata(x) if easycache.has_x_prev_subsampled(): if easycache.has_x_prev_subsampled(): input_change = (easycache.subsample(x, clone=False) - easycache.x_prev_subsampled).flatten().abs().mean() @@ -194,6 +196,7 @@ class EasyCacheHolder: # how to deal with mismatched dims self.allow_mismatch = True self.cut_from_start = True + self.state_metadata = None def is_past_end_timestep(self, timestep: float) -> bool: return not (timestep[0] > self.end_t).item() @@ -283,6 +286,17 @@ class EasyCacheHolder: def has_first_cond_uuid(self, uuids: list[UUID]) -> bool: return self.first_cond_uuid in uuids + def check_metadata(self, x: torch.Tensor) -> bool: + metadata = (x.device, x.dtype, x.shape[1:]) + if self.state_metadata is None: + self.state_metadata = metadata + return True + if metadata == self.state_metadata: + return True + logging.warn(f"{self.name} - Tensor shape, dtype or device changed, resetting state") + self.reset() + return False + def reset(self): self.relative_transformation_rate = 0.0 self.cumulative_change_rate = 0.0 @@ -299,6 +313,7 @@ class EasyCacheHolder: del self.uuid_cache_diffs self.uuid_cache_diffs = {} self.total_steps_skipped = 0 + self.state_metadata = None return self def clone(self): @@ -360,6 +375,7 @@ class LazyCacheHolder: self.output_change_rates = [] self.approx_output_change_rates = [] self.total_steps_skipped = 0 + self.state_metadata = None def has_cache_diff(self) -> bool: return self.cache_diff is not None @@ -404,6 +420,17 @@ class LazyCacheHolder: def update_cache_diff(self, output: torch.Tensor, x: torch.Tensor): self.cache_diff = output - x + def check_metadata(self, x: torch.Tensor) -> bool: + metadata = (x.device, x.dtype, x.shape) + if self.state_metadata is None: + self.state_metadata = metadata + return True + if metadata == self.state_metadata: + return True + logging.warn(f"{self.name} - Tensor shape, dtype or device changed, resetting 
state") + self.reset() + return False + def reset(self): self.relative_transformation_rate = 0.0 self.cumulative_change_rate = 0.0 @@ -412,7 +439,14 @@ class LazyCacheHolder: self.approx_output_change_rates = [] del self.cache_diff self.cache_diff = None + del self.x_prev_subsampled + self.x_prev_subsampled = None + del self.output_prev_subsampled + self.output_prev_subsampled = None + del self.output_prev_norm + self.output_prev_norm = None self.total_steps_skipped = 0 + self.state_metadata = None return self def clone(self): From f6b93d41a03081fad3c1a01221eac9c42d6790df Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 24 Aug 2025 12:40:32 -0700 Subject: [PATCH 189/243] Remove models from readme that are not fully implemented. (#9535) Cosmos model implementations are currently missing the safety part so it is technically not fully implemented and should not be advertised as such. --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 99a50571b..8024870c2 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,6 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith - [Flux](https://comfyanonymous.github.io/ComfyUI_examples/flux/) - [Lumina Image 2.0](https://comfyanonymous.github.io/ComfyUI_examples/lumina2/) - [HiDream](https://comfyanonymous.github.io/ComfyUI_examples/hidream/) - - [Cosmos Predict2](https://comfyanonymous.github.io/ComfyUI_examples/cosmos_predict2/) - [Qwen Image](https://comfyanonymous.github.io/ComfyUI_examples/qwen_image/) - Image Editing Models - [Omnigen 2](https://comfyanonymous.github.io/ComfyUI_examples/omnigen/) @@ -77,7 +76,6 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith - [Mochi](https://comfyanonymous.github.io/ComfyUI_examples/mochi/) - [LTX-Video](https://comfyanonymous.github.io/ComfyUI_examples/ltxv/) - [Hunyuan Video](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_video/) - - [Nvidia Cosmos](https://comfyanonymous.github.io/ComfyUI_examples/cosmos/) and [Cosmos Predict2](https://comfyanonymous.github.io/ComfyUI_examples/cosmos_predict2/) - [Wan 2.1](https://comfyanonymous.github.io/ComfyUI_examples/wan/) - [Wan 2.2](https://comfyanonymous.github.io/ComfyUI_examples/wan22/) - Audio Models From e633a47ad1b875e52758be27ec34cb8907ebe1fb Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 25 Aug 2025 17:13:54 -0700 Subject: [PATCH 190/243] Add models/audio_encoders directory. 
(#9548) --- folder_paths.py | 2 ++ models/audio_encoders/put_audio_encoder_models_here | 0 2 files changed, 2 insertions(+) create mode 100644 models/audio_encoders/put_audio_encoder_models_here diff --git a/folder_paths.py b/folder_paths.py index b34af39e8..f110d832b 100644 --- a/folder_paths.py +++ b/folder_paths.py @@ -48,6 +48,8 @@ folder_names_and_paths["classifiers"] = ([os.path.join(models_dir, "classifiers" folder_names_and_paths["model_patches"] = ([os.path.join(models_dir, "model_patches")], supported_pt_extensions) +folder_names_and_paths["audio_encoders"] = ([os.path.join(models_dir, "audio_encoders")], supported_pt_extensions) + output_directory = os.path.join(base_path, "output") temp_directory = os.path.join(base_path, "temp") input_directory = os.path.join(base_path, "input") diff --git a/models/audio_encoders/put_audio_encoder_models_here b/models/audio_encoders/put_audio_encoder_models_here new file mode 100644 index 000000000..e69de29bb From 914c2a29731be9c082f773c4b95892f553ac5ae8 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 25 Aug 2025 20:26:47 -0700 Subject: [PATCH 191/243] Implement wav2vec2 as an audio encoder model. (#9549) This is useless on its own but there are multiple models that use it. --- comfy/audio_encoders/audio_encoders.py | 42 +++++ comfy/audio_encoders/wav2vec2.py | 207 +++++++++++++++++++++++++ comfy_api/latest/_io.py | 8 + comfy_extras/nodes_audio_encoder.py | 44 ++++++ nodes.py | 1 + 5 files changed, 302 insertions(+) create mode 100644 comfy/audio_encoders/audio_encoders.py create mode 100644 comfy/audio_encoders/wav2vec2.py create mode 100644 comfy_extras/nodes_audio_encoder.py diff --git a/comfy/audio_encoders/audio_encoders.py b/comfy/audio_encoders/audio_encoders.py new file mode 100644 index 000000000..538c21bd5 --- /dev/null +++ b/comfy/audio_encoders/audio_encoders.py @@ -0,0 +1,42 @@ +from .wav2vec2 import Wav2Vec2Model +import comfy.model_management +import comfy.ops +import comfy.utils +import logging +import torchaudio + + +class AudioEncoderModel(): + def __init__(self, config): + self.load_device = comfy.model_management.text_encoder_device() + offload_device = comfy.model_management.text_encoder_offload_device() + self.dtype = comfy.model_management.text_encoder_dtype(self.load_device) + self.model = Wav2Vec2Model(dtype=self.dtype, device=offload_device, operations=comfy.ops.manual_cast) + self.model.eval() + self.patcher = comfy.model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device) + self.model_sample_rate = 16000 + + def load_sd(self, sd): + return self.model.load_state_dict(sd, strict=False) + + def get_sd(self): + return self.model.state_dict() + + def encode_audio(self, audio, sample_rate): + comfy.model_management.load_model_gpu(self.patcher) + audio = torchaudio.functional.resample(audio, sample_rate, self.model_sample_rate) + out, all_layers = self.model(audio.to(self.load_device)) + outputs = {} + outputs["encoded_audio"] = out + outputs["encoded_audio_all_layers"] = all_layers + return outputs + + +def load_audio_encoder_from_sd(sd, prefix=""): + audio_encoder = AudioEncoderModel(None) + sd = comfy.utils.state_dict_prefix_replace(sd, {"wav2vec2.": ""}) + m, u = audio_encoder.load_sd(sd) + if len(m) > 0: + logging.warning("missing audio encoder: {}".format(m)) + + return audio_encoder diff --git a/comfy/audio_encoders/wav2vec2.py b/comfy/audio_encoders/wav2vec2.py new file mode 100644 index 000000000..de906622a --- 
/dev/null +++ b/comfy/audio_encoders/wav2vec2.py @@ -0,0 +1,207 @@ +import torch +import torch.nn as nn +from comfy.ldm.modules.attention import optimized_attention_masked + + +class LayerNormConv(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, stride, bias=False, dtype=None, device=None, operations=None): + super().__init__() + self.conv = operations.Conv1d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, bias=bias, device=device, dtype=dtype) + self.layer_norm = operations.LayerNorm(out_channels, elementwise_affine=True, device=device, dtype=dtype) + + def forward(self, x): + x = self.conv(x) + return torch.nn.functional.gelu(self.layer_norm(x.transpose(-2, -1)).transpose(-2, -1)) + + +class ConvFeatureEncoder(nn.Module): + def __init__(self, conv_dim, dtype=None, device=None, operations=None): + super().__init__() + self.conv_layers = nn.ModuleList([ + LayerNormConv(1, conv_dim, kernel_size=10, stride=5, bias=True, device=device, dtype=dtype, operations=operations), + LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations), + LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations), + LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations), + LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations), + LayerNormConv(conv_dim, conv_dim, kernel_size=2, stride=2, bias=True, device=device, dtype=dtype, operations=operations), + LayerNormConv(conv_dim, conv_dim, kernel_size=2, stride=2, bias=True, device=device, dtype=dtype, operations=operations), + ]) + + def forward(self, x): + x = x.unsqueeze(1) + + for conv in self.conv_layers: + x = conv(x) + + return x.transpose(1, 2) + + +class FeatureProjection(nn.Module): + def __init__(self, conv_dim, embed_dim, dtype=None, device=None, operations=None): + super().__init__() + self.layer_norm = operations.LayerNorm(conv_dim, eps=1e-05, device=device, dtype=dtype) + self.projection = operations.Linear(conv_dim, embed_dim, device=device, dtype=dtype) + + def forward(self, x): + x = self.layer_norm(x) + x = self.projection(x) + return x + + +class PositionalConvEmbedding(nn.Module): + def __init__(self, embed_dim=768, kernel_size=128, groups=16): + super().__init__() + self.conv = nn.Conv1d( + embed_dim, + embed_dim, + kernel_size=kernel_size, + padding=kernel_size // 2, + groups=groups, + ) + self.conv = torch.nn.utils.parametrizations.weight_norm(self.conv, name="weight", dim=2) + self.activation = nn.GELU() + + def forward(self, x): + x = x.transpose(1, 2) + x = self.conv(x)[:, :, :-1] + x = self.activation(x) + x = x.transpose(1, 2) + return x + + +class TransformerEncoder(nn.Module): + def __init__( + self, + embed_dim=768, + num_heads=12, + num_layers=12, + mlp_ratio=4.0, + dtype=None, device=None, operations=None + ): + super().__init__() + + self.pos_conv_embed = PositionalConvEmbedding(embed_dim=embed_dim) + self.layers = nn.ModuleList([ + TransformerEncoderLayer( + embed_dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + device=device, dtype=dtype, operations=operations + ) + for _ in range(num_layers) + ]) + + self.layer_norm = operations.LayerNorm(embed_dim, eps=1e-05, device=device, dtype=dtype) + + def forward(self, x, mask=None): + x = x + self.pos_conv_embed(x) + all_x = () + for layer in self.layers: + all_x += (x,) + x = 
layer(x, mask) + x = self.layer_norm(x) + all_x += (x,) + return x, all_x + + +class Attention(nn.Module): + def __init__(self, embed_dim, num_heads, bias=True, dtype=None, device=None, operations=None): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + + self.k_proj = operations.Linear(embed_dim, embed_dim, bias=bias, device=device, dtype=dtype) + self.v_proj = operations.Linear(embed_dim, embed_dim, bias=bias, device=device, dtype=dtype) + self.q_proj = operations.Linear(embed_dim, embed_dim, bias=bias, device=device, dtype=dtype) + self.out_proj = operations.Linear(embed_dim, embed_dim, bias=bias, device=device, dtype=dtype) + + def forward(self, x, mask=None): + assert (mask is None) # TODO? + q = self.q_proj(x) + k = self.k_proj(x) + v = self.v_proj(x) + + out = optimized_attention_masked(q, k, v, self.num_heads) + return self.out_proj(out) + + +class FeedForward(nn.Module): + def __init__(self, embed_dim, mlp_ratio, dtype=None, device=None, operations=None): + super().__init__() + self.intermediate_dense = operations.Linear(embed_dim, int(embed_dim * mlp_ratio), device=device, dtype=dtype) + self.output_dense = operations.Linear(int(embed_dim * mlp_ratio), embed_dim, device=device, dtype=dtype) + + def forward(self, x): + x = self.intermediate_dense(x) + x = torch.nn.functional.gelu(x) + x = self.output_dense(x) + return x + + +class TransformerEncoderLayer(nn.Module): + def __init__( + self, + embed_dim=768, + num_heads=12, + mlp_ratio=4.0, + dtype=None, device=None, operations=None + ): + super().__init__() + + self.attention = Attention(embed_dim, num_heads, device=device, dtype=dtype, operations=operations) + + self.layer_norm = operations.LayerNorm(embed_dim, device=device, dtype=dtype) + self.feed_forward = FeedForward(embed_dim, mlp_ratio, device=device, dtype=dtype, operations=operations) + self.final_layer_norm = operations.LayerNorm(embed_dim, device=device, dtype=dtype) + + def forward(self, x, mask=None): + residual = x + x = self.layer_norm(x) + x = self.attention(x, mask=mask) + x = residual + x + + x = x + self.feed_forward(self.final_layer_norm(x)) + return x + + +class Wav2Vec2Model(nn.Module): + """Complete Wav2Vec 2.0 model.""" + + def __init__( + self, + embed_dim=1024, + final_dim=256, + num_heads=16, + num_layers=24, + dtype=None, device=None, operations=None + ): + super().__init__() + + conv_dim = 512 + self.feature_extractor = ConvFeatureEncoder(conv_dim, device=device, dtype=dtype, operations=operations) + self.feature_projection = FeatureProjection(conv_dim, embed_dim, device=device, dtype=dtype, operations=operations) + + self.masked_spec_embed = nn.Parameter(torch.empty(embed_dim, device=device, dtype=dtype)) + + self.encoder = TransformerEncoder( + embed_dim=embed_dim, + num_heads=num_heads, + num_layers=num_layers, + device=device, dtype=dtype, operations=operations + ) + + def forward(self, x, mask_time_indices=None, return_dict=False): + + x = torch.mean(x, dim=1) + + x = (x - x.mean()) / torch.sqrt(x.var() + 1e-7) + + features = self.feature_extractor(x) + features = self.feature_projection(features) + + batch_size, seq_len, _ = features.shape + + x, all_x = self.encoder(features) + + return x, all_x diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py index a3a21facc..5cb474459 100644 --- a/comfy_api/latest/_io.py +++ b/comfy_api/latest/_io.py @@ -730,6 +730,14 @@ class AnyType(ComfyTypeIO): class MODEL_PATCH(ComfyTypeIO): Type = Any 
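For reference, the ConvFeatureEncoder added earlier in this patch downsamples the raw waveform by the product of its convolution strides, so at the 16 kHz model_sample_rate used by AudioEncoderModel it emits roughly 50 feature frames per second. A small check of that arithmetic, assuming the kernel/stride values shown in the diff above:

    # Effective stride of the wav2vec2 feature extractor: one layer of stride 5
    # followed by six layers of stride 2, as in ConvFeatureEncoder above.
    strides = [5, 2, 2, 2, 2, 2, 2]
    total_stride = 1
    for s in strides:
        total_stride *= s               # 320 input samples per feature frame
    sample_rate = 16000                 # model_sample_rate in AudioEncoderModel
    print(total_stride, sample_rate / total_stride)   # -> 320 50.0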
+@comfytype(io_type="AUDIO_ENCODER") +class AUDIO_ENCODER(ComfyTypeIO): + Type = Any + +@comfytype(io_type="AUDIO_ENCODER_OUTPUT") +class AUDIO_ENCODER_OUTPUT(ComfyTypeIO): + Type = Any + @comfytype(io_type="COMFY_MULTITYPED_V3") class MultiType: Type = Any diff --git a/comfy_extras/nodes_audio_encoder.py b/comfy_extras/nodes_audio_encoder.py new file mode 100644 index 000000000..39a140fef --- /dev/null +++ b/comfy_extras/nodes_audio_encoder.py @@ -0,0 +1,44 @@ +import folder_paths +import comfy.audio_encoders.audio_encoders +import comfy.utils + + +class AudioEncoderLoader: + @classmethod + def INPUT_TYPES(s): + return {"required": { "audio_encoder_name": (folder_paths.get_filename_list("audio_encoders"), ), + }} + RETURN_TYPES = ("AUDIO_ENCODER",) + FUNCTION = "load_model" + + CATEGORY = "loaders" + + def load_model(self, audio_encoder_name): + audio_encoder_name = folder_paths.get_full_path_or_raise("audio_encoders", audio_encoder_name) + sd = comfy.utils.load_torch_file(audio_encoder_name, safe_load=True) + audio_encoder = comfy.audio_encoders.audio_encoders.load_audio_encoder_from_sd(sd) + if audio_encoder is None: + raise RuntimeError("ERROR: audio encoder file is invalid and does not contain a valid model.") + return (audio_encoder,) + + +class AudioEncoderEncode: + @classmethod + def INPUT_TYPES(s): + return {"required": { "audio_encoder": ("AUDIO_ENCODER",), + "audio": ("AUDIO",), + }} + RETURN_TYPES = ("AUDIO_ENCODER_OUTPUT",) + FUNCTION = "encode" + + CATEGORY = "conditioning" + + def encode(self, audio_encoder, audio): + output = audio_encoder.encode_audio(audio["waveform"], audio["sample_rate"]) + return (output,) + + +NODE_CLASS_MAPPINGS = { + "AudioEncoderLoader": AudioEncoderLoader, + "AudioEncoderEncode": AudioEncoderEncode, +} diff --git a/nodes.py b/nodes.py index 723ce3384..0aff6b14a 100644 --- a/nodes.py +++ b/nodes.py @@ -2324,6 +2324,7 @@ async def init_builtin_extra_nodes(): "nodes_qwen.py", "nodes_model_patch.py", "nodes_easycache.py", + "nodes_audio_encoder.py", ] import_failed = [] From 39aa06bd5d630e50c88d3be1586d21737c4387c1 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 26 Aug 2025 09:50:46 -0700 Subject: [PATCH 192/243] Make AudioEncoderOutput usable in v3 node schema. 
(#9554) --- comfy_api/latest/_io.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py index 5cb474459..e0ee943a7 100644 --- a/comfy_api/latest/_io.py +++ b/comfy_api/latest/_io.py @@ -731,11 +731,11 @@ class MODEL_PATCH(ComfyTypeIO): Type = Any @comfytype(io_type="AUDIO_ENCODER") -class AUDIO_ENCODER(ComfyTypeIO): +class AudioEncoder(ComfyTypeIO): Type = Any @comfytype(io_type="AUDIO_ENCODER_OUTPUT") -class AUDIO_ENCODER_OUTPUT(ComfyTypeIO): +class AudioEncoderOutput(ComfyTypeIO): Type = Any @comfytype(io_type="COMFY_MULTITYPED_V3") @@ -1592,6 +1592,7 @@ class _IO: Model = Model ClipVision = ClipVision ClipVisionOutput = ClipVisionOutput + AudioEncoderOutput = AudioEncoderOutput StyleModel = StyleModel Gligen = Gligen UpscaleModel = UpscaleModel From 5352abc6d389570455776c457738db54367cd6cb Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Wed, 27 Aug 2025 01:33:54 +0800 Subject: [PATCH 193/243] Update template to 0.1.66 (#9557) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 131484ce8..db59bb38c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.25.10 -comfyui-workflow-templates==0.1.65 +comfyui-workflow-templates==0.1.66 comfyui-embedded-docs==0.2.6 torch torchsde From 47f4db3e84874ca6076e5cdbb345444faec83028 Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Tue, 26 Aug 2025 19:20:44 -0700 Subject: [PATCH 194/243] Adding Google Gemini Image API node (#9566) * bigcat88's progress on adding Google Gemini Image node * Made Google Gemini Image node functional * Bump frontend version to get static pricing badge on Gemini Image node --- comfy_api_nodes/apis/gemini_api.py | 19 ++ comfy_api_nodes/nodes_gemini.py | 388 ++++++++++++++++++++++------- requirements.txt | 2 +- 3 files changed, 314 insertions(+), 95 deletions(-) create mode 100644 comfy_api_nodes/apis/gemini_api.py diff --git a/comfy_api_nodes/apis/gemini_api.py b/comfy_api_nodes/apis/gemini_api.py new file mode 100644 index 000000000..138bf035d --- /dev/null +++ b/comfy_api_nodes/apis/gemini_api.py @@ -0,0 +1,19 @@ +from __future__ import annotations + +from typing import List, Optional + +from comfy_api_nodes.apis import GeminiGenerationConfig, GeminiContent, GeminiSafetySetting, GeminiSystemInstructionContent, GeminiTool, GeminiVideoMetadata +from pydantic import BaseModel + + +class GeminiImageGenerationConfig(GeminiGenerationConfig): + responseModalities: Optional[List[str]] = None + + +class GeminiImageGenerateContentRequest(BaseModel): + contents: List[GeminiContent] + generationConfig: Optional[GeminiImageGenerationConfig] = None + safetySettings: Optional[List[GeminiSafetySetting]] = None + systemInstruction: Optional[GeminiSystemInstructionContent] = None + tools: Optional[List[GeminiTool]] = None + videoMetadata: Optional[GeminiVideoMetadata] = None diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index 78c402a7a..baa379b75 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -4,11 +4,12 @@ See: https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/infer """ from __future__ import annotations - import json import time import os import uuid +import base64 +from io import BytesIO from enum import Enum from typing import Optional, Literal @@ -25,6 +26,7 @@ from comfy_api_nodes.apis import ( GeminiPart, GeminiMimeType, ) +from 
comfy_api_nodes.apis.gemini_api import GeminiImageGenerationConfig, GeminiImageGenerateContentRequest from comfy_api_nodes.apis.client import ( ApiEndpoint, HttpMethod, @@ -35,6 +37,7 @@ from comfy_api_nodes.apinode_utils import ( audio_to_base64_string, video_to_base64_string, tensor_to_base64_string, + bytesio_to_image_tensor, ) @@ -53,6 +56,14 @@ class GeminiModel(str, Enum): gemini_2_5_flash = "gemini-2.5-flash" +class GeminiImageModel(str, Enum): + """ + Gemini Image Model Names allowed by comfy-api + """ + + gemini_2_5_flash_image_preview = "gemini-2.5-flash-image-preview" + + def get_gemini_endpoint( model: GeminiModel, ) -> ApiEndpoint[GeminiGenerateContentRequest, GeminiGenerateContentResponse]: @@ -75,6 +86,135 @@ def get_gemini_endpoint( ) +def get_gemini_image_endpoint( + model: GeminiImageModel, +) -> ApiEndpoint[GeminiGenerateContentRequest, GeminiGenerateContentResponse]: + """ + Get the API endpoint for a given Gemini model. + + Args: + model: The Gemini model to use, either as enum or string value. + + Returns: + ApiEndpoint configured for the specific Gemini model. + """ + if isinstance(model, str): + model = GeminiImageModel(model) + return ApiEndpoint( + path=f"{GEMINI_BASE_ENDPOINT}/{model.value}", + method=HttpMethod.POST, + request_model=GeminiImageGenerateContentRequest, + response_model=GeminiGenerateContentResponse, + ) + + +def create_image_parts(image_input: torch.Tensor) -> list[GeminiPart]: + """ + Convert image tensor input to Gemini API compatible parts. + + Args: + image_input: Batch of image tensors from ComfyUI. + + Returns: + List of GeminiPart objects containing the encoded images. + """ + image_parts: list[GeminiPart] = [] + for image_index in range(image_input.shape[0]): + image_as_b64 = tensor_to_base64_string( + image_input[image_index].unsqueeze(0) + ) + image_parts.append( + GeminiPart( + inlineData=GeminiInlineData( + mimeType=GeminiMimeType.image_png, + data=image_as_b64, + ) + ) + ) + return image_parts + + +def create_text_part(text: str) -> GeminiPart: + """ + Create a text part for the Gemini API request. + + Args: + text: The text content to include in the request. + + Returns: + A GeminiPart object with the text content. + """ + return GeminiPart(text=text) + + +def get_parts_from_response( + response: GeminiGenerateContentResponse +) -> list[GeminiPart]: + """ + Extract all parts from the Gemini API response. + + Args: + response: The API response from Gemini. + + Returns: + List of response parts from the first candidate. + """ + return response.candidates[0].content.parts + + +def get_parts_by_type( + response: GeminiGenerateContentResponse, part_type: Literal["text"] | str +) -> list[GeminiPart]: + """ + Filter response parts by their type. + + Args: + response: The API response from Gemini. + part_type: Type of parts to extract ("text" or a MIME type). + + Returns: + List of response parts matching the requested type. + """ + parts = [] + for part in get_parts_from_response(response): + if part_type == "text" and hasattr(part, "text") and part.text: + parts.append(part) + elif ( + hasattr(part, "inlineData") + and part.inlineData + and part.inlineData.mimeType == part_type + ): + parts.append(part) + # Skip parts that don't match the requested type + return parts + + +def get_text_from_response(response: GeminiGenerateContentResponse) -> str: + """ + Extract and concatenate all text parts from the response. + + Args: + response: The API response from Gemini. + + Returns: + Combined text from all text parts in the response. 
+ """ + parts = get_parts_by_type(response, "text") + return "\n".join([part.text for part in parts]) + + +def get_image_from_response(response: GeminiGenerateContentResponse) -> torch.Tensor: + image_tensors: list[torch.Tensor] = [] + parts = get_parts_by_type(response, "image/png") + for part in parts: + image_data = base64.b64decode(part.inlineData.data) + returned_image = bytesio_to_image_tensor(BytesIO(image_data)) + image_tensors.append(returned_image) + if len(image_tensors) == 0: + return torch.zeros((1,1024,1024,4)) + return torch.cat(image_tensors, dim=0) + + class GeminiNode(ComfyNodeABC): """ Node to generate text responses from a Gemini model. @@ -159,59 +299,6 @@ class GeminiNode(ComfyNodeABC): CATEGORY = "api node/text/Gemini" API_NODE = True - def get_parts_from_response( - self, response: GeminiGenerateContentResponse - ) -> list[GeminiPart]: - """ - Extract all parts from the Gemini API response. - - Args: - response: The API response from Gemini. - - Returns: - List of response parts from the first candidate. - """ - return response.candidates[0].content.parts - - def get_parts_by_type( - self, response: GeminiGenerateContentResponse, part_type: Literal["text"] | str - ) -> list[GeminiPart]: - """ - Filter response parts by their type. - - Args: - response: The API response from Gemini. - part_type: Type of parts to extract ("text" or a MIME type). - - Returns: - List of response parts matching the requested type. - """ - parts = [] - for part in self.get_parts_from_response(response): - if part_type == "text" and hasattr(part, "text") and part.text: - parts.append(part) - elif ( - hasattr(part, "inlineData") - and part.inlineData - and part.inlineData.mimeType == part_type - ): - parts.append(part) - # Skip parts that don't match the requested type - return parts - - def get_text_from_response(self, response: GeminiGenerateContentResponse) -> str: - """ - Extract and concatenate all text parts from the response. - - Args: - response: The API response from Gemini. - - Returns: - Combined text from all text parts in the response. - """ - parts = self.get_parts_by_type(response, "text") - return "\n".join([part.text for part in parts]) - def create_video_parts(self, video_input: IO.VIDEO, **kwargs) -> list[GeminiPart]: """ Convert video input to Gemini API compatible parts. @@ -271,43 +358,6 @@ class GeminiNode(ComfyNodeABC): ) return audio_parts - def create_image_parts(self, image_input: torch.Tensor) -> list[GeminiPart]: - """ - Convert image tensor input to Gemini API compatible parts. - - Args: - image_input: Batch of image tensors from ComfyUI. - - Returns: - List of GeminiPart objects containing the encoded images. - """ - image_parts: list[GeminiPart] = [] - for image_index in range(image_input.shape[0]): - image_as_b64 = tensor_to_base64_string( - image_input[image_index].unsqueeze(0) - ) - image_parts.append( - GeminiPart( - inlineData=GeminiInlineData( - mimeType=GeminiMimeType.image_png, - data=image_as_b64, - ) - ) - ) - return image_parts - - def create_text_part(self, text: str) -> GeminiPart: - """ - Create a text part for the Gemini API request. - - Args: - text: The text content to include in the request. - - Returns: - A GeminiPart object with the text content. 
- """ - return GeminiPart(text=text) - async def api_call( self, prompt: str, @@ -323,11 +373,11 @@ class GeminiNode(ComfyNodeABC): validate_string(prompt, strip_whitespace=False) # Create parts list with text prompt as the first part - parts: list[GeminiPart] = [self.create_text_part(prompt)] + parts: list[GeminiPart] = [create_text_part(prompt)] # Add other modal parts if images is not None: - image_parts = self.create_image_parts(images) + image_parts = create_image_parts(images) parts.extend(image_parts) if audio is not None: parts.extend(self.create_audio_parts(audio)) @@ -351,7 +401,7 @@ class GeminiNode(ComfyNodeABC): ).execute() # Get result output - output_text = self.get_text_from_response(response) + output_text = get_text_from_response(response) if unique_id and output_text: # Not a true chat history like the OpenAI Chat node. It is emulated so the frontend can show a copy button. render_spec = { @@ -462,12 +512,162 @@ class GeminiInputFiles(ComfyNodeABC): return (files,) +class GeminiImage(ComfyNodeABC): + """ + Node to generate text and image responses from a Gemini model. + + This node allows users to interact with Google's Gemini AI models, providing + multimodal inputs (text, images, files) to generate coherent + text and image responses. The node works with the latest Gemini models, handling the + API communication and response parsing. + """ + @classmethod + def INPUT_TYPES(cls) -> InputTypeDict: + return { + "required": { + "prompt": ( + IO.STRING, + { + "multiline": True, + "default": "", + "tooltip": "Text prompt for generation", + }, + ), + "model": ( + IO.COMBO, + { + "tooltip": "The Gemini model to use for generating responses.", + "options": [model.value for model in GeminiImageModel], + "default": GeminiImageModel.gemini_2_5_flash_image_preview.value, + }, + ), + "seed": ( + IO.INT, + { + "default": 42, + "min": 0, + "max": 0xFFFFFFFFFFFFFFFF, + "control_after_generate": True, + "tooltip": "When seed is fixed to a specific value, the model makes a best effort to provide the same response for repeated requests. Deterministic output isn't guaranteed. Also, changing the model or parameter settings, such as the temperature, can cause variations in the response even when you use the same seed value. By default, a random seed value is used.", + }, + ), + }, + "optional": { + "images": ( + IO.IMAGE, + { + "default": None, + "tooltip": "Optional image(s) to use as context for the model. To include multiple images, you can use the Batch Images node.", + }, + ), + "files": ( + "GEMINI_INPUT_FILES", + { + "default": None, + "tooltip": "Optional file(s) to use as context for the model. Accepts inputs from the Gemini Generate Content Input Files node.", + }, + ), + # TODO: later we can add this parameter later + # "n": ( + # IO.INT, + # { + # "default": 1, + # "min": 1, + # "max": 8, + # "step": 1, + # "display": "number", + # "tooltip": "How many images to generate", + # }, + # ), + }, + "hidden": { + "auth_token": "AUTH_TOKEN_COMFY_ORG", + "comfy_api_key": "API_KEY_COMFY_ORG", + "unique_id": "UNIQUE_ID", + }, + } + + RETURN_TYPES = (IO.IMAGE, IO.STRING) + FUNCTION = "api_call" + CATEGORY = "api node/image/Gemini" + DESCRIPTION = "Edit images synchronously via Google API." 
+ API_NODE = True + + async def api_call( + self, + prompt: str, + model: GeminiImageModel, + images: Optional[IO.IMAGE] = None, + files: Optional[list[GeminiPart]] = None, + n=1, + unique_id: Optional[str] = None, + **kwargs, + ): + # Validate inputs + validate_string(prompt, strip_whitespace=True, min_length=1) + # Create parts list with text prompt as the first part + parts: list[GeminiPart] = [create_text_part(prompt)] + + # Add other modal parts + if images is not None: + image_parts = create_image_parts(images) + parts.extend(image_parts) + if files is not None: + parts.extend(files) + + response = await SynchronousOperation( + endpoint=get_gemini_image_endpoint(model), + request=GeminiImageGenerateContentRequest( + contents=[ + GeminiContent( + role="user", + parts=parts, + ), + ], + generationConfig=GeminiImageGenerationConfig( + responseModalities=["TEXT","IMAGE"] + ) + ), + auth_kwargs=kwargs, + ).execute() + + output_image = get_image_from_response(response) + output_text = get_text_from_response(response) + if unique_id and output_text: + # Not a true chat history like the OpenAI Chat node. It is emulated so the frontend can show a copy button. + render_spec = { + "node_id": unique_id, + "component": "ChatHistoryWidget", + "props": { + "history": json.dumps( + [ + { + "prompt": prompt, + "response": output_text, + "response_id": str(uuid.uuid4()), + "timestamp": time.time(), + } + ] + ), + }, + } + PromptServer.instance.send_sync( + "display_component", + render_spec, + ) + + output_text = output_text or "Empty response from Gemini model..." + return (output_image, output_text,) + + NODE_CLASS_MAPPINGS = { "GeminiNode": GeminiNode, + "GeminiImageNode": GeminiImage, "GeminiInputFiles": GeminiInputFiles, } NODE_DISPLAY_NAME_MAPPINGS = { "GeminiNode": "Google Gemini", + "GeminiImageNode": "Google Gemini Image", "GeminiInputFiles": "Gemini Input Files", } diff --git a/requirements.txt b/requirements.txt index db59bb38c..174f3d4d1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.25.10 +comfyui-frontend-package==1.25.11 comfyui-workflow-templates==0.1.66 comfyui-embedded-docs==0.2.6 torch From 6a193ac557b2b35a6d2ea1916b0b8d5d9ee9b1ba Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Wed, 27 Aug 2025 12:10:20 +0800 Subject: [PATCH 195/243] Update template to 0.1.68 (#9569) * Update template to 0.1.67 * Update template to 0.1.68 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 174f3d4d1..93d88859d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.25.11 -comfyui-workflow-templates==0.1.66 +comfyui-workflow-templates==0.1.68 comfyui-embedded-docs==0.2.6 torch torchsde From 88aee596a30e9b80ca831c42a0ae70e0d22b61ae Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 26 Aug 2025 22:10:34 -0700 Subject: [PATCH 196/243] WIP Wan 2.2 S2V model. 
(#9568) --- comfy/ldm/wan/model.py | 508 ++++++++++++++++++++++++++++++++++++-- comfy/model_base.py | 23 ++ comfy/model_detection.py | 2 + comfy/supported_models.py | 15 +- comfy_extras/nodes_wan.py | 175 +++++++++++++ 5 files changed, 707 insertions(+), 16 deletions(-) diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index 1885d9730..dedfb47e2 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -4,7 +4,7 @@ import math import torch import torch.nn as nn -from einops import repeat +from einops import rearrange from comfy.ldm.modules.attention import optimized_attention from comfy.ldm.flux.layers import EmbedND @@ -153,7 +153,10 @@ def repeat_e(e, x): repeats = x.size(1) // e.size(1) if repeats == 1: return e - return torch.repeat_interleave(e, repeats, dim=1) + if repeats * e.size(1) == x.size(1): + return torch.repeat_interleave(e, repeats, dim=1) + else: + return torch.repeat_interleave(e, repeats + 1, dim=1)[:, :x.size(1)] class WanAttentionBlock(nn.Module): @@ -573,6 +576,28 @@ class WanModel(torch.nn.Module): x = self.unpatchify(x, grid_sizes) return x + def rope_encode(self, t, h, w, t_start=0, steps_t=None, steps_h=None, steps_w=None, device=None, dtype=None): + patch_size = self.patch_size + t_len = ((t + (patch_size[0] // 2)) // patch_size[0]) + h_len = ((h + (patch_size[1] // 2)) // patch_size[1]) + w_len = ((w + (patch_size[2] // 2)) // patch_size[2]) + + if steps_t is None: + steps_t = t_len + if steps_h is None: + steps_h = h_len + if steps_w is None: + steps_w = w_len + + img_ids = torch.zeros((steps_t, steps_h, steps_w, 3), device=device, dtype=dtype) + img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(t_start, t_start + (t_len - 1), steps=steps_t, device=device, dtype=dtype).reshape(-1, 1, 1) + img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(0, h_len - 1, steps=steps_h, device=device, dtype=dtype).reshape(1, -1, 1) + img_ids[:, :, :, 2] = img_ids[:, :, :, 2] + torch.linspace(0, w_len - 1, steps=steps_w, device=device, dtype=dtype).reshape(1, 1, -1) + img_ids = img_ids.reshape(1, -1, img_ids.shape[-1]) + + freqs = self.rope_embedder(img_ids).movedim(1, 2) + return freqs + def forward(self, x, timestep, context, clip_fea=None, time_dim_concat=None, transformer_options={}, **kwargs): return comfy.patcher_extension.WrapperExecutor.new_class_executor( self._forward, @@ -584,26 +609,16 @@ class WanModel(torch.nn.Module): bs, c, t, h, w = x.shape x = comfy.ldm.common_dit.pad_to_patch_size(x, self.patch_size) - patch_size = self.patch_size - t_len = ((t + (patch_size[0] // 2)) // patch_size[0]) - h_len = ((h + (patch_size[1] // 2)) // patch_size[1]) - w_len = ((w + (patch_size[2] // 2)) // patch_size[2]) - + t_len = t if time_dim_concat is not None: time_dim_concat = comfy.ldm.common_dit.pad_to_patch_size(time_dim_concat, self.patch_size) x = torch.cat([x, time_dim_concat], dim=2) - t_len = ((x.shape[2] + (patch_size[0] // 2)) // patch_size[0]) + t_len = x.shape[2] if self.ref_conv is not None and "reference_latent" in kwargs: t_len += 1 - img_ids = torch.zeros((t_len, h_len, w_len, 3), device=x.device, dtype=x.dtype) - img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(0, t_len - 1, steps=t_len, device=x.device, dtype=x.dtype).reshape(-1, 1, 1) - img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).reshape(1, -1, 1) - img_ids[:, :, :, 2] = img_ids[:, :, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).reshape(1, 1, -1) - img_ids 
= repeat(img_ids, "t h w c -> b (t h w) c", b=bs) - - freqs = self.rope_embedder(img_ids).movedim(1, 2) + freqs = self.rope_encode(t_len, h, w, device=x.device, dtype=x.dtype) return self.forward_orig(x, timestep, context, clip_fea=clip_fea, freqs=freqs, transformer_options=transformer_options, **kwargs)[:, :, :t, :h, :w] def unpatchify(self, x, grid_sizes): @@ -839,3 +854,466 @@ class CameraWanModel(WanModel): # unpatchify x = self.unpatchify(x, grid_sizes) return x + + +class CausalConv1d(nn.Module): + + def __init__(self, + chan_in, + chan_out, + kernel_size=3, + stride=1, + dilation=1, + pad_mode='replicate', + operations=None, + **kwargs): + super().__init__() + + self.pad_mode = pad_mode + padding = (kernel_size - 1, 0) # T + self.time_causal_padding = padding + + self.conv = operations.Conv1d( + chan_in, + chan_out, + kernel_size, + stride=stride, + dilation=dilation, + **kwargs) + + def forward(self, x): + x = torch.nn.functional.pad(x, self.time_causal_padding, mode=self.pad_mode) + return self.conv(x) + + +class MotionEncoder_tc(nn.Module): + + def __init__(self, + in_dim: int, + hidden_dim: int, + num_heads=int, + need_global=True, + dtype=None, + device=None, + operations=None,): + factory_kwargs = {"dtype": dtype, "device": device} + super().__init__() + + self.num_heads = num_heads + self.need_global = need_global + self.conv1_local = CausalConv1d(in_dim, hidden_dim // 4 * num_heads, 3, stride=1, operations=operations, **factory_kwargs) + if need_global: + self.conv1_global = CausalConv1d( + in_dim, hidden_dim // 4, 3, stride=1, operations=operations, **factory_kwargs) + self.norm1 = operations.LayerNorm( + hidden_dim // 4, + elementwise_affine=False, + eps=1e-6, + **factory_kwargs) + self.act = nn.SiLU() + self.conv2 = CausalConv1d(hidden_dim // 4, hidden_dim // 2, 3, stride=2, operations=operations, **factory_kwargs) + self.conv3 = CausalConv1d(hidden_dim // 2, hidden_dim, 3, stride=2, operations=operations, **factory_kwargs) + + if need_global: + self.final_linear = operations.Linear(hidden_dim, hidden_dim, **factory_kwargs) + + self.norm1 = operations.LayerNorm( + hidden_dim // 4, + elementwise_affine=False, + eps=1e-6, + **factory_kwargs) + + self.norm2 = operations.LayerNorm( + hidden_dim // 2, + elementwise_affine=False, + eps=1e-6, + **factory_kwargs) + + self.norm3 = operations.LayerNorm( + hidden_dim, elementwise_affine=False, eps=1e-6, **factory_kwargs) + + self.padding_tokens = nn.Parameter(torch.empty(1, 1, 1, hidden_dim, **factory_kwargs)) + + def forward(self, x): + x = rearrange(x, 'b t c -> b c t') + x_ori = x.clone() + b, c, t = x.shape + x = self.conv1_local(x) + x = rearrange(x, 'b (n c) t -> (b n) t c', n=self.num_heads) + x = self.norm1(x) + x = self.act(x) + x = rearrange(x, 'b t c -> b c t') + x = self.conv2(x) + x = rearrange(x, 'b c t -> b t c') + x = self.norm2(x) + x = self.act(x) + x = rearrange(x, 'b t c -> b c t') + x = self.conv3(x) + x = rearrange(x, 'b c t -> b t c') + x = self.norm3(x) + x = self.act(x) + x = rearrange(x, '(b n) t c -> b t n c', b=b) + padding = comfy.model_management.cast_to(self.padding_tokens, dtype=x.dtype, device=x.device).repeat(b, x.shape[1], 1, 1) + x = torch.cat([x, padding], dim=-2) + x_local = x.clone() + + if not self.need_global: + return x_local + + x = self.conv1_global(x_ori) + x = rearrange(x, 'b c t -> b t c') + x = self.norm1(x) + x = self.act(x) + x = rearrange(x, 'b t c -> b c t') + x = self.conv2(x) + x = rearrange(x, 'b c t -> b t c') + x = self.norm2(x) + x = self.act(x) + x = rearrange(x, 'b t c -> b 
c t') + x = self.conv3(x) + x = rearrange(x, 'b c t -> b t c') + x = self.norm3(x) + x = self.act(x) + x = self.final_linear(x) + x = rearrange(x, '(b n) t c -> b t n c', b=b) + + return x, x_local + + +class CausalAudioEncoder(nn.Module): + + def __init__(self, + dim=5120, + num_layers=25, + out_dim=2048, + video_rate=8, + num_token=4, + need_global=False, + dtype=None, + device=None, + operations=None): + super().__init__() + self.encoder = MotionEncoder_tc( + in_dim=dim, + hidden_dim=out_dim, + num_heads=num_token, + need_global=need_global, dtype=dtype, device=device, operations=operations) + weight = torch.empty((1, num_layers, 1, 1), dtype=dtype, device=device) + + self.weights = torch.nn.Parameter(weight) + self.act = torch.nn.SiLU() + + def forward(self, features): + # features B * num_layers * dim * video_length + weights = self.act(comfy.model_management.cast_to(self.weights, dtype=features.dtype, device=features.device)) + weights_sum = weights.sum(dim=1, keepdims=True) + weighted_feat = ((features * weights) / weights_sum).sum( + dim=1) # b dim f + weighted_feat = weighted_feat.permute(0, 2, 1) # b f dim + res = self.encoder(weighted_feat) # b f n dim + return res # b f n dim + + +class AdaLayerNorm(nn.Module): + def __init__(self, embedding_dim, output_dim=None, norm_elementwise_affine=False, norm_eps=1e-5, dtype=None, device=None, operations=None): + super().__init__() + + output_dim = output_dim or embedding_dim * 2 + + self.silu = nn.SiLU() + self.linear = operations.Linear(embedding_dim, output_dim, dtype=dtype, device=device) + self.norm = operations.LayerNorm(output_dim // 2, norm_eps, norm_elementwise_affine, dtype=dtype, device=device) + + def forward(self, x, temb): + temb = self.linear(self.silu(temb)) + shift, scale = temb.chunk(2, dim=1) + shift = shift[:, None, :] + scale = scale[:, None, :] + x = self.norm(x) * (1 + scale) + shift + return x + + +class AudioInjector_WAN(nn.Module): + + def __init__(self, + dim=2048, + num_heads=32, + inject_layer=[0, 27], + root_net=None, + enable_adain=False, + adain_dim=2048, + adain_mode=None, + dtype=None, + device=None, + operations=None): + super().__init__() + self.enable_adain = enable_adain + self.adain_mode = adain_mode + self.injected_block_id = {} + audio_injector_id = 0 + for inject_id in inject_layer: + self.injected_block_id[inject_id] = audio_injector_id + audio_injector_id += 1 + + self.injector = nn.ModuleList([ + WanT2VCrossAttention( + dim=dim, + num_heads=num_heads, + qk_norm=True, operation_settings={"operations": operations, "device": device, "dtype": dtype} + ) for _ in range(audio_injector_id) + ]) + self.injector_pre_norm_feat = nn.ModuleList([ + operations.LayerNorm( + dim, + elementwise_affine=False, + eps=1e-6, dtype=dtype, device=device + ) for _ in range(audio_injector_id) + ]) + self.injector_pre_norm_vec = nn.ModuleList([ + operations.LayerNorm( + dim, + elementwise_affine=False, + eps=1e-6, dtype=dtype, device=device + ) for _ in range(audio_injector_id) + ]) + if enable_adain: + self.injector_adain_layers = nn.ModuleList([ + AdaLayerNorm( + output_dim=dim * 2, embedding_dim=adain_dim, dtype=dtype, device=device, operations=operations) + for _ in range(audio_injector_id) + ]) + if adain_mode != "attn_norm": + self.injector_adain_output_layers = nn.ModuleList( + [operations.Linear(dim, dim, dtype=dtype, device=device) for _ in range(audio_injector_id)]) + + def forward(self, x, block_id, audio_emb, audio_emb_global, seq_len): + audio_attn_id = self.injected_block_id.get(block_id, None) + if 
audio_attn_id is None: + return x + + num_frames = audio_emb.shape[1] + input_hidden_states = rearrange(x[:, :seq_len], "b (t n) c -> (b t) n c", t=num_frames) + if self.enable_adain and self.adain_mode == "attn_norm": + audio_emb_global = rearrange(audio_emb_global, "b t n c -> (b t) n c") + adain_hidden_states = self.injector_adain_layers[audio_attn_id](input_hidden_states, temb=audio_emb_global[:, 0]) + attn_hidden_states = adain_hidden_states + else: + attn_hidden_states = self.injector_pre_norm_feat[audio_attn_id](input_hidden_states) + audio_emb = rearrange(audio_emb, "b t n c -> (b t) n c", t=num_frames) + attn_audio_emb = audio_emb + residual_out = self.injector[audio_attn_id](x=attn_hidden_states, context=attn_audio_emb) + residual_out = rearrange( + residual_out, "(b t) n c -> b (t n) c", t=num_frames) + x[:, :seq_len] = x[:, :seq_len] + residual_out + return x + + +class FramePackMotioner(nn.Module): + def __init__( + self, + inner_dim=1024, + num_heads=16, # Used to indicate the number of heads in the backbone network; unrelated to this module's design + zip_frame_buckets=[ + 1, 2, 16 + ], # Three numbers representing the number of frames sampled for patch operations from the nearest to the farthest frames + drop_mode="drop", # If not "drop", it will use "padd", meaning padding instead of deletion + dtype=None, + device=None, + operations=None): + super().__init__() + self.proj = operations.Conv3d(16, inner_dim, kernel_size=(1, 2, 2), stride=(1, 2, 2), dtype=dtype, device=device) + self.proj_2x = operations.Conv3d(16, inner_dim, kernel_size=(2, 4, 4), stride=(2, 4, 4), dtype=dtype, device=device) + self.proj_4x = operations.Conv3d(16, inner_dim, kernel_size=(4, 8, 8), stride=(4, 8, 8), dtype=dtype, device=device) + self.zip_frame_buckets = zip_frame_buckets + + self.inner_dim = inner_dim + self.num_heads = num_heads + + self.drop_mode = drop_mode + + def forward(self, motion_latents, rope_embedder, add_last_motion=2): + lat_height, lat_width = motion_latents.shape[3], motion_latents.shape[4] + padd_lat = torch.zeros(motion_latents.shape[0], 16, sum(self.zip_frame_buckets), lat_height, lat_width).to(device=motion_latents.device, dtype=motion_latents.dtype) + overlap_frame = min(padd_lat.shape[2], motion_latents.shape[2]) + if overlap_frame > 0: + padd_lat[:, :, -overlap_frame:] = motion_latents[:, :, -overlap_frame:] + + if add_last_motion < 2 and self.drop_mode != "drop": + zero_end_frame = sum(self.zip_frame_buckets[:len(self.zip_frame_buckets) - add_last_motion - 1]) + padd_lat[:, :, -zero_end_frame:] = 0 + + clean_latents_4x, clean_latents_2x, clean_latents_post = padd_lat[:, :, -sum(self.zip_frame_buckets):, :, :].split(self.zip_frame_buckets[::-1], dim=2) # 16, 2 ,1 + + # patchfy + clean_latents_post = self.proj(clean_latents_post).flatten(2).transpose(1, 2) + clean_latents_2x = self.proj_2x(clean_latents_2x) + l_2x_shape = clean_latents_2x.shape + clean_latents_2x = clean_latents_2x.flatten(2).transpose(1, 2) + clean_latents_4x = self.proj_4x(clean_latents_4x) + l_4x_shape = clean_latents_4x.shape + clean_latents_4x = clean_latents_4x.flatten(2).transpose(1, 2) + + if add_last_motion < 2 and self.drop_mode == "drop": + clean_latents_post = clean_latents_post[:, : + 0] if add_last_motion < 2 else clean_latents_post + clean_latents_2x = clean_latents_2x[:, : + 0] if add_last_motion < 1 else clean_latents_2x + + motion_lat = torch.cat([clean_latents_post, clean_latents_2x, clean_latents_4x], dim=1) + + rope_post = rope_embedder.rope_encode(1, lat_height, lat_width, 
t_start=-1, device=motion_latents.device, dtype=motion_latents.dtype) + rope_2x = rope_embedder.rope_encode(1, lat_height, lat_width, t_start=-3, steps_h=l_2x_shape[-2], steps_w=l_2x_shape[-1], device=motion_latents.device, dtype=motion_latents.dtype) + rope_4x = rope_embedder.rope_encode(4, lat_height, lat_width, t_start=-19, steps_h=l_4x_shape[-2], steps_w=l_4x_shape[-1], device=motion_latents.device, dtype=motion_latents.dtype) + + rope = torch.cat([rope_post, rope_2x, rope_4x], dim=1) + return motion_lat, rope + + +class WanModel_S2V(WanModel): + def __init__(self, + model_type='s2v', + patch_size=(1, 2, 2), + text_len=512, + in_dim=16, + dim=2048, + ffn_dim=8192, + freq_dim=256, + text_dim=4096, + out_dim=16, + num_heads=16, + num_layers=32, + window_size=(-1, -1), + qk_norm=True, + cross_attn_norm=True, + eps=1e-6, + audio_dim=1024, + num_audio_token=4, + enable_adain=True, + cond_dim=16, + audio_inject_layers=[0, 4, 8, 12, 16, 20, 24, 27, 30, 33, 36, 39], + adain_mode="attn_norm", + framepack_drop_mode="padd", + image_model=None, + device=None, + dtype=None, + operations=None, + ): + + super().__init__(model_type='t2v', patch_size=patch_size, text_len=text_len, in_dim=in_dim, dim=dim, ffn_dim=ffn_dim, freq_dim=freq_dim, text_dim=text_dim, out_dim=out_dim, num_heads=num_heads, num_layers=num_layers, window_size=window_size, qk_norm=qk_norm, cross_attn_norm=cross_attn_norm, eps=eps, image_model=image_model, device=device, dtype=dtype, operations=operations) + + self.trainable_cond_mask = operations.Embedding(3, self.dim, device=device, dtype=dtype) + + self.casual_audio_encoder = CausalAudioEncoder( + dim=audio_dim, + out_dim=self.dim, + num_token=num_audio_token, + need_global=enable_adain, dtype=dtype, device=device, operations=operations) + + if cond_dim > 0: + self.cond_encoder = operations.Conv3d( + cond_dim, + self.dim, + kernel_size=self.patch_size, + stride=self.patch_size, device=device, dtype=dtype) + + self.audio_injector = AudioInjector_WAN( + dim=self.dim, + num_heads=self.num_heads, + inject_layer=audio_inject_layers, + root_net=self, + enable_adain=enable_adain, + adain_dim=self.dim, + adain_mode=adain_mode, + dtype=dtype, device=device, operations=operations + ) + + self.frame_packer = FramePackMotioner( + inner_dim=self.dim, + num_heads=self.num_heads, + zip_frame_buckets=[1, 2, 16], + drop_mode=framepack_drop_mode, + dtype=dtype, device=device, operations=operations) + + def forward_orig( + self, + x, + t, + context, + audio_embed=None, + reference_latent=None, + control_video=None, + reference_motion=None, + clip_fea=None, + freqs=None, + transformer_options={}, + **kwargs, + ): + if audio_embed is not None: + num_embeds = x.shape[-3] * 4 + audio_emb_global, audio_emb = self.casual_audio_encoder(audio_embed[:, :, :, :num_embeds]) + else: + audio_emb = None + + # embeddings + x = self.patch_embedding(x.float()).to(x.dtype) + if control_video is not None: + x = x + self.cond_encoder(control_video) + + if t.ndim == 1: + t = t.unsqueeze(1).repeat(1, x.shape[2]) + + grid_sizes = x.shape[2:] + x = x.flatten(2).transpose(1, 2) + seq_len = x.size(1) + + cond_mask_weight = comfy.model_management.cast_to(self.trainable_cond_mask.weight, dtype=x.dtype, device=x.device).unsqueeze(1).unsqueeze(1) + x = x + cond_mask_weight[0] + + if reference_latent is not None: + ref = self.patch_embedding(reference_latent.float()).to(x.dtype) + ref = ref.flatten(2).transpose(1, 2) + freqs_ref = self.rope_encode(reference_latent.shape[-3], reference_latent.shape[-2], reference_latent.shape[-1], 
t_start=30, device=x.device, dtype=x.dtype) + ref = ref + cond_mask_weight[1] + x = torch.cat([x, ref], dim=1) + freqs = torch.cat([freqs, freqs_ref], dim=1) + t = torch.cat([t, torch.zeros((t.shape[0], reference_latent.shape[-3]), device=t.device, dtype=t.dtype)], dim=1) + + if reference_motion is not None: + motion_encoded, freqs_motion = self.frame_packer(reference_motion, self) + motion_encoded = motion_encoded + cond_mask_weight[2] + x = torch.cat([x, motion_encoded], dim=1) + freqs = torch.cat([freqs, freqs_motion], dim=1) + + t = torch.repeat_interleave(t, 2, dim=1) + t = torch.cat([t, torch.zeros((t.shape[0], 3), device=t.device, dtype=t.dtype)], dim=1) + + # time embeddings + e = self.time_embedding( + sinusoidal_embedding_1d(self.freq_dim, t.flatten()).to(dtype=x[0].dtype)) + e = e.reshape(t.shape[0], -1, e.shape[-1]) + e0 = self.time_projection(e).unflatten(2, (6, self.dim)) + + # context + context = self.text_embedding(context) + + + patches_replace = transformer_options.get("patches_replace", {}) + blocks_replace = patches_replace.get("dit", {}) + for i, block in enumerate(self.blocks): + if ("double_block", i) in blocks_replace: + def block_wrap(args): + out = {} + out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"]) + return out + out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs}, {"original_block": block_wrap}) + x = out["img"] + else: + x = block(x, e=e0, freqs=freqs, context=context) + if audio_emb is not None: + x = self.audio_injector(x, i, audio_emb, audio_emb_global, seq_len) + # head + x = self.head(x, e) + + # unpatchify + x = self.unpatchify(x, grid_sizes) + return x diff --git a/comfy/model_base.py b/comfy/model_base.py index 6c861b15e..18d55c1c4 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1201,6 +1201,29 @@ class WAN21_Camera(WAN21): out['camera_conditions'] = comfy.conds.CONDRegular(camera_conditions) return out +class WAN22_S2V(WAN21): + def __init__(self, model_config, model_type=ModelType.FLOW, device=None): + super(WAN21, self).__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.WanModel_S2V) + + def extra_conds(self, **kwargs): + out = super().extra_conds(**kwargs) + audio_embed = kwargs.get("audio_embed", None) + if audio_embed is not None: + out['audio_embed'] = comfy.conds.CONDRegular(audio_embed) + + reference_latents = kwargs.get("reference_latents", None) + if reference_latents is not None: + out['reference_latent'] = comfy.conds.CONDRegular(self.process_latent_in(reference_latents[-1])) + + reference_motion = kwargs.get("reference_motion", None) + if reference_motion is not None: + out['reference_motion'] = comfy.conds.CONDRegular(self.process_latent_in(reference_motion)) + + control_video = kwargs.get("control_video", None) + if control_video is not None: + out['control_video'] = comfy.conds.CONDRegular(self.process_latent_in(control_video)) + return out + class WAN22(BaseModel): def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=False, device=None): super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.WanModel) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 0caff53e0..9f3ab64df 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -368,6 +368,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["model_type"] = "camera" else: dit_config["model_type"] = "camera_2.2" + elif 
'{}casual_audio_encoder.encoder.final_linear.weight'.format(key_prefix) in state_dict_keys: + dit_config["model_type"] = "s2v" else: if '{}img_emb.proj.0.bias'.format(key_prefix) in state_dict_keys: dit_config["model_type"] = "i2v" diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 7ed6dfd69..ce571e6cb 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1072,6 +1072,19 @@ class WAN21_Vace(WAN21_T2V): out = model_base.WAN21_Vace(self, image_to_video=False, device=device) return out +class WAN22_S2V(WAN21_T2V): + unet_config = { + "image_model": "wan2.1", + "model_type": "s2v", + } + + def __init__(self, unet_config): + super().__init__(unet_config) + + def get_model(self, state_dict, prefix="", device=None): + out = model_base.WAN22_S2V(self, device=device) + return out + class WAN22_T2V(WAN21_T2V): unet_config = { "image_model": "wan2.1", @@ -1272,6 +1285,6 @@ class QwenImage(supported_models_base.BASE): return supported_models_base.ClipTarget(comfy.text_encoders.qwen_image.QwenImageTokenizer, comfy.text_encoders.qwen_image.te(**hunyuan_detect)) -models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma, ACEStep, Omnigen2, QwenImage] +models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma, ACEStep, Omnigen2, QwenImage] models += [SVD_img2vid] diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 0fff02f76..89ff74d85 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -786,6 +786,180 @@ class WanTrackToVideo(io.ComfyNode): return io.NodeOutput(positive, negative, out_latent) +def linear_interpolation(features, input_fps, output_fps, output_len=None): + """ + features: shape=[1, T, 512] + input_fps: fps for audio, f_a + output_fps: fps for video, f_m + output_len: video length + """ + features = features.transpose(1, 2) # [1, 512, T] + seq_len = features.shape[2] / float(input_fps) # T/f_a + if output_len is None: + output_len = int(seq_len * output_fps) # f_m*T/f_a + output_features = torch.nn.functional.interpolate( + features, size=output_len, align_corners=True, + mode='linear') # [1, 512, output_len] + return output_features.transpose(1, 2) # [1, output_len, 512] + + +def get_sample_indices(original_fps, + total_frames, + target_fps, + num_sample, + fixed_start=None): + required_duration = num_sample / target_fps + required_origin_frames = 
int(np.ceil(required_duration * original_fps)) + if required_duration > total_frames / original_fps: + raise ValueError("required_duration must be less than video length") + + if not fixed_start is None and fixed_start >= 0: + start_frame = fixed_start + else: + max_start = total_frames - required_origin_frames + if max_start < 0: + raise ValueError("video length is too short") + start_frame = np.random.randint(0, max_start + 1) + start_time = start_frame / original_fps + + end_time = start_time + required_duration + time_points = np.linspace(start_time, end_time, num_sample, endpoint=False) + + frame_indices = np.round(np.array(time_points) * original_fps).astype(int) + frame_indices = np.clip(frame_indices, 0, total_frames - 1) + return frame_indices + + +def get_audio_embed_bucket_fps(audio_embed, fps=16, batch_frames=81, m=0, video_rate=30): + num_layers, audio_frame_num, audio_dim = audio_embed.shape + + if num_layers > 1: + return_all_layers = True + else: + return_all_layers = False + + scale = video_rate / fps + + min_batch_num = int(audio_frame_num / (batch_frames * scale)) + 1 + + bucket_num = min_batch_num * batch_frames + padd_audio_num = math.ceil(min_batch_num * batch_frames / fps * video_rate) - audio_frame_num + batch_idx = get_sample_indices( + original_fps=video_rate, + total_frames=audio_frame_num + padd_audio_num, + target_fps=fps, + num_sample=bucket_num, + fixed_start=0) + batch_audio_eb = [] + audio_sample_stride = int(video_rate / fps) + for bi in batch_idx: + if bi < audio_frame_num: + + chosen_idx = list( + range(bi - m * audio_sample_stride, bi + (m + 1) * audio_sample_stride, audio_sample_stride)) + chosen_idx = [0 if c < 0 else c for c in chosen_idx] + chosen_idx = [ + audio_frame_num - 1 if c >= audio_frame_num else c + for c in chosen_idx + ] + + if return_all_layers: + frame_audio_embed = audio_embed[:, chosen_idx].flatten( + start_dim=-2, end_dim=-1) + else: + frame_audio_embed = audio_embed[0][chosen_idx].flatten() + else: + frame_audio_embed = torch.zeros([audio_dim * (2 * m + 1)], device=audio_embed.device) if not return_all_layers \ + else torch.zeros([num_layers, audio_dim * (2 * m + 1)], device=audio_embed.device) + batch_audio_eb.append(frame_audio_embed) + batch_audio_eb = torch.cat([c.unsqueeze(0) for c in batch_audio_eb], dim=0) + + return batch_audio_eb, min_batch_num + + +class WanSoundImageToVideo(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="WanSoundImageToVideo", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=77, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.AudioEncoderOutput.Input("audio_encoder_output", optional=True), + io.Image.Input("ref_image", optional=True), + io.Image.Input("control_video", optional=True), + io.Image.Input("ref_motion", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + is_experimental=True, + ) + + @classmethod + def execute(cls, positive, negative, vae, width, height, length, batch_size, ref_image=None, audio_encoder_output=None, control_video=None, ref_motion=None) -> io.NodeOutput: + 
latent_t = ((length - 1) // 4) + 1 + if audio_encoder_output is not None: + feat = torch.cat(audio_encoder_output["encoded_audio_all_layers"]) + video_rate = 30 + fps = 16 + feat = linear_interpolation(feat, input_fps=50, output_fps=video_rate) + audio_embed_bucket, num_repeat = get_audio_embed_bucket_fps(feat, fps=fps, batch_frames=latent_t * 4, m=0, video_rate=video_rate) + audio_embed_bucket = audio_embed_bucket.unsqueeze(0) + if len(audio_embed_bucket.shape) == 3: + audio_embed_bucket = audio_embed_bucket.permute(0, 2, 1) + elif len(audio_embed_bucket.shape) == 4: + audio_embed_bucket = audio_embed_bucket.permute(0, 2, 3, 1) + + positive = node_helpers.conditioning_set_values(positive, {"audio_embed": audio_embed_bucket}) + negative = node_helpers.conditioning_set_values(negative, {"audio_embed": audio_embed_bucket}) + + if ref_image is not None: + ref_image = comfy.utils.common_upscale(ref_image[:1].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + ref_latent = vae.encode(ref_image[:, :, :, :3]) + positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [ref_latent]}, append=True) + negative = node_helpers.conditioning_set_values(negative, {"reference_latents": [ref_latent]}, append=True) + + if ref_motion is not None: + if ref_motion.shape[0] > 73: + ref_motion = ref_motion[-73:] + + ref_motion = comfy.utils.common_upscale(ref_motion.movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + + if ref_motion.shape[0] < 73: + r = torch.ones([73, height, width, 3]) * 0.5 + r[-ref_motion.shape[0]:] = ref_motion + ref_motion = r + + ref_motion = vae.encode(ref_motion[:, :, :, :3]) + positive = node_helpers.conditioning_set_values(positive, {"reference_motion": ref_motion}) + negative = node_helpers.conditioning_set_values(negative, {"reference_motion": ref_motion}) + + latent = torch.zeros([batch_size, 16, latent_t, height // 8, width // 8], device=comfy.model_management.intermediate_device()) + + control_video_out = comfy.latent_formats.Wan21().process_out(torch.zeros_like(latent)) + if control_video is not None: + control_video = comfy.utils.common_upscale(control_video[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + control_video = vae.encode(control_video[:, :, :, :3]) + control_video_out[:, :, :control_video.shape[2]] = control_video + + # TODO: check if zero is better than none if none provided + positive = node_helpers.conditioning_set_values(positive, {"control_video": control_video_out}) + negative = node_helpers.conditioning_set_values(negative, {"control_video": control_video_out}) + + out_latent = {} + out_latent["samples"] = latent + return io.NodeOutput(positive, negative, out_latent) + + class Wan22ImageToVideoLatent(io.ComfyNode): @classmethod def define_schema(cls): @@ -844,6 +1018,7 @@ class WanExtension(ComfyExtension): TrimVideoLatent, WanCameraImageToVideo, WanPhantomSubjectToVideo, + WanSoundImageToVideo, Wan22ImageToVideoLatent, ] From 31a37686d02aeaba8ea827933832be7601b31fac Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 27 Aug 2025 09:44:29 -0700 Subject: [PATCH 197/243] Negative audio in s2v should be zeros. 
(#9578) --- comfy_extras/nodes_wan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 89ff74d85..312260f00 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -920,7 +920,7 @@ class WanSoundImageToVideo(io.ComfyNode): audio_embed_bucket = audio_embed_bucket.permute(0, 2, 3, 1) positive = node_helpers.conditioning_set_values(positive, {"audio_embed": audio_embed_bucket}) - negative = node_helpers.conditioning_set_values(negative, {"audio_embed": audio_embed_bucket}) + negative = node_helpers.conditioning_set_values(negative, {"audio_embed": audio_embed_bucket * 0.0}) if ref_image is not None: ref_image = comfy.utils.common_upscale(ref_image[:1].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) From b20ba1f27cbd4e1c84cf8ec72b345723de9e7c80 Mon Sep 17 00:00:00 2001 From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com> Date: Thu, 28 Aug 2025 00:45:02 +0800 Subject: [PATCH 198/243] Fix #9537 (#9576) --- comfy/weight_adapter/lokr.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/comfy/weight_adapter/lokr.py b/comfy/weight_adapter/lokr.py index 49b0be55f..563c835f5 100644 --- a/comfy/weight_adapter/lokr.py +++ b/comfy/weight_adapter/lokr.py @@ -97,6 +97,9 @@ class LoKrAdapter(WeightAdapterBase): (mat1, mat2, alpha, None, None, None, None, None, None) ) + def to_train(self): + return LokrDiff(self.weights) + @classmethod def load( cls, From b5ac6ed7ce73294e0025ffe3b16452d8434b83c7 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 27 Aug 2025 12:26:28 -0700 Subject: [PATCH 199/243] Fixes to make controlnet type models work on qwen edit and kontext. (#9581) --- comfy/ldm/flux/model.py | 4 ++-- comfy/ldm/qwen_image/model.py | 2 +- comfy_extras/nodes_model_patch.py | 8 +++++--- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/comfy/ldm/flux/model.py b/comfy/ldm/flux/model.py index 0a77fa097..1344c3a57 100644 --- a/comfy/ldm/flux/model.py +++ b/comfy/ldm/flux/model.py @@ -158,7 +158,7 @@ class Flux(nn.Module): if i < len(control_i): add = control_i[i] if add is not None: - img += add + img[:, :add.shape[1]] += add if img.dtype == torch.float16: img = torch.nan_to_num(img, nan=0.0, posinf=65504, neginf=-65504) @@ -189,7 +189,7 @@ class Flux(nn.Module): if i < len(control_o): add = control_o[i] if add is not None: - img[:, txt.shape[1] :, ...] += add + img[:, txt.shape[1] : txt.shape[1] + add.shape[1], ...] += add img = img[:, txt.shape[1] :, ...] 
diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index 57a458210..04071f31c 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -459,7 +459,7 @@ class QwenImageTransformer2DModel(nn.Module): if i < len(control_i): add = control_i[i] if add is not None: - hidden_states += add + hidden_states[:, :add.shape[1]] += add hidden_states = self.norm_out(hidden_states, temb) hidden_states = self.proj_out(hidden_states) diff --git a/comfy_extras/nodes_model_patch.py b/comfy_extras/nodes_model_patch.py index 3eaada9bc..32c40ced3 100644 --- a/comfy_extras/nodes_model_patch.py +++ b/comfy_extras/nodes_model_patch.py @@ -89,6 +89,7 @@ class DiffSynthCnetPatch: self.strength = strength self.mask = mask self.encoded_image = model_patch.model.process_input_latent_image(self.encode_latent_cond(image)) + self.encoded_image_size = (image.shape[1], image.shape[2]) def encode_latent_cond(self, image): latent_image = self.vae.encode(image) @@ -106,14 +107,15 @@ class DiffSynthCnetPatch: x = kwargs.get("x") img = kwargs.get("img") block_index = kwargs.get("block_index") - if self.encoded_image is None or self.encoded_image.shape[1:] != img.shape[1:]: - spacial_compression = self.vae.spacial_compression_encode() + spacial_compression = self.vae.spacial_compression_encode() + if self.encoded_image is None or self.encoded_image_size != (x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression): image_scaled = comfy.utils.common_upscale(self.image.movedim(-1, 1), x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression, "area", "center") loaded_models = comfy.model_management.loaded_models(only_currently_used=True) self.encoded_image = self.model_patch.model.process_input_latent_image(self.encode_latent_cond(image_scaled.movedim(1, -1))) + self.encoded_image_size = (image_scaled.shape[-2], image_scaled.shape[-1]) comfy.model_management.load_models_gpu(loaded_models) - img = img + (self.model_patch.model.control_block(img, self.encoded_image.to(img.dtype), block_index) * self.strength) + img[:, :self.encoded_image.shape[1]] += (self.model_patch.model.control_block(img[:, :self.encoded_image.shape[1]], self.encoded_image.to(img.dtype), block_index) * self.strength) kwargs['img'] = img return kwargs From 496888fd68813033c260195bf70e4d11181e5454 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 27 Aug 2025 13:06:40 -0700 Subject: [PATCH 200/243] Improve s2v performance when generating videos longer than 120 frames. 
(#9582) --- comfy/ldm/wan/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index dedfb47e2..e70446c86 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -1255,6 +1255,7 @@ class WanModel_S2V(WanModel): audio_emb = None # embeddings + bs, _, time, height, width = x.shape x = self.patch_embedding(x.float()).to(x.dtype) if control_video is not None: x = x + self.cond_encoder(control_video) @@ -1272,7 +1273,7 @@ class WanModel_S2V(WanModel): if reference_latent is not None: ref = self.patch_embedding(reference_latent.float()).to(x.dtype) ref = ref.flatten(2).transpose(1, 2) - freqs_ref = self.rope_encode(reference_latent.shape[-3], reference_latent.shape[-2], reference_latent.shape[-1], t_start=30, device=x.device, dtype=x.dtype) + freqs_ref = self.rope_encode(reference_latent.shape[-3], reference_latent.shape[-2], reference_latent.shape[-1], t_start=max(30, time + 9), device=x.device, dtype=x.dtype) ref = ref + cond_mask_weight[1] x = torch.cat([x, ref], dim=1) freqs = torch.cat([freqs, freqs_ref], dim=1) @@ -1296,7 +1297,6 @@ class WanModel_S2V(WanModel): # context context = self.text_embedding(context) - patches_replace = transformer_options.get("patches_replace", {}) blocks_replace = patches_replace.get("dit", {}) for i, block in enumerate(self.blocks): From 491755325cc189d0aa1513b12fac738c87e38de6 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 27 Aug 2025 16:02:42 -0700 Subject: [PATCH 201/243] Better s2v memory estimation. (#9584) --- comfy/ldm/wan/model.py | 2 ++ comfy/model_base.py | 25 +++++++++++++++++++++++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index e70446c86..47857dc2b 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -1278,6 +1278,7 @@ class WanModel_S2V(WanModel): x = torch.cat([x, ref], dim=1) freqs = torch.cat([freqs, freqs_ref], dim=1) t = torch.cat([t, torch.zeros((t.shape[0], reference_latent.shape[-3]), device=t.device, dtype=t.dtype)], dim=1) + del ref, freqs_ref if reference_motion is not None: motion_encoded, freqs_motion = self.frame_packer(reference_motion, self) @@ -1287,6 +1288,7 @@ class WanModel_S2V(WanModel): t = torch.repeat_interleave(t, 2, dim=1) t = torch.cat([t, torch.zeros((t.shape[0], 3), device=t.device, dtype=t.dtype)], dim=1) + del motion_encoded, freqs_motion # time embeddings e = self.time_embedding( diff --git a/comfy/model_base.py b/comfy/model_base.py index 18d55c1c4..ce29fdc49 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -150,6 +150,7 @@ class BaseModel(torch.nn.Module): logging.debug("adm {}".format(self.adm_channels)) self.memory_usage_factor = model_config.memory_usage_factor self.memory_usage_factor_conds = () + self.memory_usage_shape_process = {} def apply_model(self, x, t, c_concat=None, c_crossattn=None, control=None, transformer_options={}, **kwargs): return comfy.patcher_extension.WrapperExecutor.new_class_executor( @@ -350,8 +351,15 @@ class BaseModel(torch.nn.Module): input_shapes = [input_shape] for c in self.memory_usage_factor_conds: shape = cond_shapes.get(c, None) - if shape is not None and len(shape) > 0: - input_shapes += shape + if shape is not None: + if c in self.memory_usage_shape_process: + out = [] + for s in shape: + out.append(self.memory_usage_shape_process[c](s)) + shape = out + + if len(shape) > 0: + input_shapes += shape if 
comfy.model_management.xformers_enabled() or comfy.model_management.pytorch_attention_flash_attention(): dtype = self.get_dtype() @@ -1204,6 +1212,8 @@ class WAN21_Camera(WAN21): class WAN22_S2V(WAN21): def __init__(self, model_config, model_type=ModelType.FLOW, device=None): super(WAN21, self).__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.WanModel_S2V) + self.memory_usage_factor_conds = ("reference_latent", "reference_motion") + self.memory_usage_shape_process = {"reference_motion": lambda shape: [shape[0], shape[1], 1.5, shape[-2], shape[-1]]} def extra_conds(self, **kwargs): out = super().extra_conds(**kwargs) @@ -1224,6 +1234,17 @@ class WAN22_S2V(WAN21): out['control_video'] = comfy.conds.CONDRegular(self.process_latent_in(control_video)) return out + def extra_conds_shapes(self, **kwargs): + out = {} + ref_latents = kwargs.get("reference_latents", None) + if ref_latents is not None: + out['reference_latent'] = list([1, 16, sum(map(lambda a: math.prod(a.size()), ref_latents)) // 16]) + + reference_motion = kwargs.get("reference_motion", None) + if reference_motion is not None: + out['reference_motion'] = reference_motion.shape + return out + class WAN22(BaseModel): def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=False, device=None): super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.WanModel) From 3aad339b63f03e17dc6ebae035b90afc2fefb627 Mon Sep 17 00:00:00 2001 From: Gangin Park Date: Thu, 28 Aug 2025 08:07:31 +0900 Subject: [PATCH 202/243] Add DPM++ 2M SDE Heun (RES) sampler (#9542) --- comfy/k_diffusion/sampling.py | 15 +++++++++++++++ comfy/samplers.py | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) mode change 100644 => 100755 comfy/samplers.py diff --git a/comfy/k_diffusion/sampling.py b/comfy/k_diffusion/sampling.py index a2bc492fd..fe6844b17 100644 --- a/comfy/k_diffusion/sampling.py +++ b/comfy/k_diffusion/sampling.py @@ -853,6 +853,11 @@ def sample_dpmpp_2m_sde(model, x, sigmas, extra_args=None, callback=None, disabl return x +@torch.no_grad() +def sample_dpmpp_2m_sde_heun(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, solver_type='heun'): + return sample_dpmpp_2m_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler, solver_type=solver_type) + + @torch.no_grad() def sample_dpmpp_3m_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None): """DPM-Solver++(3M) SDE.""" @@ -925,6 +930,16 @@ def sample_dpmpp_3m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, di return sample_dpmpp_3m_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler) +@torch.no_grad() +def sample_dpmpp_2m_sde_heun_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, solver_type='heun'): + if len(sigmas) <= 1: + return x + extra_args = {} if extra_args is None else extra_args + sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max() + noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler + return sample_dpmpp_2m_sde_heun(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler, 
solver_type=solver_type) + + @torch.no_grad() def sample_dpmpp_2m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, solver_type='midpoint'): if len(sigmas) <= 1: diff --git a/comfy/samplers.py b/comfy/samplers.py old mode 100644 new mode 100755 index c7dfef4ea..b3202cec6 --- a/comfy/samplers.py +++ b/comfy/samplers.py @@ -729,7 +729,7 @@ class Sampler: KSAMPLER_NAMES = ["euler", "euler_cfg_pp", "euler_ancestral", "euler_ancestral_cfg_pp", "heun", "heunpp2","dpm_2", "dpm_2_ancestral", "lms", "dpm_fast", "dpm_adaptive", "dpmpp_2s_ancestral", "dpmpp_2s_ancestral_cfg_pp", "dpmpp_sde", "dpmpp_sde_gpu", - "dpmpp_2m", "dpmpp_2m_cfg_pp", "dpmpp_2m_sde", "dpmpp_2m_sde_gpu", "dpmpp_3m_sde", "dpmpp_3m_sde_gpu", "ddpm", "lcm", + "dpmpp_2m", "dpmpp_2m_cfg_pp", "dpmpp_2m_sde", "dpmpp_2m_sde_gpu", "dpmpp_2m_sde_heun", "dpmpp_2m_sde_heun_gpu", "dpmpp_3m_sde", "dpmpp_3m_sde_gpu", "ddpm", "lcm", "ipndm", "ipndm_v", "deis", "res_multistep", "res_multistep_cfg_pp", "res_multistep_ancestral", "res_multistep_ancestral_cfg_pp", "gradient_estimation", "gradient_estimation_cfg_pp", "er_sde", "seeds_2", "seeds_3", "sa_solver", "sa_solver_pece"] From 38f697d953c3989db67e543795768bf954ae0231 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 27 Aug 2025 19:28:10 -0700 Subject: [PATCH 203/243] Add a LatentConcat node. (#9587) --- comfy_extras/nodes_latent.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/comfy_extras/nodes_latent.py b/comfy_extras/nodes_latent.py index f33ed1bee..247d886a1 100644 --- a/comfy_extras/nodes_latent.py +++ b/comfy_extras/nodes_latent.py @@ -105,6 +105,38 @@ class LatentInterpolate: samples_out["samples"] = st * (m1 * ratio + m2 * (1.0 - ratio)) return (samples_out,) +class LatentConcat: + @classmethod + def INPUT_TYPES(s): + return {"required": { "samples1": ("LATENT",), "samples2": ("LATENT",), "dim": (["x", "-x", "y", "-y", "t", "-t"], )}} + + RETURN_TYPES = ("LATENT",) + FUNCTION = "op" + + CATEGORY = "latent/advanced" + + def op(self, samples1, samples2, dim): + samples_out = samples1.copy() + + s1 = samples1["samples"] + s2 = samples2["samples"] + s2 = comfy.utils.repeat_to_batch_size(s2, s1.shape[0]) + + if "-" in dim: + c = (s2, s1) + else: + c = (s1, s2) + + if "x" in dim: + dim = -1 + elif "y" in dim: + dim = -2 + elif "t" in dim: + dim = -3 + + samples_out["samples"] = torch.cat(c, dim=dim) + return (samples_out,) + class LatentBatch: @classmethod def INPUT_TYPES(s): @@ -279,6 +311,7 @@ NODE_CLASS_MAPPINGS = { "LatentSubtract": LatentSubtract, "LatentMultiply": LatentMultiply, "LatentInterpolate": LatentInterpolate, + "LatentConcat": LatentConcat, "LatentBatch": LatentBatch, "LatentBatchSeedBehavior": LatentBatchSeedBehavior, "LatentApplyOperation": LatentApplyOperation, From 4aa79dbf2c5118853659fc7f7f8590594ab72417 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 27 Aug 2025 20:08:17 -0700 Subject: [PATCH 204/243] Adjust flux mem usage factor a bit. 
(#9588) --- comfy/supported_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/supported_models.py b/comfy/supported_models.py index ce571e6cb..76260de00 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -700,7 +700,7 @@ class Flux(supported_models_base.BASE): unet_extra_config = {} latent_format = latent_formats.Flux - memory_usage_factor = 2.8 + memory_usage_factor = 3.1 # TODO: debug why flux mem usage is so weird on windows. supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32] From 0eb821a7b6612af0fa3aaa8302739788a4bd629e Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Wed, 27 Aug 2025 23:09:06 -0400 Subject: [PATCH 205/243] ComfyUI 0.3.53 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 834c3e8c2..d6fdc47fe 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.52" +__version__ = "0.3.53" diff --git a/pyproject.toml b/pyproject.toml index f6e765a81..a71ad2bbf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.52" +version = "0.3.53" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From ce0052c087cb1e81ba01e8afbe362bec54eeb665 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 28 Aug 2025 07:37:42 -0700 Subject: [PATCH 206/243] Fix diffsynth controlnet regression. (#9597) --- comfy_extras/nodes_model_patch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_extras/nodes_model_patch.py b/comfy_extras/nodes_model_patch.py index 32c40ced3..65e766b52 100644 --- a/comfy_extras/nodes_model_patch.py +++ b/comfy_extras/nodes_model_patch.py @@ -108,7 +108,7 @@ class DiffSynthCnetPatch: img = kwargs.get("img") block_index = kwargs.get("block_index") spacial_compression = self.vae.spacial_compression_encode() - if self.encoded_image is None or self.encoded_image_size != (x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression): + if self.encoded_image is None or self.encoded_image_size != (x.shape[-2] * spacial_compression, x.shape[-1] * spacial_compression): image_scaled = comfy.utils.common_upscale(self.image.movedim(-1, 1), x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression, "area", "center") loaded_models = comfy.model_management.loaded_models(only_currently_used=True) self.encoded_image = self.model_patch.model.process_input_latent_image(self.encode_latent_cond(image_scaled.movedim(1, -1))) From 00636101771cb373354d6294cc6567deda2635f6 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Thu, 28 Aug 2025 10:44:57 -0400 Subject: [PATCH 207/243] ComfyUI version 0.3.54 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index d6fdc47fe..7034953fd 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. 
-__version__ = "0.3.53" +__version__ = "0.3.54" diff --git a/pyproject.toml b/pyproject.toml index a71ad2bbf..9f9ac1e21 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.53" +version = "0.3.54" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From edde0b50431e296f61f79205e25cb01f653013a2 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 28 Aug 2025 14:59:48 -0700 Subject: [PATCH 208/243] WanSoundImageToVideoExtend node to manually extend s2v video. (#9606) --- comfy_extras/nodes_wan.py | 145 +++++++++++++++++++++++++------------- 1 file changed, 97 insertions(+), 48 deletions(-) diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 312260f00..0a55bd5d0 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -877,6 +877,67 @@ def get_audio_embed_bucket_fps(audio_embed, fps=16, batch_frames=81, m=0, video_ return batch_audio_eb, min_batch_num +def wan_sound_to_video(positive, negative, vae, width, height, length, batch_size, frame_offset=0, ref_image=None, audio_encoder_output=None, control_video=None, ref_motion=None, ref_motion_latent=None): + latent_t = ((length - 1) // 4) + 1 + if audio_encoder_output is not None: + feat = torch.cat(audio_encoder_output["encoded_audio_all_layers"]) + video_rate = 30 + fps = 16 + feat = linear_interpolation(feat, input_fps=50, output_fps=video_rate) + batch_frames = latent_t * 4 + audio_embed_bucket, num_repeat = get_audio_embed_bucket_fps(feat, fps=fps, batch_frames=batch_frames, m=0, video_rate=video_rate) + audio_embed_bucket = audio_embed_bucket.unsqueeze(0) + if len(audio_embed_bucket.shape) == 3: + audio_embed_bucket = audio_embed_bucket.permute(0, 2, 1) + elif len(audio_embed_bucket.shape) == 4: + audio_embed_bucket = audio_embed_bucket.permute(0, 2, 3, 1) + + audio_embed_bucket = audio_embed_bucket[:, :, :, frame_offset:frame_offset + batch_frames] + positive = node_helpers.conditioning_set_values(positive, {"audio_embed": audio_embed_bucket}) + negative = node_helpers.conditioning_set_values(negative, {"audio_embed": audio_embed_bucket * 0.0}) + frame_offset += batch_frames + + if ref_image is not None: + ref_image = comfy.utils.common_upscale(ref_image[:1].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + ref_latent = vae.encode(ref_image[:, :, :, :3]) + positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [ref_latent]}, append=True) + negative = node_helpers.conditioning_set_values(negative, {"reference_latents": [ref_latent]}, append=True) + + if ref_motion is not None: + if ref_motion.shape[0] > 73: + ref_motion = ref_motion[-73:] + + ref_motion = comfy.utils.common_upscale(ref_motion.movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + + if ref_motion.shape[0] < 73: + r = torch.ones([73, height, width, 3]) * 0.5 + r[-ref_motion.shape[0]:] = ref_motion + ref_motion = r + + ref_motion_latent = vae.encode(ref_motion[:, :, :, :3]) + + if ref_motion_latent is not None: + ref_motion_latent = ref_motion_latent[:, :, -19:] + positive = node_helpers.conditioning_set_values(positive, {"reference_motion": ref_motion_latent}) + negative = node_helpers.conditioning_set_values(negative, {"reference_motion": ref_motion_latent}) + + latent = torch.zeros([batch_size, 16, latent_t, height // 8, width // 8], device=comfy.model_management.intermediate_device()) + + control_video_out = 
comfy.latent_formats.Wan21().process_out(torch.zeros_like(latent)) + if control_video is not None: + control_video = comfy.utils.common_upscale(control_video[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + control_video = vae.encode(control_video[:, :, :, :3]) + control_video_out[:, :, :control_video.shape[2]] = control_video + + # TODO: check if zero is better than none if none provided + positive = node_helpers.conditioning_set_values(positive, {"control_video": control_video_out}) + negative = node_helpers.conditioning_set_values(negative, {"control_video": control_video_out}) + + out_latent = {} + out_latent["samples"] = latent + return positive, negative, out_latent, frame_offset + + class WanSoundImageToVideo(io.ComfyNode): @classmethod def define_schema(cls): @@ -906,57 +967,44 @@ class WanSoundImageToVideo(io.ComfyNode): @classmethod def execute(cls, positive, negative, vae, width, height, length, batch_size, ref_image=None, audio_encoder_output=None, control_video=None, ref_motion=None) -> io.NodeOutput: - latent_t = ((length - 1) // 4) + 1 - if audio_encoder_output is not None: - feat = torch.cat(audio_encoder_output["encoded_audio_all_layers"]) - video_rate = 30 - fps = 16 - feat = linear_interpolation(feat, input_fps=50, output_fps=video_rate) - audio_embed_bucket, num_repeat = get_audio_embed_bucket_fps(feat, fps=fps, batch_frames=latent_t * 4, m=0, video_rate=video_rate) - audio_embed_bucket = audio_embed_bucket.unsqueeze(0) - if len(audio_embed_bucket.shape) == 3: - audio_embed_bucket = audio_embed_bucket.permute(0, 2, 1) - elif len(audio_embed_bucket.shape) == 4: - audio_embed_bucket = audio_embed_bucket.permute(0, 2, 3, 1) + positive, negative, out_latent, frame_offset = wan_sound_to_video(positive, negative, vae, width, height, length, batch_size, ref_image=ref_image, audio_encoder_output=audio_encoder_output, + control_video=control_video, ref_motion=ref_motion) + return io.NodeOutput(positive, negative, out_latent) - positive = node_helpers.conditioning_set_values(positive, {"audio_embed": audio_embed_bucket}) - negative = node_helpers.conditioning_set_values(negative, {"audio_embed": audio_embed_bucket * 0.0}) - if ref_image is not None: - ref_image = comfy.utils.common_upscale(ref_image[:1].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) - ref_latent = vae.encode(ref_image[:, :, :, :3]) - positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [ref_latent]}, append=True) - negative = node_helpers.conditioning_set_values(negative, {"reference_latents": [ref_latent]}, append=True) +class WanSoundImageToVideoExtend(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="WanSoundImageToVideoExtend", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Int.Input("length", default=77, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Latent.Input("video_latent"), + io.AudioEncoderOutput.Input("audio_encoder_output", optional=True), + io.Image.Input("ref_image", optional=True), + io.Image.Input("control_video", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + is_experimental=True, + ) - if ref_motion is not None: - if ref_motion.shape[0] > 73: - ref_motion = ref_motion[-73:] - - ref_motion = comfy.utils.common_upscale(ref_motion.movedim(-1, 
1), width, height, "bilinear", "center").movedim(1, -1) - - if ref_motion.shape[0] < 73: - r = torch.ones([73, height, width, 3]) * 0.5 - r[-ref_motion.shape[0]:] = ref_motion - ref_motion = r - - ref_motion = vae.encode(ref_motion[:, :, :, :3]) - positive = node_helpers.conditioning_set_values(positive, {"reference_motion": ref_motion}) - negative = node_helpers.conditioning_set_values(negative, {"reference_motion": ref_motion}) - - latent = torch.zeros([batch_size, 16, latent_t, height // 8, width // 8], device=comfy.model_management.intermediate_device()) - - control_video_out = comfy.latent_formats.Wan21().process_out(torch.zeros_like(latent)) - if control_video is not None: - control_video = comfy.utils.common_upscale(control_video[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) - control_video = vae.encode(control_video[:, :, :, :3]) - control_video_out[:, :, :control_video.shape[2]] = control_video - - # TODO: check if zero is better than none if none provided - positive = node_helpers.conditioning_set_values(positive, {"control_video": control_video_out}) - negative = node_helpers.conditioning_set_values(negative, {"control_video": control_video_out}) - - out_latent = {} - out_latent["samples"] = latent + @classmethod + def execute(cls, positive, negative, vae, length, video_latent, ref_image=None, audio_encoder_output=None, control_video=None) -> io.NodeOutput: + video_latent = video_latent["samples"] + width = video_latent.shape[-1] * 8 + height = video_latent.shape[-2] * 8 + batch_size = video_latent.shape[0] + frame_offset = video_latent.shape[-3] * 4 + positive, negative, out_latent, frame_offset = wan_sound_to_video(positive, negative, vae, width, height, length, batch_size, frame_offset=frame_offset, ref_image=ref_image, audio_encoder_output=audio_encoder_output, + control_video=control_video, ref_motion=None, ref_motion_latent=video_latent) return io.NodeOutput(positive, negative, out_latent) @@ -1019,6 +1067,7 @@ class WanExtension(ComfyExtension): WanCameraImageToVideo, WanPhantomSubjectToVideo, WanSoundImageToVideo, + WanSoundImageToVideoExtend, Wan22ImageToVideoLatent, ] From 1c184c29eb2a8f6fdd4e49f27347809090038e3f Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 28 Aug 2025 15:34:01 -0700 Subject: [PATCH 209/243] Fix issue with s2v node when extending past audio length. 
(#9608) --- comfy_extras/nodes_wan.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 0a55bd5d0..2cbc93ceb 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -893,9 +893,10 @@ def wan_sound_to_video(positive, negative, vae, width, height, length, batch_siz audio_embed_bucket = audio_embed_bucket.permute(0, 2, 3, 1) audio_embed_bucket = audio_embed_bucket[:, :, :, frame_offset:frame_offset + batch_frames] - positive = node_helpers.conditioning_set_values(positive, {"audio_embed": audio_embed_bucket}) - negative = node_helpers.conditioning_set_values(negative, {"audio_embed": audio_embed_bucket * 0.0}) - frame_offset += batch_frames + if audio_embed_bucket.shape[3] > 0: + positive = node_helpers.conditioning_set_values(positive, {"audio_embed": audio_embed_bucket}) + negative = node_helpers.conditioning_set_values(negative, {"audio_embed": audio_embed_bucket * 0.0}) + frame_offset += batch_frames if ref_image is not None: ref_image = comfy.utils.common_upscale(ref_image[:1].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) From d28b39d93dc498110e28ca32c8f39e6de631aa42 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 28 Aug 2025 16:38:28 -0700 Subject: [PATCH 210/243] Add a LatentCut node to cut latents. (#9609) --- comfy_extras/nodes_latent.py | 37 ++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/comfy_extras/nodes_latent.py b/comfy_extras/nodes_latent.py index 247d886a1..0f90cf60c 100644 --- a/comfy_extras/nodes_latent.py +++ b/comfy_extras/nodes_latent.py @@ -1,6 +1,7 @@ import comfy.utils import comfy_extras.nodes_post_processing import torch +import nodes def reshape_latent_to(target_shape, latent, repeat_batch=True): @@ -137,6 +138,41 @@ class LatentConcat: samples_out["samples"] = torch.cat(c, dim=dim) return (samples_out,) +class LatentCut: + @classmethod + def INPUT_TYPES(s): + return {"required": {"samples": ("LATENT",), + "dim": (["x", "y", "t"], ), + "index": ("INT", {"default": 0, "min": -nodes.MAX_RESOLUTION, "max": nodes.MAX_RESOLUTION, "step": 1}), + "amount": ("INT", {"default": 1, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 1})}} + + RETURN_TYPES = ("LATENT",) + FUNCTION = "op" + + CATEGORY = "latent/advanced" + + def op(self, samples, dim, index, amount): + samples_out = samples.copy() + + s1 = samples["samples"] + + if "x" in dim: + dim = s1.ndim - 1 + elif "y" in dim: + dim = s1.ndim - 2 + elif "t" in dim: + dim = s1.ndim - 3 + + if index >= 0: + index = min(index, s1.shape[dim] - 1) + amount = min(s1.shape[dim] - index, amount) + else: + index = max(index, -s1.shape[dim]) + amount = min(-index, amount) + + samples_out["samples"] = torch.narrow(s1, dim, index, amount) + return (samples_out,) + class LatentBatch: @classmethod def INPUT_TYPES(s): @@ -312,6 +348,7 @@ NODE_CLASS_MAPPINGS = { "LatentMultiply": LatentMultiply, "LatentInterpolate": LatentInterpolate, "LatentConcat": LatentConcat, + "LatentCut": LatentCut, "LatentBatch": LatentBatch, "LatentBatchSeedBehavior": LatentBatchSeedBehavior, "LatentApplyOperation": LatentApplyOperation, From e80a14ad5073d9eba175c2d2c768a5ca8e4c63ea Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 28 Aug 2025 19:13:07 -0700 Subject: [PATCH 211/243] Support wan2.2 5B fun control model. (#9611) Use the Wan22FunControlToVideo node. 
--- comfy/model_base.py | 15 ++++++--------- comfy_extras/nodes_wan.py | 19 ++++++++++++------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/comfy/model_base.py b/comfy/model_base.py index ce29fdc49..56a6798be 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1110,9 +1110,10 @@ class WAN21(BaseModel): shape_image[1] = extra_channels image = torch.zeros(shape_image, dtype=noise.dtype, layout=noise.layout, device=noise.device) else: + latent_dim = self.latent_format.latent_channels image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center") - for i in range(0, image.shape[1], 16): - image[:, i: i + 16] = self.process_latent_in(image[:, i: i + 16]) + for i in range(0, image.shape[1], latent_dim): + image[:, i: i + latent_dim] = self.process_latent_in(image[:, i: i + latent_dim]) image = utils.resize_to_batch_size(image, noise.shape[0]) if extra_channels != image.shape[1] + 4: @@ -1245,18 +1246,14 @@ class WAN22_S2V(WAN21): out['reference_motion'] = reference_motion.shape return out -class WAN22(BaseModel): +class WAN22(WAN21): def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=False, device=None): - super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.WanModel) + super(WAN21, self).__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.WanModel) self.image_to_video = image_to_video def extra_conds(self, **kwargs): out = super().extra_conds(**kwargs) - cross_attn = kwargs.get("cross_attn", None) - if cross_attn is not None: - out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) - - denoise_mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None)) + denoise_mask = kwargs.get("denoise_mask", None) if denoise_mask is not None: out["denoise_mask"] = comfy.conds.CONDRegular(denoise_mask) return out diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 2cbc93ceb..8c1d36613 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -139,16 +139,21 @@ class Wan22FunControlToVideo(io.ComfyNode): @classmethod def execute(cls, positive, negative, vae, width, height, length, batch_size, ref_image=None, start_image=None, control_video=None) -> io.NodeOutput: - latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) - concat_latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) - concat_latent = comfy.latent_formats.Wan21().process_out(concat_latent) + spacial_scale = vae.spacial_compression_encode() + latent_channels = vae.latent_channels + latent = torch.zeros([batch_size, latent_channels, ((length - 1) // 4) + 1, height // spacial_scale, width // spacial_scale], device=comfy.model_management.intermediate_device()) + concat_latent = torch.zeros([batch_size, latent_channels, ((length - 1) // 4) + 1, height // spacial_scale, width // spacial_scale], device=comfy.model_management.intermediate_device()) + if latent_channels == 48: + concat_latent = comfy.latent_formats.Wan22().process_out(concat_latent) + else: + concat_latent = comfy.latent_formats.Wan21().process_out(concat_latent) concat_latent = concat_latent.repeat(1, 2, 1, 1, 1) mask = torch.ones((1, 1, latent.shape[2] * 4, latent.shape[-2], latent.shape[-1])) if start_image is not None: start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, 
"bilinear", "center").movedim(1, -1) concat_latent_image = vae.encode(start_image[:, :, :, :3]) - concat_latent[:,16:,:concat_latent_image.shape[2]] = concat_latent_image[:,:,:concat_latent.shape[2]] + concat_latent[:,latent_channels:,:concat_latent_image.shape[2]] = concat_latent_image[:,:,:concat_latent.shape[2]] mask[:, :, :start_image.shape[0] + 3] = 0.0 ref_latent = None @@ -159,11 +164,11 @@ class Wan22FunControlToVideo(io.ComfyNode): if control_video is not None: control_video = comfy.utils.common_upscale(control_video[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) concat_latent_image = vae.encode(control_video[:, :, :, :3]) - concat_latent[:,:16,:concat_latent_image.shape[2]] = concat_latent_image[:,:,:concat_latent.shape[2]] + concat_latent[:,:latent_channels,:concat_latent_image.shape[2]] = concat_latent_image[:,:,:concat_latent.shape[2]] mask = mask.view(1, mask.shape[2] // 4, 4, mask.shape[3], mask.shape[4]).transpose(1, 2) - positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent, "concat_mask": mask, "concat_mask_index": 16}) - negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent, "concat_mask": mask, "concat_mask_index": 16}) + positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent, "concat_mask": mask, "concat_mask_index": latent_channels}) + negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent, "concat_mask": mask, "concat_mask_index": latent_channels}) if ref_latent is not None: positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [ref_latent]}, append=True) From c7bb3e2bceaad7accd52c23d22b97a1b6808304b Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 28 Aug 2025 19:46:57 -0700 Subject: [PATCH 212/243] Support the 5B fun inpaint model. (#9614) Use the WanFunInpaintToVideo node without the clip_vision_output. --- comfy_extras/nodes_wan.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 8c1d36613..4f73369f5 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -206,7 +206,8 @@ class WanFirstLastFrameToVideo(io.ComfyNode): @classmethod def execute(cls, positive, negative, vae, width, height, length, batch_size, start_image=None, end_image=None, clip_vision_start_image=None, clip_vision_end_image=None) -> io.NodeOutput: - latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) + spacial_scale = vae.spacial_compression_encode() + latent = torch.zeros([batch_size, vae.latent_channels, ((length - 1) // 4) + 1, height // spacial_scale, width // spacial_scale], device=comfy.model_management.intermediate_device()) if start_image is not None: start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) if end_image is not None: From 15aa9222c4d1fc74f5190d7c7e56ef986d0d7146 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 29 Aug 2025 01:12:00 -0700 Subject: [PATCH 213/243] Trim audio to video when saving video. 
(#9617) --- comfy_api/latest/_input_impl/video_types.py | 34 ++++++--------------- 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/comfy_api/latest/_input_impl/video_types.py b/comfy_api/latest/_input_impl/video_types.py index 28de9651d..f646504c8 100644 --- a/comfy_api/latest/_input_impl/video_types.py +++ b/comfy_api/latest/_input_impl/video_types.py @@ -8,6 +8,7 @@ import av import io import json import numpy as np +import math import torch from comfy_api.latest._util import VideoContainer, VideoCodec, VideoComponents @@ -282,8 +283,6 @@ class VideoFromComponents(VideoInput): if self.__components.audio: audio_sample_rate = int(self.__components.audio['sample_rate']) audio_stream = output.add_stream('aac', rate=audio_sample_rate) - audio_stream.sample_rate = audio_sample_rate - audio_stream.format = 'fltp' # Encode video for i, frame in enumerate(self.__components.images): @@ -298,27 +297,12 @@ class VideoFromComponents(VideoInput): output.mux(packet) if audio_stream and self.__components.audio: - # Encode audio - samples_per_frame = int(audio_sample_rate / frame_rate) - num_frames = self.__components.audio['waveform'].shape[2] // samples_per_frame - for i in range(num_frames): - start = i * samples_per_frame - end = start + samples_per_frame - # TODO(Feature) - Add support for stereo audio - chunk = ( - self.__components.audio["waveform"][0, 0, start:end] - .unsqueeze(0) - .contiguous() - .numpy() - ) - audio_frame = av.AudioFrame.from_ndarray(chunk, format='fltp', layout='mono') - audio_frame.sample_rate = audio_sample_rate - audio_frame.pts = i * samples_per_frame - for packet in audio_stream.encode(audio_frame): - output.mux(packet) - - # Flush audio - for packet in audio_stream.encode(None): - output.mux(packet) - + waveform = self.__components.audio['waveform'] + waveform = waveform[:, :, :math.ceil((audio_sample_rate / frame_rate) * self.__components.images.shape[0])] + frame = av.AudioFrame.from_ndarray(waveform.movedim(2, 1).reshape(1, -1).float().numpy(), format='flt', layout='mono' if waveform.shape[1] == 1 else 'stereo') + frame.sample_rate = audio_sample_rate + frame.pts = 0 + output.mux(audio_stream.encode(frame)) + # Flush encoder + output.mux(audio_stream.encode(None)) From 2efb2cbc38714074b0a48a9f4d70fa43f41499f4 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Fri, 29 Aug 2025 18:03:25 +0800 Subject: [PATCH 214/243] Update template to 0.1.70 (#9620) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 93d88859d..7f64aacca 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.25.11 -comfyui-workflow-templates==0.1.68 +comfyui-workflow-templates==0.1.70 comfyui-embedded-docs==0.2.6 torch torchsde From a86aaa430183068e2a264495c802c81d05eb350a Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Fri, 29 Aug 2025 05:33:29 -0400 Subject: [PATCH 215/243] ComfyUI v0.3.55 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 7034953fd..36777e285 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. 
-__version__ = "0.3.54" +__version__ = "0.3.55" diff --git a/pyproject.toml b/pyproject.toml index 9f9ac1e21..04514b4a8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.54" +version = "0.3.55" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From 885015eecf649d6e49e1ade68e4475b434517b82 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 29 Aug 2025 20:06:04 -0700 Subject: [PATCH 216/243] Lower ram usage on windows. (#9628) --- main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/main.py b/main.py index 9b2a33011..b23d50816 100644 --- a/main.py +++ b/main.py @@ -112,6 +112,7 @@ import gc if os.name == "nt": + os.environ['MIMALLOC_PURGE_DELAY'] = '0' logging.getLogger("xformers").addFilter(lambda record: 'A matching Triton is not available' not in record.getMessage()) if __name__ == "__main__": From 4449e147692366ac8b9bd3b8834c771bc81e91ac Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Sat, 30 Aug 2025 06:31:19 -0400 Subject: [PATCH 217/243] ComfyUI version 0.3.56 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 36777e285..e8e039373 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.55" +__version__ = "0.3.56" diff --git a/pyproject.toml b/pyproject.toml index 04514b4a8..cfd5d45ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.55" +version = "0.3.56" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From f949094b3cbc33779dbf8d3fd140028f8044d5c1 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sun, 31 Aug 2025 06:19:21 +0300 Subject: [PATCH 218/243] convert Stable Cascade nodes to V3 schema (#9373) --- comfy_extras/nodes_stable_cascade.py | 165 +++++++++++++++------------ 1 file changed, 93 insertions(+), 72 deletions(-) diff --git a/comfy_extras/nodes_stable_cascade.py b/comfy_extras/nodes_stable_cascade.py index 003403215..04c0b366a 100644 --- a/comfy_extras/nodes_stable_cascade.py +++ b/comfy_extras/nodes_stable_cascade.py @@ -17,55 +17,61 @@ """ import torch -import nodes +from typing_extensions import override + import comfy.utils +import nodes +from comfy_api.latest import ComfyExtension, io -class StableCascade_EmptyLatentImage: - def __init__(self, device="cpu"): - self.device = device +class StableCascade_EmptyLatentImage(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="StableCascade_EmptyLatentImage", + category="latent/stable_cascade", + inputs=[ + io.Int.Input("width", default=1024, min=256, max=nodes.MAX_RESOLUTION, step=8), + io.Int.Input("height", default=1024, min=256, max=nodes.MAX_RESOLUTION, step=8), + io.Int.Input("compression", default=42, min=4, max=128, step=1), + io.Int.Input("batch_size", default=1, min=1, max=4096), + ], + outputs=[ + io.Latent.Output(display_name="stage_c"), + io.Latent.Output(display_name="stage_b"), + ], + ) @classmethod - def INPUT_TYPES(s): - return {"required": { - "width": ("INT", {"default": 1024, "min": 256, "max": nodes.MAX_RESOLUTION, "step": 8}), - "height": ("INT", {"default": 1024, "min": 256, "max": nodes.MAX_RESOLUTION, "step": 8}), - "compression": 
("INT", {"default": 42, "min": 4, "max": 128, "step": 1}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}) - }} - RETURN_TYPES = ("LATENT", "LATENT") - RETURN_NAMES = ("stage_c", "stage_b") - FUNCTION = "generate" - - CATEGORY = "latent/stable_cascade" - - def generate(self, width, height, compression, batch_size=1): + def execute(cls, width, height, compression, batch_size=1): c_latent = torch.zeros([batch_size, 16, height // compression, width // compression]) b_latent = torch.zeros([batch_size, 4, height // 4, width // 4]) - return ({ + return io.NodeOutput({ "samples": c_latent, }, { "samples": b_latent, }) -class StableCascade_StageC_VAEEncode: - def __init__(self, device="cpu"): - self.device = device + +class StableCascade_StageC_VAEEncode(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="StableCascade_StageC_VAEEncode", + category="latent/stable_cascade", + inputs=[ + io.Image.Input("image"), + io.Vae.Input("vae"), + io.Int.Input("compression", default=42, min=4, max=128, step=1), + ], + outputs=[ + io.Latent.Output(display_name="stage_c"), + io.Latent.Output(display_name="stage_b"), + ], + ) @classmethod - def INPUT_TYPES(s): - return {"required": { - "image": ("IMAGE",), - "vae": ("VAE", ), - "compression": ("INT", {"default": 42, "min": 4, "max": 128, "step": 1}), - }} - RETURN_TYPES = ("LATENT", "LATENT") - RETURN_NAMES = ("stage_c", "stage_b") - FUNCTION = "generate" - - CATEGORY = "latent/stable_cascade" - - def generate(self, image, vae, compression): + def execute(cls, image, vae, compression): width = image.shape[-2] height = image.shape[-3] out_width = (width // compression) * vae.downscale_ratio @@ -75,51 +81,59 @@ class StableCascade_StageC_VAEEncode: c_latent = vae.encode(s[:,:,:,:3]) b_latent = torch.zeros([c_latent.shape[0], 4, (height // 8) * 2, (width // 8) * 2]) - return ({ + return io.NodeOutput({ "samples": c_latent, }, { "samples": b_latent, }) -class StableCascade_StageB_Conditioning: + +class StableCascade_StageB_Conditioning(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "conditioning": ("CONDITIONING",), - "stage_c": ("LATENT",), - }} - RETURN_TYPES = ("CONDITIONING",) + def define_schema(cls): + return io.Schema( + node_id="StableCascade_StageB_Conditioning", + category="conditioning/stable_cascade", + inputs=[ + io.Conditioning.Input("conditioning"), + io.Latent.Input("stage_c"), + ], + outputs=[ + io.Conditioning.Output(), + ], + ) - FUNCTION = "set_prior" - - CATEGORY = "conditioning/stable_cascade" - - def set_prior(self, conditioning, stage_c): + @classmethod + def execute(cls, conditioning, stage_c): c = [] for t in conditioning: d = t[1].copy() - d['stable_cascade_prior'] = stage_c['samples'] + d["stable_cascade_prior"] = stage_c["samples"] n = [t[0], d] c.append(n) - return (c, ) + return io.NodeOutput(c) -class StableCascade_SuperResolutionControlnet: - def __init__(self, device="cpu"): - self.device = device + +class StableCascade_SuperResolutionControlnet(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="StableCascade_SuperResolutionControlnet", + category="_for_testing/stable_cascade", + is_experimental=True, + inputs=[ + io.Image.Input("image"), + io.Vae.Input("vae"), + ], + outputs=[ + io.Image.Output(display_name="controlnet_input"), + io.Latent.Output(display_name="stage_c"), + io.Latent.Output(display_name="stage_b"), + ], + ) @classmethod - def INPUT_TYPES(s): - return {"required": { - "image": ("IMAGE",), - "vae": ("VAE", 
), - }} - RETURN_TYPES = ("IMAGE", "LATENT", "LATENT") - RETURN_NAMES = ("controlnet_input", "stage_c", "stage_b") - FUNCTION = "generate" - - EXPERIMENTAL = True - CATEGORY = "_for_testing/stable_cascade" - - def generate(self, image, vae): + def execute(cls, image, vae): width = image.shape[-2] height = image.shape[-3] batch_size = image.shape[0] @@ -127,15 +141,22 @@ class StableCascade_SuperResolutionControlnet: c_latent = torch.zeros([batch_size, 16, height // 16, width // 16]) b_latent = torch.zeros([batch_size, 4, height // 2, width // 2]) - return (controlnet_input, { + return io.NodeOutput(controlnet_input, { "samples": c_latent, }, { "samples": b_latent, }) -NODE_CLASS_MAPPINGS = { - "StableCascade_EmptyLatentImage": StableCascade_EmptyLatentImage, - "StableCascade_StageB_Conditioning": StableCascade_StageB_Conditioning, - "StableCascade_StageC_VAEEncode": StableCascade_StageC_VAEEncode, - "StableCascade_SuperResolutionControlnet": StableCascade_SuperResolutionControlnet, -} + +class StableCascadeExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + StableCascade_EmptyLatentImage, + StableCascade_StageB_Conditioning, + StableCascade_StageC_VAEEncode, + StableCascade_SuperResolutionControlnet, + ] + +async def comfy_entrypoint() -> StableCascadeExtension: + return StableCascadeExtension() From fea9ea8268d9fc0f4245f3fdc4a417ab802033e9 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sun, 31 Aug 2025 06:19:54 +0300 Subject: [PATCH 219/243] convert Video nodes to V3 schema (#9489) --- comfy_extras/nodes_video.py | 286 +++++++++++++++++------------------- 1 file changed, 132 insertions(+), 154 deletions(-) diff --git a/comfy_extras/nodes_video.py b/comfy_extras/nodes_video.py index 969f888b9..69fabb12e 100644 --- a/comfy_extras/nodes_video.py +++ b/comfy_extras/nodes_video.py @@ -5,52 +5,49 @@ import av import torch import folder_paths import json -from typing import Optional, Literal +from typing import Optional +from typing_extensions import override from fractions import Fraction -from comfy.comfy_types import IO, FileLocator, ComfyNodeABC -from comfy_api.latest import Input, InputImpl, Types +from comfy_api.input import AudioInput, ImageInput, VideoInput +from comfy_api.input_impl import VideoFromComponents, VideoFromFile +from comfy_api.util import VideoCodec, VideoComponents, VideoContainer +from comfy_api.latest import ComfyExtension, io, ui from comfy.cli_args import args -class SaveWEBM: - def __init__(self): - self.output_dir = folder_paths.get_output_directory() - self.type = "output" - self.prefix_append = "" +class SaveWEBM(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="SaveWEBM", + category="image/video", + is_experimental=True, + inputs=[ + io.Image.Input("images"), + io.String.Input("filename_prefix", default="ComfyUI"), + io.Combo.Input("codec", options=["vp9", "av1"]), + io.Float.Input("fps", default=24.0, min=0.01, max=1000.0, step=0.01), + io.Float.Input("crf", default=32.0, min=0, max=63.0, step=1, tooltip="Higher crf means lower quality with a smaller file size, lower crf means higher quality higher filesize."), + ], + outputs=[], + hidden=[io.Hidden.prompt, io.Hidden.extra_pnginfo], + is_output_node=True, + ) @classmethod - def INPUT_TYPES(s): - return {"required": - {"images": ("IMAGE", ), - "filename_prefix": ("STRING", {"default": "ComfyUI"}), - "codec": (["vp9", "av1"],), - "fps": ("FLOAT", {"default": 24.0, 
"min": 0.01, "max": 1000.0, "step": 0.01}), - "crf": ("FLOAT", {"default": 32.0, "min": 0, "max": 63.0, "step": 1, "tooltip": "Higher crf means lower quality with a smaller file size, lower crf means higher quality higher filesize."}), - }, - "hidden": {"prompt": "PROMPT", "extra_pnginfo": "EXTRA_PNGINFO"}, - } - - RETURN_TYPES = () - FUNCTION = "save_images" - - OUTPUT_NODE = True - - CATEGORY = "image/video" - - EXPERIMENTAL = True - - def save_images(self, images, codec, fps, filename_prefix, crf, prompt=None, extra_pnginfo=None): - filename_prefix += self.prefix_append - full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(filename_prefix, self.output_dir, images[0].shape[1], images[0].shape[0]) + def execute(cls, images, codec, fps, filename_prefix, crf) -> io.NodeOutput: + full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path( + filename_prefix, folder_paths.get_output_directory(), images[0].shape[1], images[0].shape[0] + ) file = f"{filename}_{counter:05}_.webm" container = av.open(os.path.join(full_output_folder, file), mode="w") - if prompt is not None: - container.metadata["prompt"] = json.dumps(prompt) + if cls.hidden.prompt is not None: + container.metadata["prompt"] = json.dumps(cls.hidden.prompt) - if extra_pnginfo is not None: - for x in extra_pnginfo: - container.metadata[x] = json.dumps(extra_pnginfo[x]) + if cls.hidden.extra_pnginfo is not None: + for x in cls.hidden.extra_pnginfo: + container.metadata[x] = json.dumps(cls.hidden.extra_pnginfo[x]) codec_map = {"vp9": "libvpx-vp9", "av1": "libsvtav1"} stream = container.add_stream(codec_map[codec], rate=Fraction(round(fps * 1000), 1000)) @@ -69,63 +66,46 @@ class SaveWEBM: container.mux(stream.encode()) container.close() - results: list[FileLocator] = [{ - "filename": file, - "subfolder": subfolder, - "type": self.type - }] + return io.NodeOutput(ui=ui.PreviewVideo([ui.SavedResult(file, subfolder, io.FolderType.output)])) - return {"ui": {"images": results, "animated": (True,)}} # TODO: frontend side - -class SaveVideo(ComfyNodeABC): - def __init__(self): - self.output_dir = folder_paths.get_output_directory() - self.type: Literal["output"] = "output" - self.prefix_append = "" +class SaveVideo(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="SaveVideo", + display_name="Save Video", + category="image/video", + description="Saves the input images to your ComfyUI output directory.", + inputs=[ + io.Video.Input("video", tooltip="The video to save."), + io.String.Input("filename_prefix", default="video/ComfyUI", tooltip="The prefix for the file to save. This may include formatting information such as %date:yyyy-MM-dd% or %Empty Latent Image.width% to include values from nodes."), + io.Combo.Input("format", options=VideoContainer.as_input(), default="auto", tooltip="The format to save the video as."), + io.Combo.Input("codec", options=VideoCodec.as_input(), default="auto", tooltip="The codec to use for the video."), + ], + outputs=[], + hidden=[io.Hidden.prompt, io.Hidden.extra_pnginfo], + is_output_node=True, + ) @classmethod - def INPUT_TYPES(cls): - return { - "required": { - "video": (IO.VIDEO, {"tooltip": "The video to save."}), - "filename_prefix": ("STRING", {"default": "video/ComfyUI", "tooltip": "The prefix for the file to save. 
This may include formatting information such as %date:yyyy-MM-dd% or %Empty Latent Image.width% to include values from nodes."}), - "format": (Types.VideoContainer.as_input(), {"default": "auto", "tooltip": "The format to save the video as."}), - "codec": (Types.VideoCodec.as_input(), {"default": "auto", "tooltip": "The codec to use for the video."}), - }, - "hidden": { - "prompt": "PROMPT", - "extra_pnginfo": "EXTRA_PNGINFO" - }, - } - - RETURN_TYPES = () - FUNCTION = "save_video" - - OUTPUT_NODE = True - - CATEGORY = "image/video" - DESCRIPTION = "Saves the input images to your ComfyUI output directory." - - def save_video(self, video: Input.Video, filename_prefix, format, codec, prompt=None, extra_pnginfo=None): - filename_prefix += self.prefix_append + def execute(cls, video: VideoInput, filename_prefix, format, codec) -> io.NodeOutput: width, height = video.get_dimensions() full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path( filename_prefix, - self.output_dir, + folder_paths.get_output_directory(), width, height ) - results: list[FileLocator] = list() saved_metadata = None if not args.disable_metadata: metadata = {} - if extra_pnginfo is not None: - metadata.update(extra_pnginfo) - if prompt is not None: - metadata["prompt"] = prompt + if cls.hidden.extra_pnginfo is not None: + metadata.update(cls.hidden.extra_pnginfo) + if cls.hidden.prompt is not None: + metadata["prompt"] = cls.hidden.prompt if len(metadata) > 0: saved_metadata = metadata - file = f"{filename}_{counter:05}_.{Types.VideoContainer.get_extension(format)}" + file = f"{filename}_{counter:05}_.{VideoContainer.get_extension(format)}" video.save_to( os.path.join(full_output_folder, file), format=format, @@ -133,83 +113,82 @@ class SaveVideo(ComfyNodeABC): metadata=saved_metadata ) - results.append({ - "filename": file, - "subfolder": subfolder, - "type": self.type - }) - counter += 1 + return io.NodeOutput(ui=ui.PreviewVideo([ui.SavedResult(file, subfolder, io.FolderType.output)])) - return { "ui": { "images": results, "animated": (True,) } } -class CreateVideo(ComfyNodeABC): +class CreateVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(cls): - return { - "required": { - "images": (IO.IMAGE, {"tooltip": "The images to create a video from."}), - "fps": ("FLOAT", {"default": 30.0, "min": 1.0, "max": 120.0, "step": 1.0}), - }, - "optional": { - "audio": (IO.AUDIO, {"tooltip": "The audio to add to the video."}), - } - } + def define_schema(cls): + return io.Schema( + node_id="CreateVideo", + display_name="Create Video", + category="image/video", + description="Create a video from images.", + inputs=[ + io.Image.Input("images", tooltip="The images to create a video from."), + io.Float.Input("fps", default=30.0, min=1.0, max=120.0, step=1.0), + io.Audio.Input("audio", optional=True, tooltip="The audio to add to the video."), + ], + outputs=[ + io.Video.Output(), + ], + ) - RETURN_TYPES = (IO.VIDEO,) - FUNCTION = "create_video" - - CATEGORY = "image/video" - DESCRIPTION = "Create a video from images." 
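# A minimal usage sketch (not part of this patch) of the component API the video nodes
# above are built on: an IMAGE batch and optional AUDIO are wrapped in VideoFromComponents
# and can be unpacked again with get_components(), as CreateVideo/GetVideoComponents do.
# The frame tensor shape below is an assumption made for illustration only.
from fractions import Fraction
import torch
from comfy_api.input_impl import VideoFromComponents
from comfy_api.util import VideoComponents

frames = torch.rand(24, 256, 256, 3)  # assumed [frames, height, width, channels] layout, values in [0, 1]
video = VideoFromComponents(VideoComponents(images=frames, audio=None, frame_rate=Fraction(30)))
components = video.get_components()   # round-trips back to images, audio and frame_rate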
- - def create_video(self, images: Input.Image, fps: float, audio: Optional[Input.Audio] = None): - return (InputImpl.VideoFromComponents( - Types.VideoComponents( - images=images, - audio=audio, - frame_rate=Fraction(fps), - ) - ),) - -class GetVideoComponents(ComfyNodeABC): @classmethod - def INPUT_TYPES(cls): - return { - "required": { - "video": (IO.VIDEO, {"tooltip": "The video to extract components from."}), - } - } - RETURN_TYPES = (IO.IMAGE, IO.AUDIO, IO.FLOAT) - RETURN_NAMES = ("images", "audio", "fps") - FUNCTION = "get_components" + def execute(cls, images: ImageInput, fps: float, audio: Optional[AudioInput] = None) -> io.NodeOutput: + return io.NodeOutput( + VideoFromComponents(VideoComponents(images=images, audio=audio, frame_rate=Fraction(fps))) + ) - CATEGORY = "image/video" - DESCRIPTION = "Extracts all components from a video: frames, audio, and framerate." +class GetVideoComponents(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="GetVideoComponents", + display_name="Get Video Components", + category="image/video", + description="Extracts all components from a video: frames, audio, and framerate.", + inputs=[ + io.Video.Input("video", tooltip="The video to extract components from."), + ], + outputs=[ + io.Image.Output(display_name="images"), + io.Audio.Output(display_name="audio"), + io.Float.Output(display_name="fps"), + ], + ) - def get_components(self, video: Input.Video): + @classmethod + def execute(cls, video: VideoInput) -> io.NodeOutput: components = video.get_components() - return (components.images, components.audio, float(components.frame_rate)) + return io.NodeOutput(components.images, components.audio, float(components.frame_rate)) -class LoadVideo(ComfyNodeABC): +class LoadVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(cls): + def define_schema(cls): input_dir = folder_paths.get_input_directory() files = [f for f in os.listdir(input_dir) if os.path.isfile(os.path.join(input_dir, f))] files = folder_paths.filter_files_content_types(files, ["video"]) - return {"required": - {"file": (sorted(files), {"video_upload": True})}, - } - - CATEGORY = "image/video" - - RETURN_TYPES = (IO.VIDEO,) - FUNCTION = "load_video" - def load_video(self, file): - video_path = folder_paths.get_annotated_filepath(file) - return (InputImpl.VideoFromFile(video_path),) + return io.Schema( + node_id="LoadVideo", + display_name="Load Video", + category="image/video", + inputs=[ + io.Combo.Input("file", options=sorted(files), upload=io.UploadType.video), + ], + outputs=[ + io.Video.Output(), + ], + ) @classmethod - def IS_CHANGED(cls, file): + def execute(cls, file) -> io.NodeOutput: + video_path = folder_paths.get_annotated_filepath(file) + return io.NodeOutput(VideoFromFile(video_path)) + + @classmethod + def fingerprint_inputs(s, file): video_path = folder_paths.get_annotated_filepath(file) mod_time = os.path.getmtime(video_path) # Instead of hashing the file, we can just use the modification time to avoid @@ -217,24 +196,23 @@ class LoadVideo(ComfyNodeABC): return mod_time @classmethod - def VALIDATE_INPUTS(cls, file): + def validate_inputs(s, file): if not folder_paths.exists_annotated_filepath(file): return "Invalid video file: {}".format(file) return True -NODE_CLASS_MAPPINGS = { - "SaveWEBM": SaveWEBM, - "SaveVideo": SaveVideo, - "CreateVideo": CreateVideo, - "GetVideoComponents": GetVideoComponents, - "LoadVideo": LoadVideo, -} -NODE_DISPLAY_NAME_MAPPINGS = { - "SaveVideo": "Save Video", - "CreateVideo": "Create Video", - 
"GetVideoComponents": "Get Video Components", - "LoadVideo": "Load Video", -} +class VideoExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + SaveWEBM, + SaveVideo, + CreateVideo, + GetVideoComponents, + LoadVideo, + ] +async def comfy_entrypoint() -> VideoExtension: + return VideoExtension() From d2c502e629ba948029abc13ef1b456b9f4bbbdaa Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sun, 31 Aug 2025 06:20:17 +0300 Subject: [PATCH 220/243] convert nodes_stability.py to V3 schema (#9497) --- comfy_api_nodes/nodes_stability.py | 678 ++++++++++++++++------------- 1 file changed, 365 insertions(+), 313 deletions(-) diff --git a/comfy_api_nodes/nodes_stability.py b/comfy_api_nodes/nodes_stability.py index 31309d831..e05cb6bb2 100644 --- a/comfy_api_nodes/nodes_stability.py +++ b/comfy_api_nodes/nodes_stability.py @@ -1,5 +1,8 @@ from inspect import cleandoc -from comfy.comfy_types.node_typing import IO +from typing import Optional +from typing_extensions import override + +from comfy_api.latest import ComfyExtension, io as comfy_io from comfy_api_nodes.apis.stability_api import ( StabilityUpscaleConservativeRequest, StabilityUpscaleCreativeRequest, @@ -46,87 +49,94 @@ def get_async_dummy_status(x: StabilityResultsGetResponse): return StabilityPollStatus.in_progress -class StabilityStableImageUltraNode: +class StabilityStableImageUltraNode(comfy_io.ComfyNode): """ Generates images synchronously based on prompt and resolution. """ - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Stability AI" - @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "What you wish to see in the output image. A strong, descriptive prompt that clearly defines" + - "What you wish to see in the output image. A strong, descriptive prompt that clearly defines" + + def define_schema(cls): + return comfy_io.Schema( + node_id="StabilityStableImageUltraNode", + display_name="Stability AI Stable Image Ultra", + category="api node/image/Stability AI", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="What you wish to see in the output image. A strong, descriptive prompt that clearly defines" + "elements, colors, and subjects will lead to better results. " + "To control the weight of a given word use the format `(word:weight)`," + "where `word` is the word you'd like to control the weight of and `weight`" + "is a value between 0 and 1. For example: `The sky was a crisp (blue:0.3) and (green:0.8)`" + - "would convey a sky that was blue and green, but more green than blue." 
- }, + "would convey a sky that was blue and green, but more green than blue.", ), - "aspect_ratio": ([x.value for x in StabilityAspectRatio], - { - "default": StabilityAspectRatio.ratio_1_1, - "tooltip": "Aspect ratio of generated image.", - }, + comfy_io.Combo.Input( + "aspect_ratio", + options=[x.value for x in StabilityAspectRatio], + default=StabilityAspectRatio.ratio_1_1.value, + tooltip="Aspect ratio of generated image.", ), - "style_preset": (get_stability_style_presets(), - { - "tooltip": "Optional desired style of generated image.", - }, + comfy_io.Combo.Input( + "style_preset", + options=get_stability_style_presets(), + tooltip="Optional desired style of generated image.", ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 4294967294, - "control_after_generate": True, - "tooltip": "The random seed used for creating the noise.", - }, + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=4294967294, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="The random seed used for creating the noise.", ), - }, - "optional": { - "image": (IO.IMAGE,), - "negative_prompt": ( - IO.STRING, - { - "default": "", - "forceInput": True, - "tooltip": "A blurb of text describing what you do not wish to see in the output image. This is an advanced feature." - }, + comfy_io.Image.Input( + "image", + optional=True, ), - "image_denoise": ( - IO.FLOAT, - { - "default": 0.5, - "min": 0.0, - "max": 1.0, - "step": 0.01, - "tooltip": "Denoise of input image; 0.0 yields image identical to input, 1.0 is as if no image was provided at all.", - }, + comfy_io.String.Input( + "negative_prompt", + default="", + tooltip="A blurb of text describing what you do not wish to see in the output image. This is an advanced feature.", + force_input=True, + optional=True, ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, - } + comfy_io.Float.Input( + "image_denoise", + default=0.5, + min=0.0, + max=1.0, + step=0.01, + tooltip="Denoise of input image; 0.0 yields image identical to input, 1.0 is as if no image was provided at all.", + optional=True, + ), + ], + outputs=[ + comfy_io.Image.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) - async def api_call(self, prompt: str, aspect_ratio: str, style_preset: str, seed: int, - negative_prompt: str=None, image: torch.Tensor = None, image_denoise: float=None, - **kwargs): + @classmethod + async def execute( + cls, + prompt: str, + aspect_ratio: str, + style_preset: str, + seed: int, + image: Optional[torch.Tensor] = None, + negative_prompt: str = "", + image_denoise: Optional[float] = 0.5, + ) -> comfy_io.NodeOutput: validate_string(prompt, strip_whitespace=False) # prepare image binary if image present image_binary = None @@ -144,6 +154,11 @@ class StabilityStableImageUltraNode: "image": image_binary } + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + operation = SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/stability/v2beta/stable-image/generate/ultra", @@ -161,7 +176,7 @@ class StabilityStableImageUltraNode: ), files=files, content_type="multipart/form-data", - auth_kwargs=kwargs, + auth_kwargs=auth, ) response_api = await operation.execute() @@ -171,95 +186,106 @@ class StabilityStableImageUltraNode: image_data = base64.b64decode(response_api.image) returned_image = 
bytesio_to_image_tensor(BytesIO(image_data)) - return (returned_image,) + return comfy_io.NodeOutput(returned_image) -class StabilityStableImageSD_3_5Node: +class StabilityStableImageSD_3_5Node(comfy_io.ComfyNode): """ Generates images synchronously based on prompt and resolution. """ - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Stability AI" + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="StabilityStableImageSD_3_5Node", + display_name="Stability AI Stable Diffusion 3.5 Image", + category="api node/image/Stability AI", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="What you wish to see in the output image. A strong, descriptive prompt that clearly defines elements, colors, and subjects will lead to better results.", + ), + comfy_io.Combo.Input( + "model", + options=[x.value for x in Stability_SD3_5_Model], + ), + comfy_io.Combo.Input( + "aspect_ratio", + options=[x.value for x in StabilityAspectRatio], + default=StabilityAspectRatio.ratio_1_1.value, + tooltip="Aspect ratio of generated image.", + ), + comfy_io.Combo.Input( + "style_preset", + options=get_stability_style_presets(), + tooltip="Optional desired style of generated image.", + ), + comfy_io.Float.Input( + "cfg_scale", + default=4.0, + min=1.0, + max=10.0, + step=0.1, + tooltip="How strictly the diffusion process adheres to the prompt text (higher values keep your image closer to your prompt)", + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=4294967294, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="The random seed used for creating the noise.", + ), + comfy_io.Image.Input( + "image", + optional=True, + ), + comfy_io.String.Input( + "negative_prompt", + default="", + tooltip="Keywords of what you do not wish to see in the output image. This is an advanced feature.", + force_input=True, + optional=True, + ), + comfy_io.Float.Input( + "image_denoise", + default=0.5, + min=0.0, + max=1.0, + step=0.01, + tooltip="Denoise of input image; 0.0 yields image identical to input, 1.0 is as if no image was provided at all.", + optional=True, + ), + ], + outputs=[ + comfy_io.Image.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "What you wish to see in the output image. A strong, descriptive prompt that clearly defines elements, colors, and subjects will lead to better results." 
- }, - ), - "model": ([x.value for x in Stability_SD3_5_Model],), - "aspect_ratio": ([x.value for x in StabilityAspectRatio], - { - "default": StabilityAspectRatio.ratio_1_1, - "tooltip": "Aspect ratio of generated image.", - }, - ), - "style_preset": (get_stability_style_presets(), - { - "tooltip": "Optional desired style of generated image.", - }, - ), - "cfg_scale": ( - IO.FLOAT, - { - "default": 4.0, - "min": 1.0, - "max": 10.0, - "step": 0.1, - "tooltip": "How strictly the diffusion process adheres to the prompt text (higher values keep your image closer to your prompt)", - }, - ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 4294967294, - "control_after_generate": True, - "tooltip": "The random seed used for creating the noise.", - }, - ), - }, - "optional": { - "image": (IO.IMAGE,), - "negative_prompt": ( - IO.STRING, - { - "default": "", - "forceInput": True, - "tooltip": "Keywords of what you do not wish to see in the output image. This is an advanced feature." - }, - ), - "image_denoise": ( - IO.FLOAT, - { - "default": 0.5, - "min": 0.0, - "max": 1.0, - "step": 0.01, - "tooltip": "Denoise of input image; 0.0 yields image identical to input, 1.0 is as if no image was provided at all.", - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, - } - - async def api_call(self, model: str, prompt: str, aspect_ratio: str, style_preset: str, seed: int, cfg_scale: float, - negative_prompt: str=None, image: torch.Tensor = None, image_denoise: float=None, - **kwargs): + async def execute( + cls, + model: str, + prompt: str, + aspect_ratio: str, + style_preset: str, + seed: int, + cfg_scale: float, + image: Optional[torch.Tensor] = None, + negative_prompt: str = "", + image_denoise: Optional[float] = 0.5, + ) -> comfy_io.NodeOutput: validate_string(prompt, strip_whitespace=False) # prepare image binary if image present image_binary = None @@ -280,6 +306,11 @@ class StabilityStableImageSD_3_5Node: "image": image_binary } + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + operation = SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/stability/v2beta/stable-image/generate/sd3", @@ -300,7 +331,7 @@ class StabilityStableImageSD_3_5Node: ), files=files, content_type="multipart/form-data", - auth_kwargs=kwargs, + auth_kwargs=auth, ) response_api = await operation.execute() @@ -310,72 +341,75 @@ class StabilityStableImageSD_3_5Node: image_data = base64.b64decode(response_api.image) returned_image = bytesio_to_image_tensor(BytesIO(image_data)) - return (returned_image,) + return comfy_io.NodeOutput(returned_image) -class StabilityUpscaleConservativeNode: +class StabilityUpscaleConservativeNode(comfy_io.ComfyNode): """ Upscale image with minimal alterations to 4K resolution. """ - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Stability AI" + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="StabilityUpscaleConservativeNode", + display_name="Stability AI Upscale Conservative", + category="api node/image/Stability AI", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.Image.Input("image"), + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="What you wish to see in the output image. 
A strong, descriptive prompt that clearly defines elements, colors, and subjects will lead to better results.", + ), + comfy_io.Float.Input( + "creativity", + default=0.35, + min=0.2, + max=0.5, + step=0.01, + tooltip="Controls the likelihood of creating additional details not heavily conditioned by the init image.", + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=4294967294, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="The random seed used for creating the noise.", + ), + comfy_io.String.Input( + "negative_prompt", + default="", + tooltip="Keywords of what you do not wish to see in the output image. This is an advanced feature.", + force_input=True, + optional=True, + ), + ], + outputs=[ + comfy_io.Image.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": (IO.IMAGE,), - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "What you wish to see in the output image. A strong, descriptive prompt that clearly defines elements, colors, and subjects will lead to better results." - }, - ), - "creativity": ( - IO.FLOAT, - { - "default": 0.35, - "min": 0.2, - "max": 0.5, - "step": 0.01, - "tooltip": "Controls the likelihood of creating additional details not heavily conditioned by the init image.", - }, - ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 4294967294, - "control_after_generate": True, - "tooltip": "The random seed used for creating the noise.", - }, - ), - }, - "optional": { - "negative_prompt": ( - IO.STRING, - { - "default": "", - "forceInput": True, - "tooltip": "Keywords of what you do not wish to see in the output image. This is an advanced feature." - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, - } - - async def api_call(self, image: torch.Tensor, prompt: str, creativity: float, seed: int, negative_prompt: str=None, - **kwargs): + async def execute( + cls, + image: torch.Tensor, + prompt: str, + creativity: float, + seed: int, + negative_prompt: str = "", + ) -> comfy_io.NodeOutput: validate_string(prompt, strip_whitespace=False) image_binary = tensor_to_bytesio(image, total_pixels=1024*1024).read() @@ -386,6 +420,11 @@ class StabilityUpscaleConservativeNode: "image": image_binary } + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + operation = SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/stability/v2beta/stable-image/upscale/conservative", @@ -401,7 +440,7 @@ class StabilityUpscaleConservativeNode: ), files=files, content_type="multipart/form-data", - auth_kwargs=kwargs, + auth_kwargs=auth, ) response_api = await operation.execute() @@ -411,77 +450,81 @@ class StabilityUpscaleConservativeNode: image_data = base64.b64decode(response_api.image) returned_image = bytesio_to_image_tensor(BytesIO(image_data)) - return (returned_image,) + return comfy_io.NodeOutput(returned_image) -class StabilityUpscaleCreativeNode: +class StabilityUpscaleCreativeNode(comfy_io.ComfyNode): """ Upscale image with minimal alterations to 4K resolution. 
""" - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Stability AI" + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="StabilityUpscaleCreativeNode", + display_name="Stability AI Upscale Creative", + category="api node/image/Stability AI", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.Image.Input("image"), + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="What you wish to see in the output image. A strong, descriptive prompt that clearly defines elements, colors, and subjects will lead to better results.", + ), + comfy_io.Float.Input( + "creativity", + default=0.3, + min=0.1, + max=0.5, + step=0.01, + tooltip="Controls the likelihood of creating additional details not heavily conditioned by the init image.", + ), + comfy_io.Combo.Input( + "style_preset", + options=get_stability_style_presets(), + tooltip="Optional desired style of generated image.", + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=4294967294, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="The random seed used for creating the noise.", + ), + comfy_io.String.Input( + "negative_prompt", + default="", + tooltip="Keywords of what you do not wish to see in the output image. This is an advanced feature.", + force_input=True, + optional=True, + ), + ], + outputs=[ + comfy_io.Image.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": (IO.IMAGE,), - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "What you wish to see in the output image. A strong, descriptive prompt that clearly defines elements, colors, and subjects will lead to better results." - }, - ), - "creativity": ( - IO.FLOAT, - { - "default": 0.3, - "min": 0.1, - "max": 0.5, - "step": 0.01, - "tooltip": "Controls the likelihood of creating additional details not heavily conditioned by the init image.", - }, - ), - "style_preset": (get_stability_style_presets(), - { - "tooltip": "Optional desired style of generated image.", - }, - ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 4294967294, - "control_after_generate": True, - "tooltip": "The random seed used for creating the noise.", - }, - ), - }, - "optional": { - "negative_prompt": ( - IO.STRING, - { - "default": "", - "forceInput": True, - "tooltip": "Keywords of what you do not wish to see in the output image. This is an advanced feature." 
- }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, - } - - async def api_call(self, image: torch.Tensor, prompt: str, creativity: float, style_preset: str, seed: int, negative_prompt: str=None, - **kwargs): + async def execute( + cls, + image: torch.Tensor, + prompt: str, + creativity: float, + style_preset: str, + seed: int, + negative_prompt: str = "", + ) -> comfy_io.NodeOutput: validate_string(prompt, strip_whitespace=False) image_binary = tensor_to_bytesio(image, total_pixels=1024*1024).read() @@ -494,6 +537,11 @@ class StabilityUpscaleCreativeNode: "image": image_binary } + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + operation = SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/stability/v2beta/stable-image/upscale/creative", @@ -510,7 +558,7 @@ class StabilityUpscaleCreativeNode: ), files=files, content_type="multipart/form-data", - auth_kwargs=kwargs, + auth_kwargs=auth, ) response_api = await operation.execute() @@ -525,7 +573,8 @@ class StabilityUpscaleCreativeNode: completed_statuses=[StabilityPollStatus.finished], failed_statuses=[StabilityPollStatus.failed], status_extractor=lambda x: get_async_dummy_status(x), - auth_kwargs=kwargs, + auth_kwargs=auth, + node_id=cls.hidden.unique_id, ) response_poll: StabilityResultsGetResponse = await operation.execute() @@ -535,41 +584,48 @@ class StabilityUpscaleCreativeNode: image_data = base64.b64decode(response_poll.result) returned_image = bytesio_to_image_tensor(BytesIO(image_data)) - return (returned_image,) + return comfy_io.NodeOutput(returned_image) -class StabilityUpscaleFastNode: +class StabilityUpscaleFastNode(comfy_io.ComfyNode): """ Quickly upscales an image via Stability API call to 4x its original size; intended for upscaling low-quality/compressed images. 
""" - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Stability AI" + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="StabilityUpscaleFastNode", + display_name="Stability AI Upscale Fast", + category="api node/image/Stability AI", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.Image.Input("image"), + ], + outputs=[ + comfy_io.Image.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": (IO.IMAGE,), - }, - "optional": { - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, - } - - async def api_call(self, image: torch.Tensor, **kwargs): + async def execute(cls, image: torch.Tensor) -> comfy_io.NodeOutput: image_binary = tensor_to_bytesio(image, total_pixels=4096*4096).read() files = { "image": image_binary } + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + operation = SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/stability/v2beta/stable-image/upscale/fast", @@ -580,7 +636,7 @@ class StabilityUpscaleFastNode: request=EmptyRequest(), files=files, content_type="multipart/form-data", - auth_kwargs=kwargs, + auth_kwargs=auth, ) response_api = await operation.execute() @@ -590,24 +646,20 @@ class StabilityUpscaleFastNode: image_data = base64.b64decode(response_api.image) returned_image = bytesio_to_image_tensor(BytesIO(image_data)) - return (returned_image,) + return comfy_io.NodeOutput(returned_image) -# A dictionary that contains all nodes you want to export with their names -# NOTE: names should be globally unique -NODE_CLASS_MAPPINGS = { - "StabilityStableImageUltraNode": StabilityStableImageUltraNode, - "StabilityStableImageSD_3_5Node": StabilityStableImageSD_3_5Node, - "StabilityUpscaleConservativeNode": StabilityUpscaleConservativeNode, - "StabilityUpscaleCreativeNode": StabilityUpscaleCreativeNode, - "StabilityUpscaleFastNode": StabilityUpscaleFastNode, -} +class StabilityExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + return [ + StabilityStableImageUltraNode, + StabilityStableImageSD_3_5Node, + StabilityUpscaleConservativeNode, + StabilityUpscaleCreativeNode, + StabilityUpscaleFastNode, + ] -# A dictionary that contains the friendly/humanly readable titles for the nodes -NODE_DISPLAY_NAME_MAPPINGS = { - "StabilityStableImageUltraNode": "Stability AI Stable Image Ultra", - "StabilityStableImageSD_3_5Node": "Stability AI Stable Diffusion 3.5 Image", - "StabilityUpscaleConservativeNode": "Stability AI Upscale Conservative", - "StabilityUpscaleCreativeNode": "Stability AI Upscale Creative", - "StabilityUpscaleFastNode": "Stability AI Upscale Fast", -} + +async def comfy_entrypoint() -> StabilityExtension: + return StabilityExtension() From fe442fac2eccd0cc66999b48d3c518623cafe4fc Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sun, 31 Aug 2025 06:21:58 +0300 Subject: [PATCH 221/243] convert Primitive nodes to V3 schema (#9372) --- comfy_extras/nodes_primitive.py | 169 +++++++++++++++++--------------- 1 file changed, 90 insertions(+), 79 deletions(-) diff --git a/comfy_extras/nodes_primitive.py 
b/comfy_extras/nodes_primitive.py index 1f93f87a7..5a1aeba80 100644 --- a/comfy_extras/nodes_primitive.py +++ b/comfy_extras/nodes_primitive.py @@ -1,98 +1,109 @@ -# Primitive nodes that are evaluated at backend. -from __future__ import annotations - import sys +from typing_extensions import override -from comfy.comfy_types.node_typing import ComfyNodeABC, InputTypeDict, IO +from comfy_api.latest import ComfyExtension, io -class String(ComfyNodeABC): +class String(io.ComfyNode): @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": {"value": (IO.STRING, {})}, - } + def define_schema(cls): + return io.Schema( + node_id="PrimitiveString", + display_name="String", + category="utils/primitive", + inputs=[ + io.String.Input("value"), + ], + outputs=[io.String.Output()], + ) - RETURN_TYPES = (IO.STRING,) - FUNCTION = "execute" - CATEGORY = "utils/primitive" - - def execute(self, value: str) -> tuple[str]: - return (value,) - - -class StringMultiline(ComfyNodeABC): @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": {"value": (IO.STRING, {"multiline": True,},)}, - } - - RETURN_TYPES = (IO.STRING,) - FUNCTION = "execute" - CATEGORY = "utils/primitive" - - def execute(self, value: str) -> tuple[str]: - return (value,) + def execute(cls, value: str) -> io.NodeOutput: + return io.NodeOutput(value) -class Int(ComfyNodeABC): +class StringMultiline(io.ComfyNode): @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": {"value": (IO.INT, {"min": -sys.maxsize, "max": sys.maxsize, "control_after_generate": True})}, - } + def define_schema(cls): + return io.Schema( + node_id="PrimitiveStringMultiline", + display_name="String (Multiline)", + category="utils/primitive", + inputs=[ + io.String.Input("value", multiline=True), + ], + outputs=[io.String.Output()], + ) - RETURN_TYPES = (IO.INT,) - FUNCTION = "execute" - CATEGORY = "utils/primitive" - - def execute(self, value: int) -> tuple[int]: - return (value,) - - -class Float(ComfyNodeABC): @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": {"value": (IO.FLOAT, {"min": -sys.maxsize, "max": sys.maxsize})}, - } - - RETURN_TYPES = (IO.FLOAT,) - FUNCTION = "execute" - CATEGORY = "utils/primitive" - - def execute(self, value: float) -> tuple[float]: - return (value,) + def execute(cls, value: str) -> io.NodeOutput: + return io.NodeOutput(value) -class Boolean(ComfyNodeABC): +class Int(io.ComfyNode): @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": {"value": (IO.BOOLEAN, {})}, - } + def define_schema(cls): + return io.Schema( + node_id="PrimitiveInt", + display_name="Int", + category="utils/primitive", + inputs=[ + io.Int.Input("value", min=-sys.maxsize, max=sys.maxsize, control_after_generate=True), + ], + outputs=[io.Int.Output()], + ) - RETURN_TYPES = (IO.BOOLEAN,) - FUNCTION = "execute" - CATEGORY = "utils/primitive" - - def execute(self, value: bool) -> tuple[bool]: - return (value,) + @classmethod + def execute(cls, value: int) -> io.NodeOutput: + return io.NodeOutput(value) -NODE_CLASS_MAPPINGS = { - "PrimitiveString": String, - "PrimitiveStringMultiline": StringMultiline, - "PrimitiveInt": Int, - "PrimitiveFloat": Float, - "PrimitiveBoolean": Boolean, -} +class Float(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="PrimitiveFloat", + display_name="Float", + category="utils/primitive", + inputs=[ + io.Float.Input("value", min=-sys.maxsize, max=sys.maxsize), + ], + 
outputs=[io.Float.Output()], + ) -NODE_DISPLAY_NAME_MAPPINGS = { - "PrimitiveString": "String", - "PrimitiveStringMultiline": "String (Multiline)", - "PrimitiveInt": "Int", - "PrimitiveFloat": "Float", - "PrimitiveBoolean": "Boolean", -} + @classmethod + def execute(cls, value: float) -> io.NodeOutput: + return io.NodeOutput(value) + + +class Boolean(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="PrimitiveBoolean", + display_name="Boolean", + category="utils/primitive", + inputs=[ + io.Boolean.Input("value"), + ], + outputs=[io.Boolean.Output()], + ) + + @classmethod + def execute(cls, value: bool) -> io.NodeOutput: + return io.NodeOutput(value) + + +class PrimitivesExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + String, + StringMultiline, + Int, + Float, + Boolean, + ] + +async def comfy_entrypoint() -> PrimitivesExtension: + return PrimitivesExtension() From 32a627bf1feadb83abba97906a27978b927abd33 Mon Sep 17 00:00:00 2001 From: chaObserv <154517000+chaObserv@users.noreply.github.com> Date: Sun, 31 Aug 2025 12:01:45 +0800 Subject: [PATCH 222/243] SEEDS: update noise decomposition and refactor (#9633) - Update the decomposition to reflect interval dependency - Extract phi computations into functions - Use torch.lerp for interpolation --- comfy/k_diffusion/sampling.py | 135 ++++++++++++++++++---------------- 1 file changed, 73 insertions(+), 62 deletions(-) diff --git a/comfy/k_diffusion/sampling.py b/comfy/k_diffusion/sampling.py index fe6844b17..2d7e09838 100644 --- a/comfy/k_diffusion/sampling.py +++ b/comfy/k_diffusion/sampling.py @@ -171,6 +171,16 @@ def offset_first_sigma_for_snr(sigmas, model_sampling, percent_offset=1e-4): return sigmas +def ei_h_phi_1(h: torch.Tensor) -> torch.Tensor: + """Compute the result of h*phi_1(h) in exponential integrator methods.""" + return torch.expm1(h) + + +def ei_h_phi_2(h: torch.Tensor) -> torch.Tensor: + """Compute the result of h*phi_2(h) in exponential integrator methods.""" + return (torch.expm1(h) - h) / h + + @torch.no_grad() def sample_euler(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1.): """Implements Algorithm 2 (Euler steps) from Karras et al. (2022).""" @@ -1550,13 +1560,12 @@ def sample_er_sde(model, x, sigmas, extra_args=None, callback=None, disable=None @torch.no_grad() def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=0.5): """SEEDS-2 - Stochastic Explicit Exponential Derivative-free Solvers (VP Data Prediction) stage 2. 
- arXiv: https://arxiv.org/abs/2305.14267 + arXiv: https://arxiv.org/abs/2305.14267 (NeurIPS 2023) """ extra_args = {} if extra_args is None else extra_args seed = extra_args.get("seed", None) noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler s_in = x.new_ones([x.shape[0]]) - inject_noise = eta > 0 and s_noise > 0 model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling') @@ -1564,55 +1573,53 @@ def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=Non lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling) sigmas = offset_first_sigma_for_snr(sigmas, model_sampling) + fac = 1 / (2 * r) + for i in trange(len(sigmas) - 1, disable=disable): denoised = model(x, sigmas[i] * s_in, **extra_args) if callback is not None: callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised}) + if sigmas[i + 1] == 0: x = denoised - else: - lambda_s, lambda_t = lambda_fn(sigmas[i]), lambda_fn(sigmas[i + 1]) - h = lambda_t - lambda_s - h_eta = h * (eta + 1) - lambda_s_1 = lambda_s + r * h - fac = 1 / (2 * r) - sigma_s_1 = sigma_fn(lambda_s_1) + continue - # alpha_t = sigma_t * exp(log(alpha_t / sigma_t)) = sigma_t * exp(lambda_t) - alpha_s_1 = sigma_s_1 * lambda_s_1.exp() - alpha_t = sigmas[i + 1] * lambda_t.exp() + lambda_s, lambda_t = lambda_fn(sigmas[i]), lambda_fn(sigmas[i + 1]) + h = lambda_t - lambda_s + h_eta = h * (eta + 1) + lambda_s_1 = torch.lerp(lambda_s, lambda_t, r) + sigma_s_1 = sigma_fn(lambda_s_1) - coeff_1, coeff_2 = (-r * h_eta).expm1(), (-h_eta).expm1() - if inject_noise: - # 0 < r < 1 - noise_coeff_1 = (-2 * r * h * eta).expm1().neg().sqrt() - noise_coeff_2 = (-r * h * eta).exp() * (-2 * (1 - r) * h * eta).expm1().neg().sqrt() - noise_1, noise_2 = noise_sampler(sigmas[i], sigma_s_1), noise_sampler(sigma_s_1, sigmas[i + 1]) + alpha_s_1 = sigma_s_1 * lambda_s_1.exp() + alpha_t = sigmas[i + 1] * lambda_t.exp() - # Step 1 - x_2 = sigma_s_1 / sigmas[i] * (-r * h * eta).exp() * x - alpha_s_1 * coeff_1 * denoised - if inject_noise: - x_2 = x_2 + sigma_s_1 * (noise_coeff_1 * noise_1) * s_noise - denoised_2 = model(x_2, sigma_s_1 * s_in, **extra_args) + # Step 1 + x_2 = sigma_s_1 / sigmas[i] * (-r * h * eta).exp() * x - alpha_s_1 * ei_h_phi_1(-r * h_eta) * denoised + if inject_noise: + sde_noise = (-2 * r * h * eta).expm1().neg().sqrt() * noise_sampler(sigmas[i], sigma_s_1) + x_2 = x_2 + sde_noise * sigma_s_1 * s_noise + denoised_2 = model(x_2, sigma_s_1 * s_in, **extra_args) - # Step 2 - denoised_d = (1 - fac) * denoised + fac * denoised_2 - x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x - alpha_t * coeff_2 * denoised_d - if inject_noise: - x = x + sigmas[i + 1] * (noise_coeff_2 * noise_1 + noise_coeff_1 * noise_2) * s_noise + # Step 2 + denoised_d = torch.lerp(denoised, denoised_2, fac) + x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x - alpha_t * ei_h_phi_1(-h_eta) * denoised_d + if inject_noise: + segment_factor = (r - 1) * h * eta + sde_noise = sde_noise * segment_factor.exp() + sde_noise = sde_noise + segment_factor.mul(2).expm1().neg().sqrt() * noise_sampler(sigma_s_1, sigmas[i + 1]) + x = x + sde_noise * sigmas[i + 1] * s_noise return x @torch.no_grad() def sample_seeds_3(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r_1=1./3, r_2=2./3): """SEEDS-3 - Stochastic Explicit Exponential Derivative-free Solvers (VP Data Prediction) stage 3. 
- arXiv: https://arxiv.org/abs/2305.14267 + arXiv: https://arxiv.org/abs/2305.14267 (NeurIPS 2023) """ extra_args = {} if extra_args is None else extra_args seed = extra_args.get("seed", None) noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler s_in = x.new_ones([x.shape[0]]) - inject_noise = eta > 0 and s_noise > 0 model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling') @@ -1624,45 +1631,49 @@ def sample_seeds_3(model, x, sigmas, extra_args=None, callback=None, disable=Non denoised = model(x, sigmas[i] * s_in, **extra_args) if callback is not None: callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised}) + if sigmas[i + 1] == 0: x = denoised - else: - lambda_s, lambda_t = lambda_fn(sigmas[i]), lambda_fn(sigmas[i + 1]) - h = lambda_t - lambda_s - h_eta = h * (eta + 1) - lambda_s_1 = lambda_s + r_1 * h - lambda_s_2 = lambda_s + r_2 * h - sigma_s_1, sigma_s_2 = sigma_fn(lambda_s_1), sigma_fn(lambda_s_2) + continue - # alpha_t = sigma_t * exp(log(alpha_t / sigma_t)) = sigma_t * exp(lambda_t) - alpha_s_1 = sigma_s_1 * lambda_s_1.exp() - alpha_s_2 = sigma_s_2 * lambda_s_2.exp() - alpha_t = sigmas[i + 1] * lambda_t.exp() + lambda_s, lambda_t = lambda_fn(sigmas[i]), lambda_fn(sigmas[i + 1]) + h = lambda_t - lambda_s + h_eta = h * (eta + 1) + lambda_s_1 = torch.lerp(lambda_s, lambda_t, r_1) + lambda_s_2 = torch.lerp(lambda_s, lambda_t, r_2) + sigma_s_1, sigma_s_2 = sigma_fn(lambda_s_1), sigma_fn(lambda_s_2) - coeff_1, coeff_2, coeff_3 = (-r_1 * h_eta).expm1(), (-r_2 * h_eta).expm1(), (-h_eta).expm1() - if inject_noise: - # 0 < r_1 < r_2 < 1 - noise_coeff_1 = (-2 * r_1 * h * eta).expm1().neg().sqrt() - noise_coeff_2 = (-r_1 * h * eta).exp() * (-2 * (r_2 - r_1) * h * eta).expm1().neg().sqrt() - noise_coeff_3 = (-r_2 * h * eta).exp() * (-2 * (1 - r_2) * h * eta).expm1().neg().sqrt() - noise_1, noise_2, noise_3 = noise_sampler(sigmas[i], sigma_s_1), noise_sampler(sigma_s_1, sigma_s_2), noise_sampler(sigma_s_2, sigmas[i + 1]) + alpha_s_1 = sigma_s_1 * lambda_s_1.exp() + alpha_s_2 = sigma_s_2 * lambda_s_2.exp() + alpha_t = sigmas[i + 1] * lambda_t.exp() - # Step 1 - x_2 = sigma_s_1 / sigmas[i] * (-r_1 * h * eta).exp() * x - alpha_s_1 * coeff_1 * denoised - if inject_noise: - x_2 = x_2 + sigma_s_1 * (noise_coeff_1 * noise_1) * s_noise - denoised_2 = model(x_2, sigma_s_1 * s_in, **extra_args) + # Step 1 + x_2 = sigma_s_1 / sigmas[i] * (-r_1 * h * eta).exp() * x - alpha_s_1 * ei_h_phi_1(-r_1 * h_eta) * denoised + if inject_noise: + sde_noise = (-2 * r_1 * h * eta).expm1().neg().sqrt() * noise_sampler(sigmas[i], sigma_s_1) + x_2 = x_2 + sde_noise * sigma_s_1 * s_noise + denoised_2 = model(x_2, sigma_s_1 * s_in, **extra_args) - # Step 2 - x_3 = sigma_s_2 / sigmas[i] * (-r_2 * h * eta).exp() * x - alpha_s_2 * coeff_2 * denoised + (r_2 / r_1) * alpha_s_2 * (coeff_2 / (r_2 * h_eta) + 1) * (denoised_2 - denoised) - if inject_noise: - x_3 = x_3 + sigma_s_2 * (noise_coeff_2 * noise_1 + noise_coeff_1 * noise_2) * s_noise - denoised_3 = model(x_3, sigma_s_2 * s_in, **extra_args) + # Step 2 + a3_2 = r_2 / r_1 * ei_h_phi_2(-r_2 * h_eta) + a3_1 = ei_h_phi_1(-r_2 * h_eta) - a3_2 + x_3 = sigma_s_2 / sigmas[i] * (-r_2 * h * eta).exp() * x - alpha_s_2 * (a3_1 * denoised + a3_2 * denoised_2) + if inject_noise: + segment_factor = (r_1 - r_2) * h * eta + sde_noise = sde_noise * segment_factor.exp() + sde_noise = sde_noise + segment_factor.mul(2).expm1().neg().sqrt() * noise_sampler(sigma_s_1, sigma_s_2) + x_3 = x_3 + 
sde_noise * sigma_s_2 * s_noise + denoised_3 = model(x_3, sigma_s_2 * s_in, **extra_args) - # Step 3 - x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x - alpha_t * coeff_3 * denoised + (1. / r_2) * alpha_t * (coeff_3 / h_eta + 1) * (denoised_3 - denoised) - if inject_noise: - x = x + sigmas[i + 1] * (noise_coeff_3 * noise_1 + noise_coeff_2 * noise_2 + noise_coeff_1 * noise_3) * s_noise + # Step 3 + b3 = ei_h_phi_2(-h_eta) / r_2 + b1 = ei_h_phi_1(-h_eta) - b3 + x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x - alpha_t * (b1 * denoised + b3 * denoised_3) + if inject_noise: + segment_factor = (r_2 - 1) * h * eta + sde_noise = sde_noise * segment_factor.exp() + sde_noise = sde_noise + segment_factor.mul(2).expm1().neg().sqrt() * noise_sampler(sigma_s_2, sigmas[i + 1]) + x = x + sde_noise * sigmas[i + 1] * s_noise return x From 9b151559721ff6c8d93150f3d8a53259a23911cd Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 30 Aug 2025 22:32:10 -0700 Subject: [PATCH 223/243] Probably not necessary anymore. (#9646) --- main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/main.py b/main.py index b23d50816..c33f0e17b 100644 --- a/main.py +++ b/main.py @@ -113,7 +113,6 @@ import gc if os.name == "nt": os.environ['MIMALLOC_PURGE_DELAY'] = '0' - logging.getLogger("xformers").addFilter(lambda record: 'A matching Triton is not available' not in record.getMessage()) if __name__ == "__main__": if args.default_device is not None: From 27e067ce505c102fd0f2be0f1242016c59a6816f Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 1 Sep 2025 15:54:02 -0700 Subject: [PATCH 224/243] Implement the USO subject identity lora. (#9674) Use the lora with FluxContextMultiReferenceLatentMethod node set to "uso" and a ReferenceLatent node with the reference image. --- comfy/ldm/flux/model.py | 10 ++++++++-- comfy/lora.py | 4 ++++ comfy/lora_convert.py | 19 +++++++++++++++++++ comfy_extras/nodes_flux.py | 2 +- 4 files changed, 32 insertions(+), 3 deletions(-) diff --git a/comfy/ldm/flux/model.py b/comfy/ldm/flux/model.py index 1344c3a57..1e62f4626 100644 --- a/comfy/ldm/flux/model.py +++ b/comfy/ldm/flux/model.py @@ -233,12 +233,18 @@ class Flux(nn.Module): h = 0 w = 0 index = 0 - index_ref_method = kwargs.get("ref_latents_method", "offset") == "index" + ref_latents_method = kwargs.get("ref_latents_method", "offset") for ref in ref_latents: - if index_ref_method: + if ref_latents_method == "index": index += 1 h_offset = 0 w_offset = 0 + elif ref_latents_method == "uso": + index = 0 + h_offset = h_len * patch_size + h + w_offset = w_len * patch_size + w + h += ref.shape[-2] + w += ref.shape[-1] else: index = 1 h_offset = 0 diff --git a/comfy/lora.py b/comfy/lora.py index 00358884b..4a44f1318 100644 --- a/comfy/lora.py +++ b/comfy/lora.py @@ -260,6 +260,10 @@ def model_lora_keys_unet(model, key_map={}): key_map["transformer.{}".format(k[:-len(".weight")])] = to #simpletrainer and probably regular diffusers flux lora format key_map["lycoris_{}".format(k[:-len(".weight")].replace(".", "_"))] = to #simpletrainer lycoris key_map["lora_transformer_{}".format(k[:-len(".weight")].replace(".", "_"))] = to #onetrainer + for k in sdk: + hidden_size = model.model_config.unet_config.get("hidden_size", 0) + if k.endswith(".weight") and ".linear1." 
in k: + key_map["{}".format(k.replace(".linear1.weight", ".linear1_qkv"))] = (k, (0, 0, hidden_size * 3)) if isinstance(model, comfy.model_base.GenmoMochi): for k in sdk: diff --git a/comfy/lora_convert.py b/comfy/lora_convert.py index 3e00b63db..9d8d21efe 100644 --- a/comfy/lora_convert.py +++ b/comfy/lora_convert.py @@ -15,10 +15,29 @@ def convert_lora_bfl_control(sd): #BFL loras for Flux def convert_lora_wan_fun(sd): #Wan Fun loras return comfy.utils.state_dict_prefix_replace(sd, {"lora_unet__": "lora_unet_"}) +def convert_uso_lora(sd): + sd_out = {} + for k in sd: + tensor = sd[k] + k_to = "diffusion_model.{}".format(k.replace(".down.weight", ".lora_down.weight") + .replace(".up.weight", ".lora_up.weight") + .replace(".qkv_lora2.", ".txt_attn.qkv.") + .replace(".qkv_lora1.", ".img_attn.qkv.") + .replace(".proj_lora1.", ".img_attn.proj.") + .replace(".proj_lora2.", ".txt_attn.proj.") + .replace(".qkv_lora.", ".linear1_qkv.") + .replace(".proj_lora.", ".linear2.") + .replace(".processor.", ".") + ) + sd_out[k_to] = tensor + return sd_out + def convert_lora(sd): if "img_in.lora_A.weight" in sd and "single_blocks.0.norm.key_norm.scale" in sd: return convert_lora_bfl_control(sd) if "lora_unet__blocks_0_cross_attn_k.lora_down.weight" in sd: return convert_lora_wan_fun(sd) + if "single_blocks.37.processor.qkv_lora.up.weight" in sd and "double_blocks.18.processor.qkv_lora2.up.weight" in sd: + return convert_uso_lora(sd) return sd diff --git a/comfy_extras/nodes_flux.py b/comfy_extras/nodes_flux.py index c8db75bb3..1bf7ddd92 100644 --- a/comfy_extras/nodes_flux.py +++ b/comfy_extras/nodes_flux.py @@ -105,7 +105,7 @@ class FluxKontextMultiReferenceLatentMethod: def INPUT_TYPES(s): return {"required": { "conditioning": ("CONDITIONING", ), - "reference_latents_method": (("offset", "index"), ), + "reference_latents_method": (("offset", "index", "uso"), ), }} RETURN_TYPES = ("CONDITIONING",) From e2d1e5dad98dbbcf505703ea8663f20101e6570a Mon Sep 17 00:00:00 2001 From: contentis Date: Tue, 2 Sep 2025 02:33:50 +0200 Subject: [PATCH 225/243] Enable Convolution AutoTuning (#9301) --- comfy/cli_args.py | 1 + comfy/ops.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/comfy/cli_args.py b/comfy/cli_args.py index de3e85c08..72eeaea9a 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -143,6 +143,7 @@ class PerformanceFeature(enum.Enum): Fp16Accumulation = "fp16_accumulation" Fp8MatrixMultiplication = "fp8_matrix_mult" CublasOps = "cublas_ops" + AutoTune = "autotune" parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. 
Current valid optimizations: fp16_accumulation fp8_matrix_mult cublas_ops") diff --git a/comfy/ops.py b/comfy/ops.py index 18e7db705..55e958adb 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -52,6 +52,9 @@ except (ModuleNotFoundError, TypeError): cast_to = comfy.model_management.cast_to #TODO: remove once no more references +if torch.cuda.is_available() and torch.backends.cudnn.is_available() and PerformanceFeature.AutoTune in args.fast: + torch.backends.cudnn.benchmark = True + def cast_to_input(weight, input, non_blocking=False, copy=True): return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy) From 3412d53b1d69e4dfedf7e86c3092cea085094053 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 2 Sep 2025 12:36:22 -0700 Subject: [PATCH 226/243] USO style reference. (#9677) Load the projector.safetensors file with the ModelPatchLoader node and use the siglip_vision_patch14_384.safetensors "clip vision" model and the USOStyleReferenceNode. --- comfy/clip_model.py | 12 +- comfy/clip_vision.py | 18 ++- comfy/ldm/flux/model.py | 11 +- comfy/model_patcher.py | 3 + comfy_extras/nodes_model_patch.py | 186 +++++++++++++++++++++++++++++- 5 files changed, 222 insertions(+), 8 deletions(-) diff --git a/comfy/clip_model.py b/comfy/clip_model.py index 7e47d8a55..7c0cadab5 100644 --- a/comfy/clip_model.py +++ b/comfy/clip_model.py @@ -61,8 +61,12 @@ class CLIPEncoder(torch.nn.Module): def forward(self, x, mask=None, intermediate_output=None): optimized_attention = optimized_attention_for_device(x.device, mask=mask is not None, small_input=True) + all_intermediate = None if intermediate_output is not None: - if intermediate_output < 0: + if intermediate_output == "all": + all_intermediate = [] + intermediate_output = None + elif intermediate_output < 0: intermediate_output = len(self.layers) + intermediate_output intermediate = None @@ -70,6 +74,12 @@ class CLIPEncoder(torch.nn.Module): x = l(x, mask, optimized_attention) if i == intermediate_output: intermediate = x.clone() + if all_intermediate is not None: + all_intermediate.append(x.unsqueeze(1).clone()) + + if all_intermediate is not None: + intermediate = torch.cat(all_intermediate, dim=1) + return x, intermediate class CLIPEmbeddings(torch.nn.Module): diff --git a/comfy/clip_vision.py b/comfy/clip_vision.py index 00aab9164..2fa410cb7 100644 --- a/comfy/clip_vision.py +++ b/comfy/clip_vision.py @@ -50,7 +50,13 @@ class ClipVisionModel(): self.image_size = config.get("image_size", 224) self.image_mean = config.get("image_mean", [0.48145466, 0.4578275, 0.40821073]) self.image_std = config.get("image_std", [0.26862954, 0.26130258, 0.27577711]) - model_class = IMAGE_ENCODERS.get(config.get("model_type", "clip_vision_model")) + model_type = config.get("model_type", "clip_vision_model") + model_class = IMAGE_ENCODERS.get(model_type) + if model_type == "siglip_vision_model": + self.return_all_hidden_states = True + else: + self.return_all_hidden_states = False + self.load_device = comfy.model_management.text_encoder_device() offload_device = comfy.model_management.text_encoder_offload_device() self.dtype = comfy.model_management.text_encoder_dtype(self.load_device) @@ -68,12 +74,18 @@ class ClipVisionModel(): def encode_image(self, image, crop=True): comfy.model_management.load_model_gpu(self.patcher) pixel_values = clip_preprocess(image.to(self.load_device), size=self.image_size, mean=self.image_mean, std=self.image_std, crop=crop).float() - out 
= self.model(pixel_values=pixel_values, intermediate_output=-2) + out = self.model(pixel_values=pixel_values, intermediate_output='all' if self.return_all_hidden_states else -2) outputs = Output() outputs["last_hidden_state"] = out[0].to(comfy.model_management.intermediate_device()) outputs["image_embeds"] = out[2].to(comfy.model_management.intermediate_device()) - outputs["penultimate_hidden_states"] = out[1].to(comfy.model_management.intermediate_device()) + if self.return_all_hidden_states: + all_hs = out[1].to(comfy.model_management.intermediate_device()) + outputs["penultimate_hidden_states"] = all_hs[:, -2] + outputs["all_hidden_states"] = all_hs + else: + outputs["penultimate_hidden_states"] = out[1].to(comfy.model_management.intermediate_device()) + outputs["mm_projected"] = out[3] return outputs diff --git a/comfy/ldm/flux/model.py b/comfy/ldm/flux/model.py index 1e62f4626..d4be6bb61 100644 --- a/comfy/ldm/flux/model.py +++ b/comfy/ldm/flux/model.py @@ -106,6 +106,7 @@ class Flux(nn.Module): if y is None: y = torch.zeros((img.shape[0], self.params.vec_in_dim), device=img.device, dtype=img.dtype) + patches = transformer_options.get("patches", {}) patches_replace = transformer_options.get("patches_replace", {}) if img.ndim != 3 or txt.ndim != 3: raise ValueError("Input img and txt tensors must have 3 dimensions.") @@ -117,9 +118,17 @@ class Flux(nn.Module): if guidance is not None: vec = vec + self.guidance_in(timestep_embedding(guidance, 256).to(img.dtype)) - vec = vec + self.vector_in(y[:,:self.params.vec_in_dim]) + vec = vec + self.vector_in(y[:, :self.params.vec_in_dim]) txt = self.txt_in(txt) + if "post_input" in patches: + for p in patches["post_input"]: + out = p({"img": img, "txt": txt, "img_ids": img_ids, "txt_ids": txt_ids}) + img = out["img"] + txt = out["txt"] + img_ids = out["img_ids"] + txt_ids = out["txt_ids"] + if img_ids is not None: ids = torch.cat((txt_ids, img_ids), dim=1) pe = self.pe_embedder(ids) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index a944cb421..1fd03d9d1 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -433,6 +433,9 @@ class ModelPatcher: def set_model_double_block_patch(self, patch): self.set_model_patch(patch, "double_block") + def set_model_post_input_patch(self, patch): + self.set_model_patch(patch, "post_input") + def add_object_patch(self, name, obj): self.object_patches[name] = obj diff --git a/comfy_extras/nodes_model_patch.py b/comfy_extras/nodes_model_patch.py index 65e766b52..783c59b6b 100644 --- a/comfy_extras/nodes_model_patch.py +++ b/comfy_extras/nodes_model_patch.py @@ -1,4 +1,5 @@ import torch +from torch import nn import folder_paths import comfy.utils import comfy.ops @@ -58,6 +59,136 @@ class QwenImageBlockWiseControlNet(torch.nn.Module): return self.controlnet_blocks[block_id](img, controlnet_conditioning) +class SigLIPMultiFeatProjModel(torch.nn.Module): + """ + SigLIP Multi-Feature Projection Model for processing style features from different layers + and projecting them into a unified hidden space. 
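Roughly speaking, each selected SigLIP layer (low, mid, high) goes through the same recipe: a Linear plus SiLU applied on the transposed tokens squeezes the token axis down to style_token_nums, an optional LayerNorm follows, and a final Linear projects the channel axis up to the transformer hidden size; the three results are concatenated along the token axis. A minimal shape sketch under the defaults used here (729 SigLIP tokens of width 1152, 64 style tokens, hidden size 3072), with SiLU and LayerNorm omitted for brevity and independent of the actual module:

import torch

bs, siglip_tokens, siglip_dims = 1, 729, 1152
style_tokens, hidden_size = 64, 3072

hidden = torch.randn(bs, siglip_tokens, siglip_dims)      # one SigLIP hidden-state layer
squeeze = torch.nn.Linear(siglip_tokens, style_tokens)    # token-axis reduction
project = torch.nn.Linear(siglip_dims, hidden_size)       # channel projection

x = squeeze(hidden.transpose(1, 2)).transpose(1, 2)       # [bs, style_tokens, siglip_dims]
x = project(x)                                            # [bs, style_tokens, hidden_size]
print(x.shape)                                            # torch.Size([1, 64, 3072])
# Repeating this for layers -20, -11 and -2 and concatenating on dim=1
# gives [bs, 3 * style_tokens, hidden_size], i.e. [1, 192, 3072].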
+ + Args: + siglip_token_nums (int): Number of SigLIP tokens, default 257 + style_token_nums (int): Number of style tokens, default 256 + siglip_token_dims (int): Dimension of SigLIP tokens, default 1536 + hidden_size (int): Hidden layer size, default 3072 + context_layer_norm (bool): Whether to use context layer normalization, default False + """ + + def __init__( + self, + siglip_token_nums: int = 729, + style_token_nums: int = 64, + siglip_token_dims: int = 1152, + hidden_size: int = 3072, + context_layer_norm: bool = True, + device=None, dtype=None, operations=None + ): + super().__init__() + + # High-level feature processing (layer -2) + self.high_embedding_linear = nn.Sequential( + operations.Linear(siglip_token_nums, style_token_nums), + nn.SiLU() + ) + self.high_layer_norm = ( + operations.LayerNorm(siglip_token_dims) if context_layer_norm else nn.Identity() + ) + self.high_projection = operations.Linear(siglip_token_dims, hidden_size, bias=True) + + # Mid-level feature processing (layer -11) + self.mid_embedding_linear = nn.Sequential( + operations.Linear(siglip_token_nums, style_token_nums), + nn.SiLU() + ) + self.mid_layer_norm = ( + operations.LayerNorm(siglip_token_dims) if context_layer_norm else nn.Identity() + ) + self.mid_projection = operations.Linear(siglip_token_dims, hidden_size, bias=True) + + # Low-level feature processing (layer -20) + self.low_embedding_linear = nn.Sequential( + operations.Linear(siglip_token_nums, style_token_nums), + nn.SiLU() + ) + self.low_layer_norm = ( + operations.LayerNorm(siglip_token_dims) if context_layer_norm else nn.Identity() + ) + self.low_projection = operations.Linear(siglip_token_dims, hidden_size, bias=True) + + def forward(self, siglip_outputs): + """ + Forward pass function + + Args: + siglip_outputs: Output from SigLIP model, containing hidden_states + + Returns: + torch.Tensor: Concatenated multi-layer features with shape [bs, 3*style_token_nums, hidden_size] + """ + dtype = next(self.high_embedding_linear.parameters()).dtype + + # Process high-level features (layer -2) + high_embedding = self._process_layer_features( + siglip_outputs[2], + self.high_embedding_linear, + self.high_layer_norm, + self.high_projection, + dtype + ) + + # Process mid-level features (layer -11) + mid_embedding = self._process_layer_features( + siglip_outputs[1], + self.mid_embedding_linear, + self.mid_layer_norm, + self.mid_projection, + dtype + ) + + # Process low-level features (layer -20) + low_embedding = self._process_layer_features( + siglip_outputs[0], + self.low_embedding_linear, + self.low_layer_norm, + self.low_projection, + dtype + ) + + # Concatenate features from all layersmodel_patch + return torch.cat((high_embedding, mid_embedding, low_embedding), dim=1) + + def _process_layer_features( + self, + hidden_states: torch.Tensor, + embedding_linear: nn.Module, + layer_norm: nn.Module, + projection: nn.Module, + dtype: torch.dtype + ) -> torch.Tensor: + """ + Helper function to process features from a single layer + + Args: + hidden_states: Input hidden states [bs, seq_len, dim] + embedding_linear: Embedding linear layer + layer_norm: Layer normalization + projection: Projection layer + dtype: Target data type + + Returns: + torch.Tensor: Processed features [bs, style_token_nums, hidden_size] + """ + # Transform dimensions: [bs, seq_len, dim] -> [bs, dim, seq_len] -> [bs, dim, style_token_nums] -> [bs, style_token_nums, dim] + embedding = embedding_linear( + hidden_states.to(dtype).transpose(1, 2) + ).transpose(1, 2) + + # Apply layer 
normalization + embedding = layer_norm(embedding) + + # Project to target hidden space + embedding = projection(embedding) + + return embedding + class ModelPatchLoader: @classmethod def INPUT_TYPES(s): @@ -73,9 +204,14 @@ class ModelPatchLoader: model_patch_path = folder_paths.get_full_path_or_raise("model_patches", name) sd = comfy.utils.load_torch_file(model_patch_path, safe_load=True) dtype = comfy.utils.weight_dtype(sd) - # TODO: this node will work with more types of model patches - additional_in_dim = sd["img_in.weight"].shape[1] - 64 - model = QwenImageBlockWiseControlNet(additional_in_dim=additional_in_dim, device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast) + + if 'controlnet_blocks.0.y_rms.weight' in sd: + additional_in_dim = sd["img_in.weight"].shape[1] - 64 + model = QwenImageBlockWiseControlNet(additional_in_dim=additional_in_dim, device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast) + elif 'feature_embedder.mid_layer_norm.bias' in sd: + sd = comfy.utils.state_dict_prefix_replace(sd, {"feature_embedder.": ""}, filter_keys=True) + model = SigLIPMultiFeatProjModel(device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast) + model.load_state_dict(sd) model = comfy.model_patcher.ModelPatcher(model, load_device=comfy.model_management.get_torch_device(), offload_device=comfy.model_management.unet_offload_device()) return (model,) @@ -157,7 +293,51 @@ class QwenImageDiffsynthControlnet: return (model_patched,) +class UsoStyleProjectorPatch: + def __init__(self, model_patch, encoded_image): + self.model_patch = model_patch + self.encoded_image = encoded_image + + def __call__(self, kwargs): + txt_ids = kwargs.get("txt_ids") + txt = kwargs.get("txt") + siglip_embedding = self.model_patch.model(self.encoded_image.to(txt.dtype)).to(txt.dtype) + txt = torch.cat([siglip_embedding, txt], dim=1) + kwargs['txt'] = txt + kwargs['txt_ids'] = torch.cat([torch.zeros(siglip_embedding.shape[0], siglip_embedding.shape[1], 3, dtype=txt_ids.dtype, device=txt_ids.device), txt_ids], dim=1) + return kwargs + + def to(self, device_or_dtype): + if isinstance(device_or_dtype, torch.device): + self.encoded_image = self.encoded_image.to(device_or_dtype) + return self + + def models(self): + return [self.model_patch] + + +class USOStyleReference: + @classmethod + def INPUT_TYPES(s): + return {"required": {"model": ("MODEL",), + "model_patch": ("MODEL_PATCH",), + "clip_vision_output": ("CLIP_VISION_OUTPUT", ), + }} + RETURN_TYPES = ("MODEL",) + FUNCTION = "apply_patch" + EXPERIMENTAL = True + + CATEGORY = "advanced/model_patches/flux" + + def apply_patch(self, model, model_patch, clip_vision_output): + encoded_image = torch.stack((clip_vision_output.all_hidden_states[:, -20], clip_vision_output.all_hidden_states[:, -11], clip_vision_output.penultimate_hidden_states)) + model_patched = model.clone() + model_patched.set_model_post_input_patch(UsoStyleProjectorPatch(model_patch, encoded_image)) + return (model_patched,) + + NODE_CLASS_MAPPINGS = { "ModelPatchLoader": ModelPatchLoader, "QwenImageDiffsynthControlnet": QwenImageDiffsynthControlnet, + "USOStyleReference": USOStyleReference, } From e3018c2a5aeb99f0c5b595621949a451686ce55a Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 2 Sep 2025 13:12:07 -0700 Subject: [PATCH 227/243] uso -> uxo/uno as requested. 
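For context on the renamed option: with "index", each reference latent keeps zero spatial offsets and only increments the temporal index, while with "uxo" the index stays 0 and every reference is shifted past the accumulated (h, w) extent, so successive references land in a growing diagonal region of the positional grid. A rough standalone sketch of that offset bookkeeping (function and variable names are illustrative, not the model code):

def uxo_offsets(main_hw, ref_shapes, patch_size=2):
    # main_hw: (h_len, w_len) of the main image in latent patches
    # ref_shapes: (height, width) of each reference latent
    h_len, w_len = main_hw
    h = w = 0
    placements = []
    for rh, rw in ref_shapes:
        placements.append((h_len * patch_size + h, w_len * patch_size + w))
        h += rh
        w += rw
    return placements

print(uxo_offsets((64, 64), [(64, 64), (32, 32)]))
# [(128, 128), (192, 192)], each reference starts past the previous one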
(#9688) --- comfy/ldm/flux/model.py | 2 +- comfy_extras/nodes_flux.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/comfy/ldm/flux/model.py b/comfy/ldm/flux/model.py index d4be6bb61..8ea7d4f57 100644 --- a/comfy/ldm/flux/model.py +++ b/comfy/ldm/flux/model.py @@ -248,7 +248,7 @@ class Flux(nn.Module): index += 1 h_offset = 0 w_offset = 0 - elif ref_latents_method == "uso": + elif ref_latents_method == "uxo": index = 0 h_offset = h_len * patch_size + h w_offset = w_len * patch_size + w diff --git a/comfy_extras/nodes_flux.py b/comfy_extras/nodes_flux.py index 1bf7ddd92..25e029ffd 100644 --- a/comfy_extras/nodes_flux.py +++ b/comfy_extras/nodes_flux.py @@ -105,7 +105,7 @@ class FluxKontextMultiReferenceLatentMethod: def INPUT_TYPES(s): return {"required": { "conditioning": ("CONDITIONING", ), - "reference_latents_method": (("offset", "index", "uso"), ), + "reference_latents_method": (("offset", "index", "uxo/uno"), ), }} RETURN_TYPES = ("CONDITIONING",) @@ -115,6 +115,8 @@ class FluxKontextMultiReferenceLatentMethod: CATEGORY = "advanced/conditioning/flux" def append(self, conditioning, reference_latents_method): + if "uxo" in reference_latents_method or "uso" in reference_latents_method: + reference_latents_method = "uxo" c = node_helpers.conditioning_set_values(conditioning, {"reference_latents_method": reference_latents_method}) return (c, ) From 464ba1d6140eda6a0173703ac00c69f7fddab6ba Mon Sep 17 00:00:00 2001 From: Deep Roy Date: Tue, 2 Sep 2025 19:41:10 -0400 Subject: [PATCH 228/243] Accept prompt_id in interrupt handler (#9607) * Accept prompt_id in interrupt handler * remove a log --- server.py | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/server.py b/server.py index 8f9c88ebf..3d323eaf8 100644 --- a/server.py +++ b/server.py @@ -729,7 +729,34 @@ class PromptServer(): @routes.post("/interrupt") async def post_interrupt(request): - nodes.interrupt_processing() + try: + json_data = await request.json() + except json.JSONDecodeError: + json_data = {} + + # Check if a specific prompt_id was provided for targeted interruption + prompt_id = json_data.get('prompt_id') + if prompt_id: + currently_running, _ = self.prompt_queue.get_current_queue() + + # Check if the prompt_id matches any currently running prompt + should_interrupt = False + for item in currently_running: + # item structure: (number, prompt_id, prompt, extra_data, outputs_to_execute) + if item[1] == prompt_id: + logging.info(f"Interrupting prompt {prompt_id}") + should_interrupt = True + break + + if should_interrupt: + nodes.interrupt_processing() + else: + logging.info(f"Prompt {prompt_id} is not currently running, skipping interrupt") + else: + # No prompt_id provided, do a global interrupt + logging.info("Global interrupt (no prompt_id specified)") + nodes.interrupt_processing() + return web.Response(status=200) @routes.post("/free") From 1bcb469089a71fb1946b9f14e994df1b42b83def Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 2 Sep 2025 17:05:57 -0700 Subject: [PATCH 229/243] ImageScaleToMaxDimension node. 
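The node added below clamps the longer side of an image to largest_size and scales the shorter side proportionally, while square inputs simply become largest_size by largest_size. A small worked example of the arithmetic, kept separate from the node itself:

def scale_to_max_dimension(width, height, largest_size):
    # Preserve aspect ratio; the longer edge becomes largest_size.
    if height > width:
        return round((width / height) * largest_size), largest_size
    if width > height:
        return largest_size, round((height / width) * largest_size)
    return largest_size, largest_size

print(scale_to_max_dimension(1920, 1080, 512))  # (512, 288)
print(scale_to_max_dimension(768, 1344, 512))   # (293, 512)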
(#9689) --- comfy_extras/nodes_images.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/comfy_extras/nodes_images.py b/comfy_extras/nodes_images.py index fba80e2ae..392aea32c 100644 --- a/comfy_extras/nodes_images.py +++ b/comfy_extras/nodes_images.py @@ -625,6 +625,37 @@ class ImageFlip: return (image,) +class ImageScaleToMaxDimension: + upscale_methods = ["area", "lanczos", "bilinear", "nearest-exact", "bilinear", "bicubic"] + + @classmethod + def INPUT_TYPES(s): + return {"required": {"image": ("IMAGE",), + "upscale_method": (s.upscale_methods,), + "largest_size": ("INT", {"default": 512, "min": 0, "max": MAX_RESOLUTION, "step": 1})}} + RETURN_TYPES = ("IMAGE",) + FUNCTION = "upscale" + + CATEGORY = "image/upscaling" + + def upscale(self, image, upscale_method, largest_size): + height = image.shape[1] + width = image.shape[2] + + if height > width: + width = round((width / height) * largest_size) + height = largest_size + elif width > height: + height = round((height / width) * largest_size) + width = largest_size + else: + height = largest_size + width = largest_size + + samples = image.movedim(-1, 1) + s = comfy.utils.common_upscale(samples, width, height, upscale_method, "disabled") + s = s.movedim(1, -1) + return (s,) NODE_CLASS_MAPPINGS = { "ImageCrop": ImageCrop, @@ -639,4 +670,5 @@ NODE_CLASS_MAPPINGS = { "GetImageSize": GetImageSize, "ImageRotate": ImageRotate, "ImageFlip": ImageFlip, + "ImageScaleToMaxDimension": ImageScaleToMaxDimension, } From 4f5812b93712e0f52ae8fe80a89e8b5e7d0fa309 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Wed, 3 Sep 2025 08:06:41 +0800 Subject: [PATCH 230/243] Update template to 0.1.73 (#9686) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 7f64aacca..4ebe6cc2a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.25.11 -comfyui-workflow-templates==0.1.70 +comfyui-workflow-templates==0.1.73 comfyui-embedded-docs==0.2.6 torch torchsde From 26d5b86da8ceb4589ee70f12ff2209b93a2d99e0 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 3 Sep 2025 23:17:07 +0300 Subject: [PATCH 231/243] feat(api-nodes): add ByteDance Image nodes (#9477) --- comfy_api_nodes/nodes_bytedance.py | 336 +++++++++++++++++++++++++++++ nodes.py | 1 + 2 files changed, 337 insertions(+) create mode 100644 comfy_api_nodes/nodes_bytedance.py diff --git a/comfy_api_nodes/nodes_bytedance.py b/comfy_api_nodes/nodes_bytedance.py new file mode 100644 index 000000000..fb6aba7fa --- /dev/null +++ b/comfy_api_nodes/nodes_bytedance.py @@ -0,0 +1,336 @@ +import logging +from enum import Enum +from typing import Optional +from typing_extensions import override + +import torch +from pydantic import BaseModel, Field + +from comfy_api.latest import ComfyExtension, io as comfy_io +from comfy_api_nodes.util.validation_utils import ( + validate_image_aspect_ratio_range, + get_number_of_images, +) +from comfy_api_nodes.apis.client import ( + ApiEndpoint, + HttpMethod, + SynchronousOperation, +) +from comfy_api_nodes.apinode_utils import download_url_to_image_tensor, upload_images_to_comfyapi, validate_string + + +BYTEPLUS_ENDPOINT = "/proxy/byteplus/api/v3/images/generations" + + +class Text2ImageModelName(str, Enum): + seedream3 = "seedream-3-0-t2i-250415" + + +class Image2ImageModelName(str, Enum): + seededit3 = "seededit-3-0-i2i-250628" + + +class Text2ImageTaskCreationRequest(BaseModel): + 
model: Text2ImageModelName = Text2ImageModelName.seedream3 + prompt: str = Field(...) + response_format: Optional[str] = Field("url") + size: Optional[str] = Field(None) + seed: Optional[int] = Field(0, ge=0, le=2147483647) + guidance_scale: Optional[float] = Field(..., ge=1.0, le=10.0) + watermark: Optional[bool] = Field(True) + + +class Image2ImageTaskCreationRequest(BaseModel): + model: Image2ImageModelName = Image2ImageModelName.seededit3 + prompt: str = Field(...) + response_format: Optional[str] = Field("url") + image: str = Field(..., description="Base64 encoded string or image URL") + size: Optional[str] = Field("adaptive") + seed: Optional[int] = Field(..., ge=0, le=2147483647) + guidance_scale: Optional[float] = Field(..., ge=1.0, le=10.0) + watermark: Optional[bool] = Field(True) + + +class ImageTaskCreationResponse(BaseModel): + model: str = Field(...) + created: int = Field(..., description="Unix timestamp (in seconds) indicating time when the request was created.") + data: list = Field([], description="Contains information about the generated image(s).") + error: dict = Field({}, description="Contains `code` and `message` fields in case of error.") + + +RECOMMENDED_PRESETS = [ + ("1024x1024 (1:1)", 1024, 1024), + ("864x1152 (3:4)", 864, 1152), + ("1152x864 (4:3)", 1152, 864), + ("1280x720 (16:9)", 1280, 720), + ("720x1280 (9:16)", 720, 1280), + ("832x1248 (2:3)", 832, 1248), + ("1248x832 (3:2)", 1248, 832), + ("1512x648 (21:9)", 1512, 648), + ("2048x2048 (1:1)", 2048, 2048), + ("Custom", None, None), +] + + +def get_image_url_from_response(response: ImageTaskCreationResponse) -> str: + if response.error: + error_msg = f"ByteDance request failed. Code: {response.error['code']}, message: {response.error['message']}" + logging.info(error_msg) + raise RuntimeError(error_msg) + logging.info("ByteDance task succeeded, image URL: %s", response.data[0]["url"]) + return response.data[0]["url"] + + +class ByteDanceImageNode(comfy_io.ComfyNode): + + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="ByteDanceImageNode", + display_name="ByteDance Image", + category="api node/image/ByteDance", + description="Generate images using ByteDance models via api based on prompt", + inputs=[ + comfy_io.Combo.Input( + "model", + options=[model.value for model in Text2ImageModelName], + default=Text2ImageModelName.seedream3.value, + tooltip="Model name", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + tooltip="The text prompt used to generate the image", + ), + comfy_io.Combo.Input( + "size_preset", + options=[label for label, _, _ in RECOMMENDED_PRESETS], + tooltip="Pick a recommended size. Select Custom to use the width and height below", + ), + comfy_io.Int.Input( + "width", + default=1024, + min=512, + max=2048, + step=64, + tooltip="Custom width for image. Value is working only if `size_preset` is set to `Custom`", + ), + comfy_io.Int.Input( + "height", + default=1024, + min=512, + max=2048, + step=64, + tooltip="Custom height for image. 
Value is working only if `size_preset` is set to `Custom`", + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed to use for generation", + optional=True, + ), + comfy_io.Float.Input( + "guidance_scale", + default=2.5, + min=1.0, + max=10.0, + step=0.01, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Higher value makes the image follow the prompt more closely", + optional=True, + ), + comfy_io.Boolean.Input( + "watermark", + default=True, + tooltip="Whether to add an \"AI generated\" watermark to the image", + optional=True, + ), + ], + outputs=[ + comfy_io.Image.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + prompt: str, + size_preset: str, + width: int, + height: int, + seed: int, + guidance_scale: float, + watermark: bool, + ) -> comfy_io.NodeOutput: + validate_string(prompt, strip_whitespace=True, min_length=1) + w = h = None + for label, tw, th in RECOMMENDED_PRESETS: + if label == size_preset: + w, h = tw, th + break + + if w is None or h is None: + w, h = width, height + if not (512 <= w <= 2048) or not (512 <= h <= 2048): + raise ValueError( + f"Custom size out of range: {w}x{h}. " + "Both width and height must be between 512 and 2048 pixels." + ) + + payload = Text2ImageTaskCreationRequest( + model=model, + prompt=prompt, + size=f"{w}x{h}", + seed=seed, + guidance_scale=guidance_scale, + watermark=watermark, + ) + auth_kwargs = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + response = await SynchronousOperation( + endpoint=ApiEndpoint( + path=BYTEPLUS_ENDPOINT, + method=HttpMethod.POST, + request_model=Text2ImageTaskCreationRequest, + response_model=ImageTaskCreationResponse, + ), + request=payload, + auth_kwargs=auth_kwargs, + ).execute() + return comfy_io.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response))) + + +class ByteDanceImageEditNode(comfy_io.ComfyNode): + + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="ByteDanceImageEditNode", + display_name="ByteDance Image Edit", + category="api node/video/ByteDance", + description="Edit images using ByteDance models via api based on prompt", + inputs=[ + comfy_io.Combo.Input( + "model", + options=[model.value for model in Image2ImageModelName], + default=Image2ImageModelName.seededit3.value, + tooltip="Model name", + ), + comfy_io.Image.Input( + "image", + tooltip="The base image to edit", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Instruction to edit image", + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed to use for generation", + optional=True, + ), + comfy_io.Float.Input( + "guidance_scale", + default=5.5, + min=1.0, + max=10.0, + step=0.01, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Higher value makes the image follow the prompt more closely", + optional=True, + ), + comfy_io.Boolean.Input( + "watermark", + default=True, + tooltip="Whether to add an \"AI generated\" watermark to the image", + optional=True, + ), + ], + outputs=[ + comfy_io.Image.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + 
comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + image: torch.Tensor, + prompt: str, + seed: int, + guidance_scale: float, + watermark: bool, + ) -> comfy_io.NodeOutput: + validate_string(prompt, strip_whitespace=True, min_length=1) + if get_number_of_images(image) != 1: + raise ValueError("Exactly one input image is required.") + validate_image_aspect_ratio_range(image, (1, 3), (3, 1)) + auth_kwargs = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + source_url = (await upload_images_to_comfyapi( + image, + max_images=1, + mime_type="image/png", + auth_kwargs=auth_kwargs, + ))[0] + payload = Image2ImageTaskCreationRequest( + model=model, + prompt=prompt, + image=source_url, + seed=seed, + guidance_scale=guidance_scale, + watermark=watermark, + ) + response = await SynchronousOperation( + endpoint=ApiEndpoint( + path=BYTEPLUS_ENDPOINT, + method=HttpMethod.POST, + request_model=Image2ImageTaskCreationRequest, + response_model=ImageTaskCreationResponse, + ), + request=payload, + auth_kwargs=auth_kwargs, + ).execute() + return comfy_io.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response))) + + +class ByteDanceExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + return [ + ByteDanceImageNode, + ByteDanceImageEditNode, + ] + +async def comfy_entrypoint() -> ByteDanceExtension: + return ByteDanceExtension() diff --git a/nodes.py b/nodes.py index 0aff6b14a..6c2f9dd14 100644 --- a/nodes.py +++ b/nodes.py @@ -2344,6 +2344,7 @@ async def init_builtin_api_nodes(): "nodes_veo2.py", "nodes_kling.py", "nodes_bfl.py", + "nodes_bytedance.py", "nodes_luma.py", "nodes_recraft.py", "nodes_pixverse.py", From 50333f1715c03aa4100711eb6d075516a4021d24 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 3 Sep 2025 23:17:37 +0300 Subject: [PATCH 232/243] api nodes(Ideogram): add Ideogram Character (#9616) * api nodes(Ideogram): add Ideogram Character * rename renderingSpeed default value from 'balanced' to 'default' --- comfy_api_nodes/apis/__init__.py | 22 ++++++++- comfy_api_nodes/nodes_ideogram.py | 77 ++++++++++++++++++++++++++++--- 2 files changed, 91 insertions(+), 8 deletions(-) diff --git a/comfy_api_nodes/apis/__init__.py b/comfy_api_nodes/apis/__init__.py index 7a09df55b..78a23db30 100644 --- a/comfy_api_nodes/apis/__init__.py +++ b/comfy_api_nodes/apis/__init__.py @@ -951,7 +951,11 @@ class MagicPrompt2(str, Enum): class StyleType1(str, Enum): + AUTO = 'AUTO' GENERAL = 'GENERAL' + REALISTIC = 'REALISTIC' + DESIGN = 'DESIGN' + FICTION = 'FICTION' class ImagenImageGenerationInstance(BaseModel): @@ -2676,7 +2680,7 @@ class ReleaseNote(BaseModel): class RenderingSpeed(str, Enum): - BALANCED = 'BALANCED' + DEFAULT = 'DEFAULT' TURBO = 'TURBO' QUALITY = 'QUALITY' @@ -4918,6 +4922,14 @@ class IdeogramV3EditRequest(BaseModel): None, description='A set of images to use as style references (maximum total size 10MB across all style references). The images should be in JPEG, PNG or WebP format.', ) + character_reference_images: Optional[List[str]] = Field( + None, + description='Generations with character reference are subject to the character reference pricing. A set of images to use as character references (maximum total size 10MB across all character references), currently only supports 1 character reference image. 
The images should be in JPEG, PNG or WebP format.' + ) + character_reference_images_mask: Optional[List[str]] = Field( + None, + description='Optional masks for character reference images. When provided, must match the number of character_reference_images. Each mask should be a grayscale image of the same dimensions as the corresponding character reference image. The images should be in JPEG, PNG or WebP format.' + ) class IdeogramV3Request(BaseModel): @@ -4951,6 +4963,14 @@ class IdeogramV3Request(BaseModel): style_type: Optional[StyleType1] = Field( None, description='The type of style to apply' ) + character_reference_images: Optional[List[str]] = Field( + None, + description='Generations with character reference are subject to the character reference pricing. A set of images to use as character references (maximum total size 10MB across all character references), currently only supports 1 character reference image. The images should be in JPEG, PNG or WebP format.' + ) + character_reference_images_mask: Optional[List[str]] = Field( + None, + description='Optional masks for character reference images. When provided, must match the number of character_reference_images. Each mask should be a grayscale image of the same dimensions as the corresponding character reference image. The images should be in JPEG, PNG or WebP format.' + ) class ImagenGenerateImageResponse(BaseModel): diff --git a/comfy_api_nodes/nodes_ideogram.py b/comfy_api_nodes/nodes_ideogram.py index d28895f3e..2d1c32e4f 100644 --- a/comfy_api_nodes/nodes_ideogram.py +++ b/comfy_api_nodes/nodes_ideogram.py @@ -255,6 +255,7 @@ class IdeogramV1(comfy_io.ComfyNode): display_name="Ideogram V1", category="api node/image/Ideogram", description="Generates images using the Ideogram V1 model.", + is_api_node=True, inputs=[ comfy_io.String.Input( "prompt", @@ -383,6 +384,7 @@ class IdeogramV2(comfy_io.ComfyNode): display_name="Ideogram V2", category="api node/image/Ideogram", description="Generates images using the Ideogram V2 model.", + is_api_node=True, inputs=[ comfy_io.String.Input( "prompt", @@ -552,6 +554,7 @@ class IdeogramV3(comfy_io.ComfyNode): category="api node/image/Ideogram", description="Generates images using the Ideogram V3 model. 
" "Supports both regular image generation from text prompts and image editing with mask.", + is_api_node=True, inputs=[ comfy_io.String.Input( "prompt", @@ -612,11 +615,21 @@ class IdeogramV3(comfy_io.ComfyNode): ), comfy_io.Combo.Input( "rendering_speed", - options=["BALANCED", "TURBO", "QUALITY"], - default="BALANCED", + options=["DEFAULT", "TURBO", "QUALITY"], + default="DEFAULT", tooltip="Controls the trade-off between generation speed and quality", optional=True, ), + comfy_io.Image.Input( + "character_image", + tooltip="Image to use as character reference.", + optional=True, + ), + comfy_io.Mask.Input( + "character_mask", + tooltip="Optional mask for character reference image.", + optional=True, + ), ], outputs=[ comfy_io.Image.Output(), @@ -639,12 +652,46 @@ class IdeogramV3(comfy_io.ComfyNode): magic_prompt_option="AUTO", seed=0, num_images=1, - rendering_speed="BALANCED", + rendering_speed="DEFAULT", + character_image=None, + character_mask=None, ): auth = { "auth_token": cls.hidden.auth_token_comfy_org, "comfy_api_key": cls.hidden.api_key_comfy_org, } + if rendering_speed == "BALANCED": # for backward compatibility + rendering_speed = "DEFAULT" + + character_img_binary = None + character_mask_binary = None + + if character_image is not None: + input_tensor = character_image.squeeze().cpu() + if character_mask is not None: + character_mask = resize_mask_to_image(character_mask, character_image, allow_gradient=False) + character_mask = 1.0 - character_mask + if character_mask.shape[1:] != character_image.shape[1:-1]: + raise Exception("Character mask and image must be the same size") + + mask_np = (character_mask.squeeze().cpu().numpy() * 255).astype(np.uint8) + mask_img = Image.fromarray(mask_np) + mask_byte_arr = BytesIO() + mask_img.save(mask_byte_arr, format="PNG") + mask_byte_arr.seek(0) + character_mask_binary = mask_byte_arr + character_mask_binary.name = "mask.png" + + img_np = (input_tensor.numpy() * 255).astype(np.uint8) + img = Image.fromarray(img_np) + img_byte_arr = BytesIO() + img.save(img_byte_arr, format="PNG") + img_byte_arr.seek(0) + character_img_binary = img_byte_arr + character_img_binary.name = "image.png" + elif character_mask is not None: + raise Exception("Character mask requires character image to be present") + # Check if both image and mask are provided for editing mode if image is not None and mask is not None: # Edit mode @@ -693,6 +740,15 @@ class IdeogramV3(comfy_io.ComfyNode): if num_images > 1: edit_request.num_images = num_images + files = { + "image": img_binary, + "mask": mask_binary, + } + if character_img_binary: + files["character_reference_images"] = character_img_binary + if character_mask_binary: + files["character_mask_binary"] = character_mask_binary + # Execute the operation for edit mode operation = SynchronousOperation( endpoint=ApiEndpoint( @@ -702,10 +758,7 @@ class IdeogramV3(comfy_io.ComfyNode): response_model=IdeogramGenerateResponse, ), request=edit_request, - files={ - "image": img_binary, - "mask": mask_binary, - }, + files=files, content_type="multipart/form-data", auth_kwargs=auth, ) @@ -739,6 +792,14 @@ class IdeogramV3(comfy_io.ComfyNode): if num_images > 1: gen_request.num_images = num_images + files = {} + if character_img_binary: + files["character_reference_images"] = character_img_binary + if character_mask_binary: + files["character_mask_binary"] = character_mask_binary + if files: + gen_request.style_type = "AUTO" + # Execute the operation for generation mode operation = SynchronousOperation( endpoint=ApiEndpoint( 
@@ -748,6 +809,8 @@ class IdeogramV3(comfy_io.ComfyNode): response_model=IdeogramGenerateResponse, ), request=gen_request, + files=files if files else None, + content_type="multipart/form-data", auth_kwargs=auth, ) From 22da0a83e9a251ca16b9753bf808bfa9f4b023d8 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 3 Sep 2025 23:18:27 +0300 Subject: [PATCH 233/243] [V3] convert Runway API nodes to the V3 schema (#9487) * convert RunAway API nodes to the V3 schema * fixed small typo * fix: add tooltip for "seed" input --- comfy_api_nodes/nodes_runway.py | 744 +++++++++++++++----------------- 1 file changed, 357 insertions(+), 387 deletions(-) diff --git a/comfy_api_nodes/nodes_runway.py b/comfy_api_nodes/nodes_runway.py index 98024a9fa..27b2bf748 100644 --- a/comfy_api_nodes/nodes_runway.py +++ b/comfy_api_nodes/nodes_runway.py @@ -12,6 +12,7 @@ User Guides: """ from typing import Union, Optional, Any +from typing_extensions import override from enum import Enum import torch @@ -46,9 +47,9 @@ from comfy_api_nodes.apinode_utils import ( validate_string, download_url_to_image_tensor, ) -from comfy_api_nodes.mapper_utils import model_field_to_node_input from comfy_api.input_impl import VideoFromFile -from comfy.comfy_types.node_typing import IO, ComfyNodeABC +from comfy_api.latest import ComfyExtension, io as comfy_io +from comfy_api_nodes.util.validation_utils import validate_image_dimensions, validate_image_aspect_ratio PATH_IMAGE_TO_VIDEO = "/proxy/runway/image_to_video" PATH_TEXT_TO_IMAGE = "/proxy/runway/text_to_image" @@ -85,20 +86,11 @@ class RunwayGen3aAspectRatio(str, Enum): def get_video_url_from_task_status(response: TaskStatusResponse) -> Union[str, None]: """Returns the video URL from the task status response if it exists.""" - if response.output and len(response.output) > 0: + if hasattr(response, "output") and len(response.output) > 0: return response.output[0] return None -# TODO: replace with updated image validation utils (upstream) -def validate_input_image(image: torch.Tensor) -> bool: - """ - Validate the input image is within the size limits for the Runway API. 
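The ad hoc size check being removed here is replaced further down by the shared validate_image_dimensions and validate_image_aspect_ratio helpers, which work on the [batch, height, width, channels] IMAGE tensor directly. A rough sketch of the equivalent checks, using illustrative names rather than the helpers themselves:

import torch

def check_runway_input(image: torch.Tensor):
    # image: [B, H, W, C], matching ComfyUI's IMAGE layout
    height, width = image.shape[1], image.shape[2]
    if width > 7999 or height > 7999:
        raise ValueError("Image is larger than the 7999px limit per side.")
    aspect = width / height
    if not (0.5 <= aspect <= 2.0):
        raise ValueError("Aspect ratio must be between 0.5 and 2.0.")

check_runway_input(torch.zeros(1, 1080, 1920, 3))  # 16:9 input passes (ratio ~1.78)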
- See: https://docs.dev.runwayml.com/assets/inputs/#common-error-reasons - """ - return image.shape[2] < 8000 and image.shape[1] < 8000 - - async def poll_until_finished( auth_kwargs: dict[str, str], api_endpoint: ApiEndpoint[Any, TaskStatusResponse], @@ -134,458 +126,438 @@ def extract_progress_from_task_status( def get_image_url_from_task_status(response: TaskStatusResponse) -> Union[str, None]: """Returns the image URL from the task status response if it exists.""" - if response.output and len(response.output) > 0: + if hasattr(response, "output") and len(response.output) > 0: return response.output[0] return None -class RunwayVideoGenNode(ComfyNodeABC): - """Runway Video Node Base.""" +async def get_response( + task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None, estimated_duration: Optional[int] = None +) -> TaskStatusResponse: + """Poll the task status until it is finished then get the response.""" + return await poll_until_finished( + auth_kwargs, + ApiEndpoint( + path=f"{PATH_GET_TASK_STATUS}/{task_id}", + method=HttpMethod.GET, + request_model=EmptyRequest, + response_model=TaskStatusResponse, + ), + estimated_duration=estimated_duration, + node_id=node_id, + ) - RETURN_TYPES = ("VIDEO",) - FUNCTION = "api_call" - CATEGORY = "api node/video/Runway" - API_NODE = True - def validate_task_created(self, response: RunwayImageToVideoResponse) -> bool: - """ - Validate the task creation response from the Runway API matches - expected format. - """ - if not bool(response.id): - raise RunwayApiError("Invalid initial response from Runway API.") - return True +async def generate_video( + request: RunwayImageToVideoRequest, + auth_kwargs: dict[str, str], + node_id: Optional[str] = None, + estimated_duration: Optional[int] = None, +) -> VideoFromFile: + initial_operation = SynchronousOperation( + endpoint=ApiEndpoint( + path=PATH_IMAGE_TO_VIDEO, + method=HttpMethod.POST, + request_model=RunwayImageToVideoRequest, + response_model=RunwayImageToVideoResponse, + ), + request=request, + auth_kwargs=auth_kwargs, + ) - def validate_response(self, response: RunwayImageToVideoResponse) -> bool: - """ - Validate the successful task status response from the Runway API - matches expected format. - """ - if not response.output or len(response.output) == 0: - raise RunwayApiError( - "Runway task succeeded but no video data found in response." 
- ) - return True + initial_response = await initial_operation.execute() - async def get_response( - self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None - ) -> RunwayImageToVideoResponse: - """Poll the task status until it is finished then get the response.""" - return await poll_until_finished( - auth_kwargs, - ApiEndpoint( - path=f"{PATH_GET_TASK_STATUS}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=TaskStatusResponse, - ), - estimated_duration=AVERAGE_DURATION_FLF_SECONDS, - node_id=node_id, + final_response = await get_response(initial_response.id, auth_kwargs, node_id, estimated_duration) + if not final_response.output: + raise RunwayApiError("Runway task succeeded but no video data found in response.") + + video_url = get_video_url_from_task_status(final_response) + return await download_url_to_video_output(video_url) + + +class RunwayImageToVideoNodeGen3a(comfy_io.ComfyNode): + + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="RunwayImageToVideoNodeGen3a", + display_name="Runway Image to Video (Gen3a Turbo)", + category="api node/video/Runway", + description="Generate a video from a single starting frame using Gen3a Turbo model. " + "Before diving in, review these best practices to ensure that " + "your input selections will set your generation up for success: " + "https://help.runwayml.com/hc/en-us/articles/33927968552339-Creating-with-Act-One-on-Gen-3-Alpha-and-Turbo.", + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Text prompt for the generation", + ), + comfy_io.Image.Input( + "start_frame", + tooltip="Start frame to be used for the video", + ), + comfy_io.Combo.Input( + "duration", + options=[model.value for model in Duration], + ), + comfy_io.Combo.Input( + "ratio", + options=[model.value for model in RunwayGen3aAspectRatio], + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=4294967295, + step=1, + control_after_generate=True, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Random seed for generation", + ), + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, ) - async def generate_video( - self, - request: RunwayImageToVideoRequest, - auth_kwargs: dict[str, str], - node_id: Optional[str] = None, - ) -> tuple[VideoFromFile]: - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_IMAGE_TO_VIDEO, - method=HttpMethod.POST, - request_model=RunwayImageToVideoRequest, - response_model=RunwayImageToVideoResponse, - ), - request=request, + @classmethod + async def execute( + cls, + prompt: str, + start_frame: torch.Tensor, + duration: str, + ratio: str, + seed: int, + ) -> comfy_io.NodeOutput: + validate_string(prompt, min_length=1) + validate_image_dimensions(start_frame, max_width=7999, max_height=7999) + validate_image_aspect_ratio(start_frame, min_aspect_ratio=0.5, max_aspect_ratio=2.0) + + auth_kwargs = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + + download_urls = await upload_images_to_comfyapi( + start_frame, + max_images=1, + mime_type="image/png", auth_kwargs=auth_kwargs, ) - initial_response = await initial_operation.execute() - self.validate_task_created(initial_response) - task_id = initial_response.id - - final_response = await self.get_response(task_id, auth_kwargs, node_id) - 
self.validate_response(final_response) - - video_url = get_video_url_from_task_status(final_response) - return (await download_url_to_video_output(video_url),) + return comfy_io.NodeOutput( + await generate_video( + RunwayImageToVideoRequest( + promptText=prompt, + seed=seed, + model=Model("gen3a_turbo"), + duration=Duration(duration), + ratio=AspectRatio(ratio), + promptImage=RunwayPromptImageObject( + root=[ + RunwayPromptImageDetailedObject( + uri=str(download_urls[0]), position="first" + ) + ] + ), + ), + auth_kwargs=auth_kwargs, + node_id=cls.hidden.unique_id, + ) + ) -class RunwayImageToVideoNodeGen3a(RunwayVideoGenNode): - """Runway Image to Video Node using Gen3a Turbo model.""" - - DESCRIPTION = "Generate a video from a single starting frame using Gen3a Turbo model. Before diving in, review these best practices to ensure that your input selections will set your generation up for success: https://help.runwayml.com/hc/en-us/articles/33927968552339-Creating-with-Act-One-on-Gen-3-Alpha-and-Turbo." +class RunwayImageToVideoNodeGen4(comfy_io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": model_field_to_node_input( - IO.STRING, RunwayImageToVideoRequest, "promptText", multiline=True + def define_schema(cls): + return comfy_io.Schema( + node_id="RunwayImageToVideoNodeGen4", + display_name="Runway Image to Video (Gen4 Turbo)", + category="api node/video/Runway", + description="Generate a video from a single starting frame using Gen4 Turbo model. " + "Before diving in, review these best practices to ensure that " + "your input selections will set your generation up for success: " + "https://help.runwayml.com/hc/en-us/articles/37327109429011-Creating-with-Gen-4-Video.", + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Text prompt for the generation", ), - "start_frame": ( - IO.IMAGE, - {"tooltip": "Start frame to be used for the video"}, + comfy_io.Image.Input( + "start_frame", + tooltip="Start frame to be used for the video", ), - "duration": model_field_to_node_input( - IO.COMBO, RunwayImageToVideoRequest, "duration", enum_type=Duration + comfy_io.Combo.Input( + "duration", + options=[model.value for model in Duration], ), - "ratio": model_field_to_node_input( - IO.COMBO, - RunwayImageToVideoRequest, + comfy_io.Combo.Input( "ratio", - enum_type=RunwayGen3aAspectRatio, + options=[model.value for model in RunwayGen4TurboAspectRatio], ), - "seed": model_field_to_node_input( - IO.INT, - RunwayImageToVideoRequest, + comfy_io.Int.Input( "seed", + default=0, + min=0, + max=4294967295, + step=1, control_after_generate=True, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Random seed for generation", ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) - async def api_call( - self, + @classmethod + async def execute( + cls, prompt: str, start_frame: torch.Tensor, duration: str, ratio: str, seed: int, - unique_id: Optional[str] = None, - **kwargs, - ) -> tuple[VideoFromFile]: - # Validate inputs + ) -> comfy_io.NodeOutput: validate_string(prompt, min_length=1) - validate_input_image(start_frame) + validate_image_dimensions(start_frame, max_width=7999, max_height=7999) + validate_image_aspect_ratio(start_frame, min_aspect_ratio=0.5, 
max_aspect_ratio=2.0) + + auth_kwargs = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } - # Upload image download_urls = await upload_images_to_comfyapi( start_frame, max_images=1, mime_type="image/png", - auth_kwargs=kwargs, + auth_kwargs=auth_kwargs, ) - if len(download_urls) != 1: - raise RunwayApiError("Failed to upload one or more images to comfy api.") - return await self.generate_video( - RunwayImageToVideoRequest( - promptText=prompt, - seed=seed, - model=Model("gen3a_turbo"), - duration=Duration(duration), - ratio=AspectRatio(ratio), - promptImage=RunwayPromptImageObject( - root=[ - RunwayPromptImageDetailedObject( - uri=str(download_urls[0]), position="first" - ) - ] + return comfy_io.NodeOutput( + await generate_video( + RunwayImageToVideoRequest( + promptText=prompt, + seed=seed, + model=Model("gen4_turbo"), + duration=Duration(duration), + ratio=AspectRatio(ratio), + promptImage=RunwayPromptImageObject( + root=[ + RunwayPromptImageDetailedObject( + uri=str(download_urls[0]), position="first" + ) + ] + ), ), - ), - auth_kwargs=kwargs, - node_id=unique_id, + auth_kwargs=auth_kwargs, + node_id=cls.hidden.unique_id, + estimated_duration=AVERAGE_DURATION_FLF_SECONDS, + ) ) -class RunwayImageToVideoNodeGen4(RunwayVideoGenNode): - """Runway Image to Video Node using Gen4 Turbo model.""" - - DESCRIPTION = "Generate a video from a single starting frame using Gen4 Turbo model. Before diving in, review these best practices to ensure that your input selections will set your generation up for success: https://help.runwayml.com/hc/en-us/articles/37327109429011-Creating-with-Gen-4-Video." +class RunwayFirstLastFrameNode(comfy_io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": model_field_to_node_input( - IO.STRING, RunwayImageToVideoRequest, "promptText", multiline=True + def define_schema(cls): + return comfy_io.Schema( + node_id="RunwayFirstLastFrameNode", + display_name="Runway First-Last-Frame to Video", + category="api node/video/Runway", + description="Upload first and last keyframes, draft a prompt, and generate a video. " + "More complex transitions, such as cases where the Last frame is completely different " + "from the First frame, may benefit from the longer 10s duration. " + "This would give the generation more time to smoothly transition between the two inputs. " + "Before diving in, review these best practices to ensure that your input selections " + "will set your generation up for success: " + "https://help.runwayml.com/hc/en-us/articles/34170748696595-Creating-with-Keyframes-on-Gen-3.", + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Text prompt for the generation", ), - "start_frame": ( - IO.IMAGE, - {"tooltip": "Start frame to be used for the video"}, + comfy_io.Image.Input( + "start_frame", + tooltip="Start frame to be used for the video", ), - "duration": model_field_to_node_input( - IO.COMBO, RunwayImageToVideoRequest, "duration", enum_type=Duration + comfy_io.Image.Input( + "end_frame", + tooltip="End frame to be used for the video. 
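# Editor's note: the execute() methods above gate inputs with
# validate_image_dimensions(..., max_width=7999, max_height=7999) and
# validate_image_aspect_ratio(..., min_aspect_ratio=0.5, max_aspect_ratio=2.0).
# The checks below are only a hedged sketch of what such validators do, assuming
# ComfyUI IMAGE tensors shaped [batch, height, width, channels]; the real helpers
# in the API-node utilities may differ in details and error types.
import torch

def check_dimensions(image: torch.Tensor, max_width: int, max_height: int) -> None:
    height, width = image.shape[1], image.shape[2]
    if width > max_width or height > max_height:
        raise ValueError(f"image is {width}x{height}, limit is {max_width}x{max_height}")

def check_aspect_ratio(image: torch.Tensor, min_aspect_ratio: float, max_aspect_ratio: float) -> None:
    height, width = image.shape[1], image.shape[2]
    ratio = width / height
    if not (min_aspect_ratio <= ratio <= max_aspect_ratio):
        raise ValueError(f"aspect ratio {ratio:.3f} outside [{min_aspect_ratio}, {max_aspect_ratio}]")

img = torch.zeros(1, 512, 1024, 3)  # 2:1 aspect ratio, within both limits
check_dimensions(img, max_width=7999, max_height=7999)
check_aspect_ratio(img, min_aspect_ratio=0.5, max_aspect_ratio=2.0)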
Supported for gen3a_turbo only.", ), - "ratio": model_field_to_node_input( - IO.COMBO, - RunwayImageToVideoRequest, + comfy_io.Combo.Input( + "duration", + options=[model.value for model in Duration], + ), + comfy_io.Combo.Input( "ratio", - enum_type=RunwayGen4TurboAspectRatio, + options=[model.value for model in RunwayGen3aAspectRatio], ), - "seed": model_field_to_node_input( - IO.INT, - RunwayImageToVideoRequest, + comfy_io.Int.Input( "seed", + default=0, + min=0, + max=4294967295, + step=1, control_after_generate=True, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Random seed for generation", ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - async def api_call( - self, - prompt: str, - start_frame: torch.Tensor, - duration: str, - ratio: str, - seed: int, - unique_id: Optional[str] = None, - **kwargs, - ) -> tuple[VideoFromFile]: - # Validate inputs - validate_string(prompt, min_length=1) - validate_input_image(start_frame) - - # Upload image - download_urls = await upload_images_to_comfyapi( - start_frame, - max_images=1, - mime_type="image/png", - auth_kwargs=kwargs, - ) - if len(download_urls) != 1: - raise RunwayApiError("Failed to upload one or more images to comfy api.") - - return await self.generate_video( - RunwayImageToVideoRequest( - promptText=prompt, - seed=seed, - model=Model("gen4_turbo"), - duration=Duration(duration), - ratio=AspectRatio(ratio), - promptImage=RunwayPromptImageObject( - root=[ - RunwayPromptImageDetailedObject( - uri=str(download_urls[0]), position="first" - ) - ] - ), - ), - auth_kwargs=kwargs, - node_id=unique_id, - ) - - -class RunwayFirstLastFrameNode(RunwayVideoGenNode): - """Runway First-Last Frame Node.""" - - DESCRIPTION = "Upload first and last keyframes, draft a prompt, and generate a video. More complex transitions, such as cases where the Last frame is completely different from the First frame, may benefit from the longer 10s duration. This would give the generation more time to smoothly transition between the two inputs. Before diving in, review these best practices to ensure that your input selections will set your generation up for success: https://help.runwayml.com/hc/en-us/articles/34170748696595-Creating-with-Keyframes-on-Gen-3." - - async def get_response( - self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None - ) -> RunwayImageToVideoResponse: - return await poll_until_finished( - auth_kwargs, - ApiEndpoint( - path=f"{PATH_GET_TASK_STATUS}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=TaskStatusResponse, - ), - estimated_duration=AVERAGE_DURATION_FLF_SECONDS, - node_id=node_id, + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": model_field_to_node_input( - IO.STRING, RunwayImageToVideoRequest, "promptText", multiline=True - ), - "start_frame": ( - IO.IMAGE, - {"tooltip": "Start frame to be used for the video"}, - ), - "end_frame": ( - IO.IMAGE, - { - "tooltip": "End frame to be used for the video. Supported for gen3a_turbo only." 
- }, - ), - "duration": model_field_to_node_input( - IO.COMBO, RunwayImageToVideoRequest, "duration", enum_type=Duration - ), - "ratio": model_field_to_node_input( - IO.COMBO, - RunwayImageToVideoRequest, - "ratio", - enum_type=RunwayGen3aAspectRatio, - ), - "seed": model_field_to_node_input( - IO.INT, - RunwayImageToVideoRequest, - "seed", - control_after_generate=True, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "unique_id": "UNIQUE_ID", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, - } - - async def api_call( - self, + async def execute( + cls, prompt: str, start_frame: torch.Tensor, end_frame: torch.Tensor, duration: str, ratio: str, seed: int, - unique_id: Optional[str] = None, - **kwargs, - ) -> tuple[VideoFromFile]: - # Validate inputs + ) -> comfy_io.NodeOutput: validate_string(prompt, min_length=1) - validate_input_image(start_frame) - validate_input_image(end_frame) + validate_image_dimensions(start_frame, max_width=7999, max_height=7999) + validate_image_dimensions(end_frame, max_width=7999, max_height=7999) + validate_image_aspect_ratio(start_frame, min_aspect_ratio=0.5, max_aspect_ratio=2.0) + validate_image_aspect_ratio(end_frame, min_aspect_ratio=0.5, max_aspect_ratio=2.0) + + auth_kwargs = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } - # Upload images stacked_input_images = image_tensor_pair_to_batch(start_frame, end_frame) download_urls = await upload_images_to_comfyapi( stacked_input_images, max_images=2, mime_type="image/png", - auth_kwargs=kwargs, + auth_kwargs=auth_kwargs, ) if len(download_urls) != 2: raise RunwayApiError("Failed to upload one or more images to comfy api.") - return await self.generate_video( - RunwayImageToVideoRequest( - promptText=prompt, - seed=seed, - model=Model("gen3a_turbo"), - duration=Duration(duration), - ratio=AspectRatio(ratio), - promptImage=RunwayPromptImageObject( - root=[ - RunwayPromptImageDetailedObject( - uri=str(download_urls[0]), position="first" - ), - RunwayPromptImageDetailedObject( - uri=str(download_urls[1]), position="last" - ), - ] + return comfy_io.NodeOutput( + await generate_video( + RunwayImageToVideoRequest( + promptText=prompt, + seed=seed, + model=Model("gen3a_turbo"), + duration=Duration(duration), + ratio=AspectRatio(ratio), + promptImage=RunwayPromptImageObject( + root=[ + RunwayPromptImageDetailedObject( + uri=str(download_urls[0]), position="first" + ), + RunwayPromptImageDetailedObject( + uri=str(download_urls[1]), position="last" + ), + ] + ), ), - ), - auth_kwargs=kwargs, - node_id=unique_id, + auth_kwargs=auth_kwargs, + node_id=cls.hidden.unique_id, + estimated_duration=AVERAGE_DURATION_FLF_SECONDS, + ) ) -class RunwayTextToImageNode(ComfyNodeABC): - """Runway Text to Image Node.""" - - RETURN_TYPES = ("IMAGE",) - FUNCTION = "api_call" - CATEGORY = "api node/image/Runway" - API_NODE = True - DESCRIPTION = "Generate an image from a text prompt using Runway's Gen 4 model. You can also include reference images to guide the generation." +class RunwayTextToImageNode(comfy_io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": model_field_to_node_input( - IO.STRING, RunwayTextToImageRequest, "promptText", multiline=True + def define_schema(cls): + return comfy_io.Schema( + node_id="RunwayTextToImageNode", + display_name="Runway Text to Image", + category="api node/image/Runway", + description="Generate an image from a text prompt using Runway's Gen 4 model. 
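# Editor's note: image_tensor_pair_to_batch() above turns the start and end frames into a
# single two-image batch before upload. The function below is only a hedged sketch of that
# idea for [B, H, W, C] tensors; the real helper may handle size matching differently, so
# treat it purely as an illustration.
import torch
import torch.nn.functional as F

def pair_to_batch(first: torch.Tensor, last: torch.Tensor) -> torch.Tensor:
    if first.shape[1:3] != last.shape[1:3]:
        # Bring the last frame to the first frame's spatial size before concatenating.
        last = F.interpolate(last.movedim(-1, 1), size=first.shape[1:3],
                             mode="bilinear").movedim(1, -1)
    return torch.cat([first, last], dim=0)

print(pair_to_batch(torch.rand(1, 512, 768, 3), torch.rand(1, 256, 384, 3)).shape)
# torch.Size([2, 512, 768, 3])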
" + "You can also include reference image to guide the generation.", + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Text prompt for the generation", ), - "ratio": model_field_to_node_input( - IO.COMBO, - RunwayTextToImageRequest, + comfy_io.Combo.Input( "ratio", - enum_type=RunwayTextToImageAspectRatioEnum, + options=[model.value for model in RunwayTextToImageAspectRatioEnum], ), - }, - "optional": { - "reference_image": ( - IO.IMAGE, - {"tooltip": "Optional reference image to guide the generation"}, - ) - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - def validate_task_created(self, response: RunwayTextToImageResponse) -> bool: - """ - Validate the task creation response from the Runway API matches - expected format. - """ - if not bool(response.id): - raise RunwayApiError("Invalid initial response from Runway API.") - return True - - def validate_response(self, response: TaskStatusResponse) -> bool: - """ - Validate the successful task status response from the Runway API - matches expected format. - """ - if not response.output or len(response.output) == 0: - raise RunwayApiError( - "Runway task succeeded but no image data found in response." - ) - return True - - async def get_response( - self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None - ) -> TaskStatusResponse: - """Poll the task status until it is finished then get the response.""" - return await poll_until_finished( - auth_kwargs, - ApiEndpoint( - path=f"{PATH_GET_TASK_STATUS}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=TaskStatusResponse, - ), - estimated_duration=AVERAGE_DURATION_T2I_SECONDS, - node_id=node_id, + comfy_io.Image.Input( + "reference_image", + tooltip="Optional reference image to guide the generation", + optional=True, + ), + ], + outputs=[ + comfy_io.Image.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, ) - async def api_call( - self, + @classmethod + async def execute( + cls, prompt: str, ratio: str, reference_image: Optional[torch.Tensor] = None, - unique_id: Optional[str] = None, - **kwargs, - ) -> tuple[torch.Tensor]: - # Validate inputs + ) -> comfy_io.NodeOutput: validate_string(prompt, min_length=1) + auth_kwargs = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + # Prepare reference images if provided reference_images = None if reference_image is not None: - validate_input_image(reference_image) + validate_image_dimensions(reference_image, max_width=7999, max_height=7999) + validate_image_aspect_ratio(reference_image, min_aspect_ratio=0.5, max_aspect_ratio=2.0) download_urls = await upload_images_to_comfyapi( reference_image, max_images=1, mime_type="image/png", - auth_kwargs=kwargs, + auth_kwargs=auth_kwargs, ) - if len(download_urls) != 1: - raise RunwayApiError("Failed to upload reference image to comfy api.") - reference_images = [ReferenceImage(uri=str(download_urls[0]))] - # Create request request = RunwayTextToImageRequest( promptText=prompt, model=Model4.gen4_image, @@ -593,7 +565,6 @@ class RunwayTextToImageNode(ComfyNodeABC): referenceImages=reference_images, ) - # Execute initial request initial_operation = SynchronousOperation( endpoint=ApiEndpoint( path=PATH_TEXT_TO_IMAGE, @@ -602,34 +573,33 @@ class 
RunwayTextToImageNode(ComfyNodeABC): response_model=RunwayTextToImageResponse, ), request=request, - auth_kwargs=kwargs, + auth_kwargs=auth_kwargs, ) initial_response = await initial_operation.execute() - self.validate_task_created(initial_response) - task_id = initial_response.id # Poll for completion - final_response = await self.get_response( - task_id, auth_kwargs=kwargs, node_id=unique_id + final_response = await get_response( + initial_response.id, + auth_kwargs=auth_kwargs, + node_id=cls.hidden.unique_id, + estimated_duration=AVERAGE_DURATION_T2I_SECONDS, ) - self.validate_response(final_response) + if not final_response.output: + raise RunwayApiError("Runway task succeeded but no image data found in response.") - # Download and return image - image_url = get_image_url_from_task_status(final_response) - return (await download_url_to_image_tensor(image_url),) + return comfy_io.NodeOutput(await download_url_to_image_tensor(get_image_url_from_task_status(final_response))) -NODE_CLASS_MAPPINGS = { - "RunwayFirstLastFrameNode": RunwayFirstLastFrameNode, - "RunwayImageToVideoNodeGen3a": RunwayImageToVideoNodeGen3a, - "RunwayImageToVideoNodeGen4": RunwayImageToVideoNodeGen4, - "RunwayTextToImageNode": RunwayTextToImageNode, -} +class RunwayExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + return [ + RunwayFirstLastFrameNode, + RunwayImageToVideoNodeGen3a, + RunwayImageToVideoNodeGen4, + RunwayTextToImageNode, + ] -NODE_DISPLAY_NAME_MAPPINGS = { - "RunwayFirstLastFrameNode": "Runway First-Last-Frame to Video", - "RunwayImageToVideoNodeGen3a": "Runway Image to Video (Gen3a Turbo)", - "RunwayImageToVideoNodeGen4": "Runway Image to Video (Gen4 Turbo)", - "RunwayTextToImageNode": "Runway Text to Image", -} +async def comfy_entrypoint() -> RunwayExtension: + return RunwayExtension() From 4368d8f87f580f526e8b2bc43bf0ac88e2b67033 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 3 Sep 2025 15:43:29 -0700 Subject: [PATCH 234/243] Update comment in api example. (#9708) --- script_examples/basic_api_example.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/script_examples/basic_api_example.py b/script_examples/basic_api_example.py index 9128420c4..7e20cc2c1 100644 --- a/script_examples/basic_api_example.py +++ b/script_examples/basic_api_example.py @@ -3,11 +3,7 @@ from urllib import request #This is the ComfyUI api prompt format. -#If you want it for a specific workflow you can "enable dev mode options" -#in the settings of the UI (gear beside the "Queue Size: ") this will enable -#a button on the UI to save workflows in api format. - -#keep in mind ComfyUI is pre alpha software so this format will change a bit. +#If you want it for a specific workflow you can "File -> Export (API)" in the interface. 
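# Editor's note: once a workflow is exported in API format ("File -> Export (API)"), the
# JSON graph is queued by POSTing it to the server's /prompt endpoint, which is what the
# rest of this example script does. Condensed sketch, assuming the default host and port:
import json
from urllib import request

def queue_prompt(prompt: dict, server: str = "127.0.0.1:8188") -> None:
    data = json.dumps({"prompt": prompt}).encode("utf-8")
    request.urlopen(request.Request(f"http://{server}/prompt", data=data))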
#this is the one for the default workflow prompt_text = """ From f48d05a2d17fe1a69e08fbabfb080e3779b36225 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Thu, 4 Sep 2025 04:21:38 +0300 Subject: [PATCH 235/243] convert AlignYourStepsScheduler node to V3 schema (#9226) --- comfy_extras/nodes_align_your_steps.py | 50 +++++++++++++++++--------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/comfy_extras/nodes_align_your_steps.py b/comfy_extras/nodes_align_your_steps.py index 8d856d0e8..edd5dadd4 100644 --- a/comfy_extras/nodes_align_your_steps.py +++ b/comfy_extras/nodes_align_your_steps.py @@ -1,6 +1,10 @@ #from: https://research.nvidia.com/labs/toronto-ai/AlignYourSteps/howto.html import numpy as np import torch +from typing_extensions import override + +from comfy_api.latest import ComfyExtension, io + def loglinear_interp(t_steps, num_steps): """ @@ -19,25 +23,30 @@ NOISE_LEVELS = {"SD1": [14.6146412293, 6.4745760956, 3.8636745985, 2.694615152 "SDXL":[14.6146412293, 6.3184485287, 3.7681790315, 2.1811480769, 1.3405244945, 0.8620721141, 0.5550693289, 0.3798540708, 0.2332364134, 0.1114188177, 0.0291671582], "SVD": [700.00, 54.5, 15.886, 7.977, 4.248, 1.789, 0.981, 0.403, 0.173, 0.034, 0.002]} -class AlignYourStepsScheduler: +class AlignYourStepsScheduler(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"model_type": (["SD1", "SDXL", "SVD"], ), - "steps": ("INT", {"default": 10, "min": 1, "max": 10000}), - "denoise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}), - } - } - RETURN_TYPES = ("SIGMAS",) - CATEGORY = "sampling/custom_sampling/schedulers" - - FUNCTION = "get_sigmas" + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="AlignYourStepsScheduler", + category="sampling/custom_sampling/schedulers", + inputs=[ + io.Combo.Input("model_type", options=["SD1", "SDXL", "SVD"]), + io.Int.Input("steps", default=10, min=1, max=10000), + io.Float.Input("denoise", default=1.0, min=0.0, max=1.0, step=0.01), + ], + outputs=[io.Sigmas.Output()], + ) def get_sigmas(self, model_type, steps, denoise): + # Deprecated: use the V3 schema's `execute` method instead of this. + return AlignYourStepsScheduler().execute(model_type, steps, denoise).result + + @classmethod + def execute(cls, model_type, steps, denoise) -> io.NodeOutput: total_steps = steps if denoise < 1.0: if denoise <= 0.0: - return (torch.FloatTensor([]),) + return io.NodeOutput(torch.FloatTensor([])) total_steps = round(steps * denoise) sigmas = NOISE_LEVELS[model_type][:] @@ -46,8 +55,15 @@ class AlignYourStepsScheduler: sigmas = sigmas[-(total_steps + 1):] sigmas[-1] = 0 - return (torch.FloatTensor(sigmas), ) + return io.NodeOutput(torch.FloatTensor(sigmas)) -NODE_CLASS_MAPPINGS = { - "AlignYourStepsScheduler": AlignYourStepsScheduler, -} + +class AlignYourStepsExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + AlignYourStepsScheduler, + ] + +async def comfy_entrypoint() -> AlignYourStepsExtension: + return AlignYourStepsExtension() From 72855db715096bc378817b1aaffcf232fdc39659 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 3 Sep 2025 19:20:13 -0700 Subject: [PATCH 236/243] Fix potential rope issue. 
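# Editor's note: a worked example of the two scheduler knobs in the AlignYourStepsScheduler
# patch above, using the SDXL entries of the NOISE_LEVELS table. The resampling helper here
# is a sketch of log-linear interpolation in the spirit of loglinear_interp(), not
# necessarily byte-identical to it.
import numpy as np

SDXL_LEVELS = [14.6146412293, 6.3184485287, 3.7681790315, 2.1811480769, 1.3405244945,
               0.8620721141, 0.5550693289, 0.3798540708, 0.2332364134, 0.1114188177,
               0.0291671582]

def resample_log_linear(levels, num_steps):
    xs = np.linspace(0, 1, len(levels))
    ys = np.log(np.asarray(levels)[::-1])                 # work in log space, increasing order
    new_ys = np.interp(np.linspace(0, 1, num_steps), xs, ys)
    return np.exp(new_ys)[::-1]                           # back to decreasing sigmas

steps, denoise = 20, 0.5
sigmas = list(resample_log_linear(SDXL_LEVELS, steps + 1))  # steps + 1 sigma boundaries
sigmas = sigmas[-(round(steps * denoise) + 1):]             # keep only the tail for partial denoise
sigmas[-1] = 0.0                                            # final sigma is always zero
print(len(sigmas), f"{sigmas[0]:.4f}")                      # 11 0.8621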
(#9710) --- comfy/ldm/audio/dit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/ldm/audio/dit.py b/comfy/ldm/audio/dit.py index 179c5b67e..d0d69bbdc 100644 --- a/comfy/ldm/audio/dit.py +++ b/comfy/ldm/audio/dit.py @@ -632,7 +632,7 @@ class ContinuousTransformer(nn.Module): # Attention layers if self.rotary_pos_emb is not None: - rotary_pos_emb = self.rotary_pos_emb.forward_from_seq_len(x.shape[1], dtype=x.dtype, device=x.device) + rotary_pos_emb = self.rotary_pos_emb.forward_from_seq_len(x.shape[1], dtype=torch.float, device=x.device) else: rotary_pos_emb = None From b71f9bcb7143b8cd4fff627bb91b60739c915d4c Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Thu, 4 Sep 2025 14:14:02 +0800 Subject: [PATCH 237/243] Update template to 0.1.75 (#9711) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4ebe6cc2a..3008a5dc3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.25.11 -comfyui-workflow-templates==0.1.73 +comfyui-workflow-templates==0.1.75 comfyui-embedded-docs==0.2.6 torch torchsde From b0338e930bbc1f9d01f005f224573d5994732932 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Thu, 4 Sep 2025 02:15:57 -0400 Subject: [PATCH 238/243] ComfyUI 0.3.57 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index e8e039373..4cc3c8647 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.56" +__version__ = "0.3.57" diff --git a/pyproject.toml b/pyproject.toml index cfd5d45ef..d75cd04a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.56" +version = "0.3.57" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From a9f1bb10a52ce08a3f21e6fc562554671c85c3d5 Mon Sep 17 00:00:00 2001 From: guill Date: Thu, 4 Sep 2025 16:13:28 -0700 Subject: [PATCH 239/243] Fix progress update crossover between users (#9706) * Fix showing progress from other sessions Because `client_id` was missing from ths `progress_state` message, it was being sent to all connected sessions. This technically meant that if someone had a graph with the same nodes, they would see the progress updates for others. Also added a test to prevent reoccurance and moved the tests around to make CI easier to hook up. 
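# Editor's note on the rope dtype change above: half precision cannot represent every
# integer position once the sequence gets long, which is why forward_from_seq_len is now
# called with dtype=torch.float instead of the model dtype. A tiny demonstration of the
# rounding that motivates it:
import torch

def positions_in(dtype: torch.dtype) -> torch.Tensor:
    # Build the position vector in the given dtype, then upcast for comparison.
    return torch.arange(4096, dtype=torch.float32).to(dtype).float()

err = (positions_in(torch.float32) - positions_in(torch.float16)).abs().max()
print(err)  # tensor(1.) -- fp16 rounds odd positions above 2048 to neighboring even ones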
* Fix CI issues related to timing-sensitive tests --- .github/workflows/test-execution.yml | 30 +++ comfy_execution/progress.py | 3 +- tests/conftest.py | 6 + .../extra_model_paths.yaml | 0 .../test_async_nodes.py | 28 ++- .../test_execution.py | 13 +- tests/execution/test_progress_isolation.py | 233 ++++++++++++++++++ .../testing_nodes/testing-pack/__init__.py | 0 .../testing-pack/api_test_nodes.py | 0 .../testing-pack/async_test_nodes.py | 0 .../testing_nodes/testing-pack/conditions.py | 0 .../testing-pack/flow_control.py | 0 .../testing-pack/specific_tests.py | 0 .../testing_nodes/testing-pack/stubs.py | 0 .../testing_nodes/testing-pack/tools.py | 0 .../testing_nodes/testing-pack/util.py | 0 16 files changed, 295 insertions(+), 18 deletions(-) create mode 100644 .github/workflows/test-execution.yml rename tests/{inference => execution}/extra_model_paths.yaml (100%) rename tests/{inference => execution}/test_async_nodes.py (95%) rename tests/{inference => execution}/test_execution.py (98%) create mode 100644 tests/execution/test_progress_isolation.py rename tests/{inference => execution}/testing_nodes/testing-pack/__init__.py (100%) rename tests/{inference => execution}/testing_nodes/testing-pack/api_test_nodes.py (100%) rename tests/{inference => execution}/testing_nodes/testing-pack/async_test_nodes.py (100%) rename tests/{inference => execution}/testing_nodes/testing-pack/conditions.py (100%) rename tests/{inference => execution}/testing_nodes/testing-pack/flow_control.py (100%) rename tests/{inference => execution}/testing_nodes/testing-pack/specific_tests.py (100%) rename tests/{inference => execution}/testing_nodes/testing-pack/stubs.py (100%) rename tests/{inference => execution}/testing_nodes/testing-pack/tools.py (100%) rename tests/{inference => execution}/testing_nodes/testing-pack/util.py (100%) diff --git a/.github/workflows/test-execution.yml b/.github/workflows/test-execution.yml new file mode 100644 index 000000000..00ef07ebf --- /dev/null +++ b/.github/workflows/test-execution.yml @@ -0,0 +1,30 @@ +name: Execution Tests + +on: + push: + branches: [ main, master ] + pull_request: + branches: [ main, master ] + +jobs: + test: + strategy: + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + runs-on: ${{ matrix.os }} + continue-on-error: true + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.12' + - name: Install requirements + run: | + python -m pip install --upgrade pip + pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu + pip install -r requirements.txt + pip install -r tests-unit/requirements.txt + - name: Run Execution Tests + run: | + python -m pytest tests/execution -v --skip-timing-checks diff --git a/comfy_execution/progress.py b/comfy_execution/progress.py index e8f5ede1e..f951a3350 100644 --- a/comfy_execution/progress.py +++ b/comfy_execution/progress.py @@ -181,8 +181,9 @@ class WebUIProgressHandler(ProgressHandler): } # Send a combined progress_state message with all node states + # Include client_id to ensure message is only sent to the initiating client self.server_instance.send_sync( - "progress_state", {"prompt_id": prompt_id, "nodes": active_nodes} + "progress_state", {"prompt_id": prompt_id, "nodes": active_nodes}, self.server_instance.client_id ) @override diff --git a/tests/conftest.py b/tests/conftest.py index 4e30eb581..290e3a5c0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,6 +6,7 @@ def pytest_addoption(parser): 
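# Editor's note: the one-line change in comfy_execution/progress.py above passes
# self.server_instance.client_id into send_sync so progress_state events reach only the
# session that queued the prompt. The toy dispatcher below illustrates the routing rule
# (broadcast when client_id is None, targeted delivery otherwise); it stands in for
# PromptServer and does not copy its implementation.
from typing import Optional

class ToyDispatcher:
    def __init__(self):
        self.sessions = {}  # client_id -> list of delivered events

    def register(self, client_id: str) -> None:
        self.sessions[client_id] = []

    def send_sync(self, event: str, data: dict, client_id: Optional[str] = None) -> None:
        targets = [client_id] if client_id is not None else list(self.sessions)
        for sid in targets:
            self.sessions[sid].append((event, data))

d = ToyDispatcher()
d.register("client_a"); d.register("client_b")
d.send_sync("progress_state", {"prompt_id": "p1", "nodes": {}}, client_id="client_a")
assert d.sessions["client_b"] == []  # no crossover once client_id is supplied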
parser.addoption('--output_dir', action="store", default='tests/inference/samples', help='Output directory for generated images') parser.addoption("--listen", type=str, default="127.0.0.1", metavar="IP", nargs="?", const="0.0.0.0", help="Specify the IP address to listen on (default: 127.0.0.1). If --listen is provided without an argument, it defaults to 0.0.0.0. (listens on all)") parser.addoption("--port", type=int, default=8188, help="Set the listen port.") + parser.addoption("--skip-timing-checks", action="store_true", default=False, help="Skip timing-related assertions in tests (useful for CI environments with variable performance)") # This initializes args at the beginning of the test session @pytest.fixture(scope="session", autouse=True) @@ -19,6 +20,11 @@ def args_pytest(pytestconfig): return args +@pytest.fixture(scope="session") +def skip_timing_checks(pytestconfig): + """Fixture that returns whether timing checks should be skipped.""" + return pytestconfig.getoption("--skip-timing-checks") + def pytest_collection_modifyitems(items): # Modifies items so tests run in the correct order diff --git a/tests/inference/extra_model_paths.yaml b/tests/execution/extra_model_paths.yaml similarity index 100% rename from tests/inference/extra_model_paths.yaml rename to tests/execution/extra_model_paths.yaml diff --git a/tests/inference/test_async_nodes.py b/tests/execution/test_async_nodes.py similarity index 95% rename from tests/inference/test_async_nodes.py rename to tests/execution/test_async_nodes.py index f029953dd..c771b4b36 100644 --- a/tests/inference/test_async_nodes.py +++ b/tests/execution/test_async_nodes.py @@ -7,7 +7,7 @@ import subprocess from pytest import fixture from comfy_execution.graph_utils import GraphBuilder -from tests.inference.test_execution import ComfyClient, run_warmup +from tests.execution.test_execution import ComfyClient, run_warmup @pytest.mark.execution @@ -23,7 +23,7 @@ class TestAsyncNodes: '--output-directory', args_pytest["output_dir"], '--listen', args_pytest["listen"], '--port', str(args_pytest["port"]), - '--extra-model-paths-config', 'tests/inference/extra_model_paths.yaml', + '--extra-model-paths-config', 'tests/execution/extra_model_paths.yaml', '--cpu', ] use_lru, lru_size = request.param @@ -81,7 +81,7 @@ class TestAsyncNodes: assert len(result_images) == 1, "Should have 1 image" assert np.array(result_images[0]).min() == 0 and np.array(result_images[0]).max() == 0, "Image should be black" - def test_multiple_async_parallel_execution(self, client: ComfyClient, builder: GraphBuilder): + def test_multiple_async_parallel_execution(self, client: ComfyClient, builder: GraphBuilder, skip_timing_checks): """Test that multiple async nodes execute in parallel.""" # Warmup execution to ensure server is fully initialized run_warmup(client) @@ -104,7 +104,8 @@ class TestAsyncNodes: elapsed_time = time.time() - start_time # Should take ~0.5s (max duration) not 1.2s (sum of durations) - assert elapsed_time < 0.8, f"Parallel execution took {elapsed_time}s, expected < 0.8s" + if not skip_timing_checks: + assert elapsed_time < 0.8, f"Parallel execution took {elapsed_time}s, expected < 0.8s" # Verify all nodes executed assert result.did_run(sleep1) and result.did_run(sleep2) and result.did_run(sleep3) @@ -150,7 +151,7 @@ class TestAsyncNodes: with pytest.raises(urllib.error.HTTPError): client.run(g) - def test_async_lazy_evaluation(self, client: ComfyClient, builder: GraphBuilder): + def test_async_lazy_evaluation(self, client: ComfyClient, builder: GraphBuilder, 
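# Editor's note: condensed view of how the new flag flows from the command line into the
# tests -- conftest registers --skip-timing-checks and exposes it as a session fixture, and
# timing asserts are wrapped in `if not skip_timing_checks`. Standalone sketch (needs
# pytest to actually run):
import pytest

def pytest_addoption(parser):
    parser.addoption("--skip-timing-checks", action="store_true", default=False,
                     help="Skip timing-related assertions (useful on variable-speed CI)")

@pytest.fixture(scope="session")
def skip_timing_checks(pytestconfig):
    return pytestconfig.getoption("--skip-timing-checks")

def test_fast_enough(skip_timing_checks):
    elapsed = 0.42  # stand-in for a measured duration
    if not skip_timing_checks:
        assert elapsed < 0.8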
skip_timing_checks): """Test async nodes with lazy evaluation.""" # Warmup execution to ensure server is fully initialized run_warmup(client, prefix="warmup_lazy") @@ -173,7 +174,8 @@ class TestAsyncNodes: elapsed_time = time.time() - start_time # Should only execute sleep1, not sleep2 - assert elapsed_time < 0.5, f"Should skip sleep2, took {elapsed_time}s" + if not skip_timing_checks: + assert elapsed_time < 0.5, f"Should skip sleep2, took {elapsed_time}s" assert result.did_run(sleep1), "Sleep1 should have executed" assert not result.did_run(sleep2), "Sleep2 should have been skipped" @@ -310,7 +312,7 @@ class TestAsyncNodes: images = result.get_images(output) assert len(images) == 1, "Should have blocked second image" - def test_async_caching_behavior(self, client: ComfyClient, builder: GraphBuilder): + def test_async_caching_behavior(self, client: ComfyClient, builder: GraphBuilder, skip_timing_checks): """Test that async nodes are properly cached.""" # Warmup execution to ensure server is fully initialized run_warmup(client, prefix="warmup_cache") @@ -330,9 +332,10 @@ class TestAsyncNodes: elapsed_time = time.time() - start_time assert not result2.did_run(sleep_node), "Should be cached" - assert elapsed_time < 0.1, f"Cached run took {elapsed_time}s, should be instant" + if not skip_timing_checks: + assert elapsed_time < 0.1, f"Cached run took {elapsed_time}s, should be instant" - def test_async_with_dynamic_prompts(self, client: ComfyClient, builder: GraphBuilder): + def test_async_with_dynamic_prompts(self, client: ComfyClient, builder: GraphBuilder, skip_timing_checks): """Test async nodes within dynamically generated prompts.""" # Warmup execution to ensure server is fully initialized run_warmup(client, prefix="warmup_dynamic") @@ -345,8 +348,8 @@ class TestAsyncNodes: dynamic_async = g.node("TestDynamicAsyncGeneration", image1=image1.out(0), image2=image2.out(0), - num_async_nodes=3, - sleep_duration=0.2) + num_async_nodes=5, + sleep_duration=0.4) g.node("SaveImage", images=dynamic_async.out(0)) start_time = time.time() @@ -354,7 +357,8 @@ class TestAsyncNodes: elapsed_time = time.time() - start_time # Should execute async nodes in parallel within dynamic prompt - assert elapsed_time < 0.5, f"Dynamic async execution took {elapsed_time}s" + if not skip_timing_checks: + assert elapsed_time < 1.0, f"Dynamic async execution took {elapsed_time}s" assert result.did_run(dynamic_async) def test_async_resource_cleanup(self, client: ComfyClient, builder: GraphBuilder): diff --git a/tests/inference/test_execution.py b/tests/execution/test_execution.py similarity index 98% rename from tests/inference/test_execution.py rename to tests/execution/test_execution.py index e7b29302e..8ea05fdd8 100644 --- a/tests/inference/test_execution.py +++ b/tests/execution/test_execution.py @@ -149,7 +149,7 @@ class TestExecution: '--output-directory', args_pytest["output_dir"], '--listen', args_pytest["listen"], '--port', str(args_pytest["port"]), - '--extra-model-paths-config', 'tests/inference/extra_model_paths.yaml', + '--extra-model-paths-config', 'tests/execution/extra_model_paths.yaml', '--cpu', ] use_lru, lru_size = request.param @@ -518,7 +518,7 @@ class TestExecution: assert numpy.array(images[0]).min() == 63 and numpy.array(images[0]).max() == 63, "Image should have value 0.25" assert not result.did_run(test_node), "The execution should have been cached" - def test_parallel_sleep_nodes(self, client: ComfyClient, builder: GraphBuilder): + def test_parallel_sleep_nodes(self, client: ComfyClient, 
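# Editor's note: the "max duration, not the sum" expectations in these parallel tests
# mirror basic cooperative concurrency -- awaiting several sleeps together finishes in
# roughly the longest one. Self-contained illustration, unrelated to the ComfyUI server:
import asyncio, time

async def main():
    start = time.perf_counter()
    await asyncio.gather(asyncio.sleep(0.3), asyncio.sleep(0.4), asyncio.sleep(0.5))
    print(f"elapsed ~ {time.perf_counter() - start:.2f}s")  # about 0.5s, not 1.2s

asyncio.run(main())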
builder: GraphBuilder, skip_timing_checks): # Warmup execution to ensure server is fully initialized run_warmup(client) @@ -541,14 +541,15 @@ class TestExecution: # The test should take around 3.0 seconds (the longest sleep duration) # plus some overhead, but definitely less than the sum of all sleeps (9.0s) - assert elapsed_time < 8.9, f"Parallel execution took {elapsed_time}s, expected less than 8.9s" + if not skip_timing_checks: + assert elapsed_time < 8.9, f"Parallel execution took {elapsed_time}s, expected less than 8.9s" # Verify that all nodes executed assert result.did_run(sleep_node1), "Sleep node 1 should have run" assert result.did_run(sleep_node2), "Sleep node 2 should have run" assert result.did_run(sleep_node3), "Sleep node 3 should have run" - def test_parallel_sleep_expansion(self, client: ComfyClient, builder: GraphBuilder): + def test_parallel_sleep_expansion(self, client: ComfyClient, builder: GraphBuilder, skip_timing_checks): # Warmup execution to ensure server is fully initialized run_warmup(client) @@ -574,7 +575,9 @@ class TestExecution: # Similar to the previous test, expect parallel execution of the sleep nodes # which should complete in less than the sum of all sleeps - assert elapsed_time < 10.0, f"Expansion execution took {elapsed_time}s, expected less than 5.5s" + # Lots of leeway here since Windows CI is slow + if not skip_timing_checks: + assert elapsed_time < 13.0, f"Expansion execution took {elapsed_time}s" # Verify the parallel sleep node executed assert result.did_run(parallel_sleep), "ParallelSleep node should have run" diff --git a/tests/execution/test_progress_isolation.py b/tests/execution/test_progress_isolation.py new file mode 100644 index 000000000..93dc0d41b --- /dev/null +++ b/tests/execution/test_progress_isolation.py @@ -0,0 +1,233 @@ +"""Test that progress updates are properly isolated between WebSocket clients.""" + +import json +import pytest +import time +import threading +import uuid +import websocket +from typing import List, Dict, Any +from comfy_execution.graph_utils import GraphBuilder +from tests.execution.test_execution import ComfyClient + + +class ProgressTracker: + """Tracks progress messages received by a WebSocket client.""" + + def __init__(self, client_id: str): + self.client_id = client_id + self.progress_messages: List[Dict[str, Any]] = [] + self.lock = threading.Lock() + + def add_message(self, message: Dict[str, Any]): + """Thread-safe addition of progress messages.""" + with self.lock: + self.progress_messages.append(message) + + def get_messages_for_prompt(self, prompt_id: str) -> List[Dict[str, Any]]: + """Get all progress messages for a specific prompt_id.""" + with self.lock: + return [ + msg for msg in self.progress_messages + if msg.get('data', {}).get('prompt_id') == prompt_id + ] + + def has_cross_contamination(self, own_prompt_id: str) -> bool: + """Check if this client received progress for other prompts.""" + with self.lock: + for msg in self.progress_messages: + msg_prompt_id = msg.get('data', {}).get('prompt_id') + if msg_prompt_id and msg_prompt_id != own_prompt_id: + return True + return False + + +class IsolatedClient(ComfyClient): + """Extended ComfyClient that tracks all WebSocket messages.""" + + def __init__(self): + super().__init__() + self.progress_tracker = None + self.all_messages: List[Dict[str, Any]] = [] + + def connect(self, listen='127.0.0.1', port=8188, client_id=None): + """Connect with a specific client_id and set up message tracking.""" + if client_id is None: + client_id = 
str(uuid.uuid4()) + super().connect(listen, port, client_id) + self.progress_tracker = ProgressTracker(client_id) + + def listen_for_messages(self, duration: float = 5.0): + """Listen for WebSocket messages for a specified duration.""" + end_time = time.time() + duration + self.ws.settimeout(0.5) # Non-blocking with timeout + + while time.time() < end_time: + try: + out = self.ws.recv() + if isinstance(out, str): + message = json.loads(out) + self.all_messages.append(message) + + # Track progress_state messages + if message.get('type') == 'progress_state': + self.progress_tracker.add_message(message) + except websocket.WebSocketTimeoutException: + continue + except Exception: + # Log error silently in test context + break + + +@pytest.mark.execution +class TestProgressIsolation: + """Test suite for verifying progress update isolation between clients.""" + + @pytest.fixture(scope="class", autouse=True) + def _server(self, args_pytest): + """Start the ComfyUI server for testing.""" + import subprocess + pargs = [ + 'python', 'main.py', + '--output-directory', args_pytest["output_dir"], + '--listen', args_pytest["listen"], + '--port', str(args_pytest["port"]), + '--extra-model-paths-config', 'tests/execution/extra_model_paths.yaml', + '--cpu', + ] + p = subprocess.Popen(pargs) + yield + p.kill() + + def start_client_with_retry(self, listen: str, port: int, client_id: str = None): + """Start client with connection retries.""" + client = IsolatedClient() + # Connect to server (with retries) + n_tries = 5 + for i in range(n_tries): + time.sleep(4) + try: + client.connect(listen, port, client_id) + return client + except ConnectionRefusedError as e: + print(e) # noqa: T201 + print(f"({i+1}/{n_tries}) Retrying...") # noqa: T201 + raise ConnectionRefusedError(f"Failed to connect after {n_tries} attempts") + + def test_progress_isolation_between_clients(self, args_pytest): + """Test that progress updates are isolated between different clients.""" + listen = args_pytest["listen"] + port = args_pytest["port"] + + # Create two separate clients with unique IDs + client_a_id = "client_a_" + str(uuid.uuid4()) + client_b_id = "client_b_" + str(uuid.uuid4()) + + try: + # Connect both clients with retries + client_a = self.start_client_with_retry(listen, port, client_a_id) + client_b = self.start_client_with_retry(listen, port, client_b_id) + + # Create simple workflows for both clients + graph_a = GraphBuilder(prefix="client_a") + image_a = graph_a.node("StubImage", content="BLACK", height=256, width=256, batch_size=1) + graph_a.node("PreviewImage", images=image_a.out(0)) + + graph_b = GraphBuilder(prefix="client_b") + image_b = graph_b.node("StubImage", content="WHITE", height=256, width=256, batch_size=1) + graph_b.node("PreviewImage", images=image_b.out(0)) + + # Submit workflows from both clients + prompt_a = graph_a.finalize() + prompt_b = graph_b.finalize() + + response_a = client_a.queue_prompt(prompt_a) + prompt_id_a = response_a['prompt_id'] + + response_b = client_b.queue_prompt(prompt_b) + prompt_id_b = response_b['prompt_id'] + + # Start threads to listen for messages on both clients + def listen_client_a(): + client_a.listen_for_messages(duration=10.0) + + def listen_client_b(): + client_b.listen_for_messages(duration=10.0) + + thread_a = threading.Thread(target=listen_client_a) + thread_b = threading.Thread(target=listen_client_b) + + thread_a.start() + thread_b.start() + + # Wait for threads to complete + thread_a.join() + thread_b.join() + + # Verify isolation + # Client A should only 
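# Editor's note: the isolation test above drives two listeners on plain threads, joins
# them, then asserts on what each collected. The same shape in miniature, with queues
# standing in for the two WebSocket connections:
import queue, threading, time

def listen(q: queue.Queue, out: list, duration: float = 0.2) -> None:
    end = time.time() + duration
    while time.time() < end:
        try:
            out.append(q.get(timeout=0.05))
        except queue.Empty:
            continue

q_a, q_b, msgs_a, msgs_b = queue.Queue(), queue.Queue(), [], []
t_a = threading.Thread(target=listen, args=(q_a, msgs_a))
t_b = threading.Thread(target=listen, args=(q_b, msgs_b))
t_a.start(); t_b.start()
q_a.put({"prompt_id": "prompt_a"})   # only client A's connection receives anything
t_a.join(); t_b.join()
assert msgs_b == [] and msgs_a[0]["prompt_id"] == "prompt_a"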
receive progress for prompt_id_a + assert not client_a.progress_tracker.has_cross_contamination(prompt_id_a), \ + f"Client A received progress updates for other clients' workflows. " \ + f"Expected only {prompt_id_a}, but got messages for multiple prompts." + + # Client B should only receive progress for prompt_id_b + assert not client_b.progress_tracker.has_cross_contamination(prompt_id_b), \ + f"Client B received progress updates for other clients' workflows. " \ + f"Expected only {prompt_id_b}, but got messages for multiple prompts." + + # Verify each client received their own progress updates + client_a_messages = client_a.progress_tracker.get_messages_for_prompt(prompt_id_a) + client_b_messages = client_b.progress_tracker.get_messages_for_prompt(prompt_id_b) + + assert len(client_a_messages) > 0, \ + "Client A did not receive any progress updates for its own workflow" + assert len(client_b_messages) > 0, \ + "Client B did not receive any progress updates for its own workflow" + + # Ensure no cross-contamination + client_a_other = client_a.progress_tracker.get_messages_for_prompt(prompt_id_b) + client_b_other = client_b.progress_tracker.get_messages_for_prompt(prompt_id_a) + + assert len(client_a_other) == 0, \ + f"Client A incorrectly received {len(client_a_other)} progress updates for Client B's workflow" + assert len(client_b_other) == 0, \ + f"Client B incorrectly received {len(client_b_other)} progress updates for Client A's workflow" + + finally: + # Clean up connections + if hasattr(client_a, 'ws'): + client_a.ws.close() + if hasattr(client_b, 'ws'): + client_b.ws.close() + + def test_progress_with_missing_client_id(self, args_pytest): + """Test that progress updates handle missing client_id gracefully.""" + listen = args_pytest["listen"] + port = args_pytest["port"] + + try: + # Connect client with retries + client = self.start_client_with_retry(listen, port) + + # Create a simple workflow + graph = GraphBuilder(prefix="test_missing_id") + image = graph.node("StubImage", content="BLACK", height=128, width=128, batch_size=1) + graph.node("PreviewImage", images=image.out(0)) + + # Submit workflow + prompt = graph.finalize() + response = client.queue_prompt(prompt) + prompt_id = response['prompt_id'] + + # Listen for messages + client.listen_for_messages(duration=5.0) + + # Should still receive progress updates for own workflow + messages = client.progress_tracker.get_messages_for_prompt(prompt_id) + assert len(messages) > 0, \ + "Client did not receive progress updates even though it initiated the workflow" + + finally: + if hasattr(client, 'ws'): + client.ws.close() + diff --git a/tests/inference/testing_nodes/testing-pack/__init__.py b/tests/execution/testing_nodes/testing-pack/__init__.py similarity index 100% rename from tests/inference/testing_nodes/testing-pack/__init__.py rename to tests/execution/testing_nodes/testing-pack/__init__.py diff --git a/tests/inference/testing_nodes/testing-pack/api_test_nodes.py b/tests/execution/testing_nodes/testing-pack/api_test_nodes.py similarity index 100% rename from tests/inference/testing_nodes/testing-pack/api_test_nodes.py rename to tests/execution/testing_nodes/testing-pack/api_test_nodes.py diff --git a/tests/inference/testing_nodes/testing-pack/async_test_nodes.py b/tests/execution/testing_nodes/testing-pack/async_test_nodes.py similarity index 100% rename from tests/inference/testing_nodes/testing-pack/async_test_nodes.py rename to tests/execution/testing_nodes/testing-pack/async_test_nodes.py diff --git 
a/tests/inference/testing_nodes/testing-pack/conditions.py b/tests/execution/testing_nodes/testing-pack/conditions.py similarity index 100% rename from tests/inference/testing_nodes/testing-pack/conditions.py rename to tests/execution/testing_nodes/testing-pack/conditions.py diff --git a/tests/inference/testing_nodes/testing-pack/flow_control.py b/tests/execution/testing_nodes/testing-pack/flow_control.py similarity index 100% rename from tests/inference/testing_nodes/testing-pack/flow_control.py rename to tests/execution/testing_nodes/testing-pack/flow_control.py diff --git a/tests/inference/testing_nodes/testing-pack/specific_tests.py b/tests/execution/testing_nodes/testing-pack/specific_tests.py similarity index 100% rename from tests/inference/testing_nodes/testing-pack/specific_tests.py rename to tests/execution/testing_nodes/testing-pack/specific_tests.py diff --git a/tests/inference/testing_nodes/testing-pack/stubs.py b/tests/execution/testing_nodes/testing-pack/stubs.py similarity index 100% rename from tests/inference/testing_nodes/testing-pack/stubs.py rename to tests/execution/testing_nodes/testing-pack/stubs.py diff --git a/tests/inference/testing_nodes/testing-pack/tools.py b/tests/execution/testing_nodes/testing-pack/tools.py similarity index 100% rename from tests/inference/testing_nodes/testing-pack/tools.py rename to tests/execution/testing_nodes/testing-pack/tools.py diff --git a/tests/inference/testing_nodes/testing-pack/util.py b/tests/execution/testing_nodes/testing-pack/util.py similarity index 100% rename from tests/inference/testing_nodes/testing-pack/util.py rename to tests/execution/testing_nodes/testing-pack/util.py From 261421e21899abc8168c71efd8694ade020bcee2 Mon Sep 17 00:00:00 2001 From: "Yousef R. Gamaleldin" <81116377+yousef-rafat@users.noreply.github.com> Date: Fri, 5 Sep 2025 03:36:20 +0300 Subject: [PATCH 240/243] Add Hunyuan 3D 2.1 Support (#8714) --- comfy/clip_vision.py | 231 ++++++++- comfy/image_encoders/dino2.py | 33 +- comfy/image_encoders/dino2_large.json | 22 + comfy/latent_formats.py | 5 + comfy/ldm/hunyuan3d/vae.py | 569 ++++++++++++++++++---- comfy/ldm/hunyuan3dv2_1/hunyuandit.py | 658 ++++++++++++++++++++++++++ comfy/model_base.py | 17 + comfy/model_detection.py | 14 + comfy/sd.py | 49 +- comfy/supported_models.py | 13 +- comfy_extras/nodes_hunyuan3d.py | 24 +- nodes.py | 29 +- requirements.txt | 2 +- 13 files changed, 1537 insertions(+), 129 deletions(-) create mode 100644 comfy/image_encoders/dino2_large.json create mode 100644 comfy/ldm/hunyuan3dv2_1/hunyuandit.py diff --git a/comfy/clip_vision.py b/comfy/clip_vision.py index 2fa410cb7..4bc640e8b 100644 --- a/comfy/clip_vision.py +++ b/comfy/clip_vision.py @@ -17,10 +17,227 @@ class Output: def __setitem__(self, key, item): setattr(self, key, item) -def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711], crop=True): + +def cubic_kernel(x, a: float = -0.75): + absx = x.abs() + absx2 = absx ** 2 + absx3 = absx ** 3 + + w = (a + 2) * absx3 - (a + 3) * absx2 + 1 + w2 = a * absx3 - 5*a * absx2 + 8*a * absx - 4*a + + return torch.where(absx <= 1, w, torch.where(absx < 2, w2, torch.zeros_like(x))) + +def get_indices_weights(in_size, out_size, scale): + # OpenCV-style half-pixel mapping + x = torch.arange(out_size, dtype=torch.float32) + x = (x + 0.5) / scale - 0.5 + + x0 = x.floor().long() + dx = x.unsqueeze(1) - (x0.unsqueeze(1) + torch.arange(-1, 3)) + + weights = cubic_kernel(dx) + weights = weights / weights.sum(dim=1, 
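# Editor's note: quick sanity check on the Keys cubic kernel defined above -- with
# a = -0.75 it evaluates to 1 at offset 0 and to 0 at offsets 1 and 2, so resize_cubic
# interpolates: it passes through the original samples whenever an output coordinate lands
# exactly on an input pixel. The re-statement of the kernel below is only for the check.
import torch

def keys_cubic(x: torch.Tensor, a: float = -0.75) -> torch.Tensor:
    absx = x.abs(); absx2 = absx ** 2; absx3 = absx ** 3
    w = (a + 2) * absx3 - (a + 3) * absx2 + 1
    w2 = a * absx3 - 5 * a * absx2 + 8 * a * absx - 4 * a
    return torch.where(absx <= 1, w, torch.where(absx < 2, w2, torch.zeros_like(x)))

print(keys_cubic(torch.tensor([0.0, 1.0, 2.0])))  # tensor([1., 0., 0.])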
keepdim=True) + + indices = x0.unsqueeze(1) + torch.arange(-1, 3) + indices = indices.clamp(0, in_size - 1) + + return indices, weights + +def resize_cubic_1d(x, out_size, dim): + b, c, h, w = x.shape + in_size = h if dim == 2 else w + scale = out_size / in_size + + indices, weights = get_indices_weights(in_size, out_size, scale) + + if dim == 2: + x = x.permute(0, 1, 3, 2) + x = x.reshape(-1, h) + else: + x = x.reshape(-1, w) + + gathered = x[:, indices] + out = (gathered * weights.unsqueeze(0)).sum(dim=2) + + if dim == 2: + out = out.reshape(b, c, w, out_size).permute(0, 1, 3, 2) + else: + out = out.reshape(b, c, h, out_size) + + return out + +def resize_cubic(img: torch.Tensor, size: tuple) -> torch.Tensor: + """ + Resize image using OpenCV-equivalent INTER_CUBIC interpolation. + Implemented in pure PyTorch + """ + + if img.ndim == 3: + img = img.unsqueeze(0) + + img = img.permute(0, 3, 1, 2) + + out_h, out_w = size + img = resize_cubic_1d(img, out_h, dim=2) + img = resize_cubic_1d(img, out_w, dim=3) + return img + +def resize_area(img: torch.Tensor, size: tuple) -> torch.Tensor: + # vectorized implementation for OpenCV's INTER_AREA using pure PyTorch + original_shape = img.shape + is_hwc = False + + if img.ndim == 3: + if img.shape[0] <= 4: + img = img.unsqueeze(0) + else: + is_hwc = True + img = img.permute(2, 0, 1).unsqueeze(0) + elif img.ndim == 4: + pass + else: + raise ValueError("Expected image with 3 or 4 dims.") + + B, C, H, W = img.shape + out_h, out_w = size + scale_y = H / out_h + scale_x = W / out_w + + device = img.device + + # compute the grid boundries + y_start = torch.arange(out_h, device=device).float() * scale_y + y_end = y_start + scale_y + x_start = torch.arange(out_w, device=device).float() * scale_x + x_end = x_start + scale_x + + # for each output pixel, we will compute the range for it + y_start_int = torch.floor(y_start).long() + y_end_int = torch.ceil(y_end).long() + x_start_int = torch.floor(x_start).long() + x_end_int = torch.ceil(x_end).long() + + # We will build the weighted sums by iterating over contributing input pixels once + output = torch.zeros((B, C, out_h, out_w), dtype=torch.float32, device=device) + area = torch.zeros((out_h, out_w), dtype=torch.float32, device=device) + + max_kernel_h = int(torch.max(y_end_int - y_start_int).item()) + max_kernel_w = int(torch.max(x_end_int - x_start_int).item()) + + for dy in range(max_kernel_h): + for dx in range(max_kernel_w): + # compute the weights for this offset for all output pixels + + y_idx = y_start_int.unsqueeze(1) + dy + x_idx = x_start_int.unsqueeze(0) + dx + + # clamp indices to image boundaries + y_idx_clamped = torch.clamp(y_idx, 0, H - 1) + x_idx_clamped = torch.clamp(x_idx, 0, W - 1) + + # compute weights by broadcasting + y_weight = (torch.min(y_end.unsqueeze(1), y_idx_clamped.float() + 1.0) - torch.max(y_start.unsqueeze(1), y_idx_clamped.float())).clamp(min=0) + x_weight = (torch.min(x_end.unsqueeze(0), x_idx_clamped.float() + 1.0) - torch.max(x_start.unsqueeze(0), x_idx_clamped.float())).clamp(min=0) + + weight = (y_weight * x_weight) + + y_expand = y_idx_clamped.expand(out_h, out_w) + x_expand = x_idx_clamped.expand(out_h, out_w) + + + pixels = img[:, :, y_expand, x_expand] + + # unsqueeze to broadcast + w = weight.unsqueeze(0).unsqueeze(0) + + output += pixels * w + area += weight + + # Normalize by area + output /= area.unsqueeze(0).unsqueeze(0) + + if is_hwc: + return output[0].permute(1, 2, 0) + elif img.shape[0] == 1 and original_shape[0] <= 4: + return output[0] + else: + return 
output + +def recenter(image, border_ratio: float = 0.2): + + if image.shape[-1] == 4: + mask = image[..., 3] + else: + mask = torch.ones_like(image[..., 0:1]) * 255 + image = torch.concatenate([image, mask], axis=-1) + mask = mask[..., 0] + + H, W, C = image.shape + + size = max(H, W) + result = torch.zeros((size, size, C), dtype = torch.uint8) + + # as_tuple to match numpy behaviour + x_coords, y_coords = torch.nonzero(mask, as_tuple=True) + + y_min, y_max = y_coords.min(), y_coords.max() + x_min, x_max = x_coords.min(), x_coords.max() + + h = x_max - x_min + w = y_max - y_min + + if h == 0 or w == 0: + raise ValueError('input image is empty') + + desired_size = int(size * (1 - border_ratio)) + scale = desired_size / max(h, w) + + h2 = int(h * scale) + w2 = int(w * scale) + + x2_min = (size - h2) // 2 + x2_max = x2_min + h2 + + y2_min = (size - w2) // 2 + y2_max = y2_min + w2 + + # note: opencv takes columns first (opposite to pytorch and numpy that take the row first) + result[x2_min:x2_max, y2_min:y2_max] = resize_area(image[x_min:x_max, y_min:y_max], (h2, w2)) + + bg = torch.ones((result.shape[0], result.shape[1], 3), dtype = torch.uint8) * 255 + + mask = result[..., 3:].to(torch.float32) / 255 + result = result[..., :3] * mask + bg * (1 - mask) + + mask = mask * 255 + result = result.clip(0, 255).to(torch.uint8) + mask = mask.clip(0, 255).to(torch.uint8) + + return result + +def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711], + crop=True, value_range = (-1, 1), border_ratio: float = None, recenter_size: int = 512): + + if border_ratio is not None: + + image = (image * 255).clamp(0, 255).to(torch.uint8) + image = [recenter(img, border_ratio = border_ratio) for img in image] + + image = torch.stack(image, dim = 0) + image = resize_cubic(image, size = (recenter_size, recenter_size)) + + image = image / 255 * 2 - 1 + low, high = value_range + + image = (image - low) / (high - low) + image = image.permute(0, 2, 3, 1) + image = image[:, :, :, :3] if image.shape[3] > 3 else image + mean = torch.tensor(mean, device=image.device, dtype=image.dtype) std = torch.tensor(std, device=image.device, dtype=image.dtype) + image = image.movedim(-1, 1) if not (image.shape[2] == size and image.shape[3] == size): if crop: @@ -29,7 +246,7 @@ def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], s else: scale_size = (size, size) - image = torch.nn.functional.interpolate(image, size=scale_size, mode="bicubic", antialias=True) + image = torch.nn.functional.interpolate(image, size=scale_size, mode="bilinear" if border_ratio is not None else "bicubic", antialias=True) h = (image.shape[2] - size)//2 w = (image.shape[3] - size)//2 image = image[:,:,h:h+size,w:w+size] @@ -71,9 +288,9 @@ class ClipVisionModel(): def get_sd(self): return self.model.state_dict() - def encode_image(self, image, crop=True): + def encode_image(self, image, crop=True, border_ratio: float = None): comfy.model_management.load_model_gpu(self.patcher) - pixel_values = clip_preprocess(image.to(self.load_device), size=self.image_size, mean=self.image_mean, std=self.image_std, crop=crop).float() + pixel_values = clip_preprocess(image.to(self.load_device), size=self.image_size, mean=self.image_mean, std=self.image_std, crop=crop, border_ratio=border_ratio).float() out = self.model(pixel_values=pixel_values, intermediate_output='all' if self.return_all_hidden_states else -2) outputs = Output() @@ -136,8 +353,12 @@ def load_clipvision_from_sd(sd, 
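# Editor's note: a cheap way to sanity-check the pure-PyTorch INTER_AREA port above --
# when the scale factor is an integer, area resampling reduces to plain block averaging,
# so a 4x4 -> 2x2 downscale should match avg_pool2d:
import torch
import torch.nn.functional as F

img = torch.arange(16, dtype=torch.float32).reshape(1, 1, 4, 4)
print(F.avg_pool2d(img, kernel_size=2).squeeze())
# tensor([[ 2.5000,  4.5000],
#         [10.5000, 12.5000]])  -- resize_area(img, (2, 2)) should agree with this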
prefix="", convert_keys=False): json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336.json") else: json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json") - elif "embeddings.patch_embeddings.projection.weight" in sd: + + # Dinov2 + elif 'encoder.layer.39.layer_scale2.lambda1' in sd: json_config = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "image_encoders"), "dino2_giant.json") + elif 'encoder.layer.23.layer_scale2.lambda1' in sd: + json_config = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "image_encoders"), "dino2_large.json") else: return None diff --git a/comfy/image_encoders/dino2.py b/comfy/image_encoders/dino2.py index 976f98c65..9b6dace9d 100644 --- a/comfy/image_encoders/dino2.py +++ b/comfy/image_encoders/dino2.py @@ -31,6 +31,20 @@ class LayerScale(torch.nn.Module): def forward(self, x): return x * comfy.model_management.cast_to_device(self.lambda1, x.device, x.dtype) +class Dinov2MLP(torch.nn.Module): + def __init__(self, hidden_size: int, dtype, device, operations): + super().__init__() + + mlp_ratio = 4 + hidden_features = int(hidden_size * mlp_ratio) + self.fc1 = operations.Linear(hidden_size, hidden_features, bias = True, device=device, dtype=dtype) + self.fc2 = operations.Linear(hidden_features, hidden_size, bias = True, device=device, dtype=dtype) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.fc1(hidden_state) + hidden_state = torch.nn.functional.gelu(hidden_state) + hidden_state = self.fc2(hidden_state) + return hidden_state class SwiGLUFFN(torch.nn.Module): def __init__(self, dim, dtype, device, operations): @@ -50,12 +64,15 @@ class SwiGLUFFN(torch.nn.Module): class Dino2Block(torch.nn.Module): - def __init__(self, dim, num_heads, layer_norm_eps, dtype, device, operations): + def __init__(self, dim, num_heads, layer_norm_eps, dtype, device, operations, use_swiglu_ffn): super().__init__() self.attention = Dino2AttentionBlock(dim, num_heads, layer_norm_eps, dtype, device, operations) self.layer_scale1 = LayerScale(dim, dtype, device, operations) self.layer_scale2 = LayerScale(dim, dtype, device, operations) - self.mlp = SwiGLUFFN(dim, dtype, device, operations) + if use_swiglu_ffn: + self.mlp = SwiGLUFFN(dim, dtype, device, operations) + else: + self.mlp = Dinov2MLP(dim, dtype, device, operations) self.norm1 = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device) self.norm2 = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device) @@ -66,9 +83,10 @@ class Dino2Block(torch.nn.Module): class Dino2Encoder(torch.nn.Module): - def __init__(self, dim, num_heads, layer_norm_eps, num_layers, dtype, device, operations): + def __init__(self, dim, num_heads, layer_norm_eps, num_layers, dtype, device, operations, use_swiglu_ffn): super().__init__() - self.layer = torch.nn.ModuleList([Dino2Block(dim, num_heads, layer_norm_eps, dtype, device, operations) for _ in range(num_layers)]) + self.layer = torch.nn.ModuleList([Dino2Block(dim, num_heads, layer_norm_eps, dtype, device, operations, use_swiglu_ffn = use_swiglu_ffn) + for _ in range(num_layers)]) def forward(self, x, intermediate_output=None): optimized_attention = optimized_attention_for_device(x.device, False, small_input=True) @@ -78,8 +96,8 @@ class Dino2Encoder(torch.nn.Module): intermediate_output = len(self.layer) + intermediate_output intermediate = None - for i, l in enumerate(self.layer): - x = 
l(x, optimized_attention) + for i, layer in enumerate(self.layer): + x = layer(x, optimized_attention) if i == intermediate_output: intermediate = x.clone() return x, intermediate @@ -128,9 +146,10 @@ class Dinov2Model(torch.nn.Module): dim = config_dict["hidden_size"] heads = config_dict["num_attention_heads"] layer_norm_eps = config_dict["layer_norm_eps"] + use_swiglu_ffn = config_dict["use_swiglu_ffn"] self.embeddings = Dino2Embeddings(dim, dtype, device, operations) - self.encoder = Dino2Encoder(dim, heads, layer_norm_eps, num_layers, dtype, device, operations) + self.encoder = Dino2Encoder(dim, heads, layer_norm_eps, num_layers, dtype, device, operations, use_swiglu_ffn = use_swiglu_ffn) self.layernorm = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device) def forward(self, pixel_values, attention_mask=None, intermediate_output=None): diff --git a/comfy/image_encoders/dino2_large.json b/comfy/image_encoders/dino2_large.json new file mode 100644 index 000000000..43fbb58ff --- /dev/null +++ b/comfy/image_encoders/dino2_large.json @@ -0,0 +1,22 @@ +{ + "hidden_size": 1024, + "use_mask_token": true, + "patch_size": 14, + "image_size": 518, + "num_channels": 3, + "num_attention_heads": 16, + "initializer_range": 0.02, + "attention_probs_dropout_prob": 0.0, + "hidden_dropout_prob": 0.0, + "hidden_act": "gelu", + "mlp_ratio": 4, + "model_type": "dinov2", + "num_hidden_layers": 24, + "layer_norm_eps": 1e-6, + "qkv_bias": true, + "use_swiglu_ffn": false, + "layerscale_value": 1.0, + "drop_path_rate": 0.0, + "image_mean": [0.485, 0.456, 0.406], + "image_std": [0.229, 0.224, 0.225] +} diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py index caf4991fc..0d84994b0 100644 --- a/comfy/latent_formats.py +++ b/comfy/latent_formats.py @@ -538,6 +538,11 @@ class Hunyuan3Dv2(LatentFormat): latent_dimensions = 1 scale_factor = 0.9990943042622529 +class Hunyuan3Dv2_1(LatentFormat): + scale_factor = 1.0039506158752403 + latent_channels = 64 + latent_dimensions = 1 + class Hunyuan3Dv2mini(LatentFormat): latent_channels = 64 latent_dimensions = 1 diff --git a/comfy/ldm/hunyuan3d/vae.py b/comfy/ldm/hunyuan3d/vae.py index 6e8cbf1d9..760944827 100644 --- a/comfy/ldm/hunyuan3d/vae.py +++ b/comfy/ldm/hunyuan3d/vae.py @@ -4,81 +4,458 @@ import torch import torch.nn as nn import torch.nn.functional as F - - -from typing import Union, Tuple, List, Callable, Optional - import numpy as np -from einops import repeat, rearrange +import math from tqdm import tqdm + +from typing import Optional + import logging import comfy.ops ops = comfy.ops.disable_weight_init -def generate_dense_grid_points( - bbox_min: np.ndarray, - bbox_max: np.ndarray, - octree_resolution: int, - indexing: str = "ij", -): - length = bbox_max - bbox_min - num_cells = octree_resolution +def fps(src: torch.Tensor, batch: torch.Tensor, sampling_ratio: float, start_random: bool = True): - x = np.linspace(bbox_min[0], bbox_max[0], int(num_cells) + 1, dtype=np.float32) - y = np.linspace(bbox_min[1], bbox_max[1], int(num_cells) + 1, dtype=np.float32) - z = np.linspace(bbox_min[2], bbox_max[2], int(num_cells) + 1, dtype=np.float32) - [xs, ys, zs] = np.meshgrid(x, y, z, indexing=indexing) - xyz = np.stack((xs, ys, zs), axis=-1) - grid_size = [int(num_cells) + 1, int(num_cells) + 1, int(num_cells) + 1] + # manually create the pointer vector + assert src.size(0) == batch.numel() - return xyz, grid_size, length + batch_size = int(batch.max()) + 1 + deg = src.new_zeros(batch_size, dtype = torch.long) + + deg.scatter_add_(0, batch, 
torch.ones_like(batch)) + + ptr_vec = deg.new_zeros(batch_size + 1) + torch.cumsum(deg, 0, out=ptr_vec[1:]) + + #return fps_sampling(src, ptr_vec, ratio) + sampled_indicies = [] + + for b in range(batch_size): + # start and the end of each batch + start, end = ptr_vec[b].item(), ptr_vec[b + 1].item() + # points from the point cloud + points = src[start:end] + + num_points = points.size(0) + num_samples = max(1, math.ceil(num_points * sampling_ratio)) + + selected = torch.zeros(num_samples, device = src.device, dtype = torch.long) + distances = torch.full((num_points,), float("inf"), device = src.device) + + # select a random start point + if start_random: + farthest = torch.randint(0, num_points, (1,), device = src.device) + else: + farthest = torch.tensor([0], device = src.device, dtype = torch.long) + + for i in range(num_samples): + selected[i] = farthest + centroid = points[farthest].squeeze(0) + dist = torch.norm(points - centroid, dim = 1) # compute euclidean distance + distances = torch.minimum(distances, dist) + farthest = torch.argmax(distances) + + sampled_indicies.append(torch.arange(start, end)[selected]) + + return torch.cat(sampled_indicies, dim = 0) +class PointCrossAttention(nn.Module): + def __init__(self, + num_latents: int, + downsample_ratio: float, + pc_size: int, + pc_sharpedge_size: int, + point_feats: int, + width: int, + heads: int, + layers: int, + fourier_embedder, + normal_pe: bool = False, + qkv_bias: bool = False, + use_ln_post: bool = True, + qk_norm: bool = True): + + super().__init__() + + self.fourier_embedder = fourier_embedder + + self.pc_size = pc_size + self.normal_pe = normal_pe + self.downsample_ratio = downsample_ratio + self.pc_sharpedge_size = pc_sharpedge_size + self.num_latents = num_latents + self.point_feats = point_feats + + self.input_proj = nn.Linear(self.fourier_embedder.out_dim + point_feats, width) + + self.cross_attn = ResidualCrossAttentionBlock( + width = width, + heads = heads, + qkv_bias = qkv_bias, + qk_norm = qk_norm + ) + + self.self_attn = None + if layers > 0: + self.self_attn = Transformer( + width = width, + heads = heads, + qkv_bias = qkv_bias, + qk_norm = qk_norm, + layers = layers + ) + + if use_ln_post: + self.ln_post = nn.LayerNorm(width) + else: + self.ln_post = None + + def sample_points_and_latents(self, point_cloud: torch.Tensor, features: torch.Tensor): + + """ + Subsample points randomly from the point cloud (input_pc) + Further sample the subsampled points to get query_pc + take the fourier embeddings for both input and query pc + + Mental Note: FPS-sampled points (query_pc) act as latent tokens that attend to and learn from the broader context in input_pc. + Goal: get a smaller represenation (query_pc) to represent the entire scence structure by learning from a broader subset (input_pc). + More computationally efficient. 
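+
+        A minimal illustrative sketch of the fps() helper defined above (the tensor
+        shapes below are made up for illustration and are not part of this API):
+
+            pts = torch.rand(2 * 1024, 3)                     # two flattened clouds of 1024 points
+            batch = torch.arange(2).repeat_interleave(1024)   # cloud id for every point
+            idx = fps(pts, batch, sampling_ratio = 0.05)      # farthest-first, keeps ~5% of each cloud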
+ + Features are additional information for each point in the cloud + """ + + B, _, D = point_cloud.shape + + num_latents = int(self.num_latents) + + num_random_query = self.pc_size / (self.pc_size + self.pc_sharpedge_size) * num_latents + num_sharpedge_query = num_latents - num_random_query + + # Split random and sharpedge surface points + random_pc, sharpedge_pc = torch.split(point_cloud, [self.pc_size, self.pc_sharpedge_size], dim=1) + + # assert statements + assert random_pc.shape[1] <= self.pc_size, "Random surface points size must be less than or equal to pc_size" + assert sharpedge_pc.shape[1] <= self.pc_sharpedge_size, "Sharpedge surface points size must be less than or equal to pc_sharpedge_size" + + input_random_pc_size = int(num_random_query * self.downsample_ratio) + random_query_pc, random_input_pc, random_idx_pc, random_idx_query = \ + self.subsample(pc = random_pc, num_query = num_random_query, input_pc_size = input_random_pc_size) + + input_sharpedge_pc_size = int(num_sharpedge_query * self.downsample_ratio) + + if input_sharpedge_pc_size == 0: + sharpedge_input_pc = torch.zeros(B, 0, D, dtype = random_input_pc.dtype).to(point_cloud.device) + sharpedge_query_pc = torch.zeros(B, 0, D, dtype= random_query_pc.dtype).to(point_cloud.device) + + else: + sharpedge_query_pc, sharpedge_input_pc, sharpedge_idx_pc, sharpedge_idx_query = \ + self.subsample(pc = sharpedge_pc, num_query = num_sharpedge_query, input_pc_size = input_sharpedge_pc_size) + + # concat the random and sharpedges + query_pc = torch.cat([random_query_pc, sharpedge_query_pc], dim = 1) + input_pc = torch.cat([random_input_pc, sharpedge_input_pc], dim = 1) + + query = self.fourier_embedder(query_pc) + data = self.fourier_embedder(input_pc) + + if self.point_feats > 0: + random_surface_features, sharpedge_surface_features = torch.split(features, [self.pc_size, self.pc_sharpedge_size], dim = 1) + + input_random_surface_features, query_random_features = \ + self.handle_features(features = random_surface_features, idx_pc = random_idx_pc, batch_size = B, + input_pc_size = input_random_pc_size, idx_query = random_idx_query) + + if input_sharpedge_pc_size == 0: + input_sharpedge_surface_features = torch.zeros(B, 0, self.point_feats, + dtype = input_random_surface_features.dtype, device = point_cloud.device) + + query_sharpedge_features = torch.zeros(B, 0, self.point_feats, + dtype = query_random_features.dtype, device = point_cloud.device) + else: + + input_sharpedge_surface_features, query_sharpedge_features = \ + self.handle_features(idx_pc = sharpedge_idx_pc, features = sharpedge_surface_features, + batch_size = B, idx_query = sharpedge_idx_query, input_pc_size = input_sharpedge_pc_size) + + query_features = torch.cat([query_random_features, query_sharpedge_features], dim = 1) + input_features = torch.cat([input_random_surface_features, input_sharpedge_surface_features], dim = 1) + + if self.normal_pe: + # apply the fourier embeddings on the first 3 dims (xyz) + input_features_pe = self.fourier_embedder(input_features[..., :3]) + query_features_pe = self.fourier_embedder(query_features[..., :3]) + # replace the first 3 dims with the new PE ones + input_features = torch.cat([input_features_pe, input_features[..., :3]], dim = -1) + query_features = torch.cat([query_features_pe, query_features[..., :3]], dim = -1) + + # concat at the channels dim + query = torch.cat([query, query_features], dim = -1) + data = torch.cat([data, input_features], dim = -1) + + # don't return pc_info to avoid unnecessary memory usuage + return 
query.view(B, -1, query.shape[-1]), data.view(B, -1, data.shape[-1]) + + def forward(self, point_cloud: torch.Tensor, features: torch.Tensor): + + query, data = self.sample_points_and_latents(point_cloud = point_cloud, features = features) + + # apply projections + query = self.input_proj(query) + data = self.input_proj(data) + + # apply cross attention between query and data + latents = self.cross_attn(query, data) + + if self.self_attn is not None: + latents = self.self_attn(latents) + + if self.ln_post is not None: + latents = self.ln_post(latents) + + return latents -class VanillaVolumeDecoder: + def subsample(self, pc, num_query, input_pc_size: int): + + """ + num_query: number of points to keep after FPS + input_pc_size: number of points to select before FPS + """ + + B, _, D = pc.shape + query_ratio = num_query / input_pc_size + + # random subsampling of points inside the point cloud + idx_pc = torch.randperm(pc.shape[1], device = pc.device)[:input_pc_size] + input_pc = pc[:, idx_pc, :] + + # flatten to allow applying fps across the whole batch + flattent_input_pc = input_pc.view(B * input_pc_size, D) + + # construct a batch_down tensor to tell fps + # which points belong to which batch + N_down = int(flattent_input_pc.shape[0] / B) + batch_down = torch.arange(B).to(pc.device) + batch_down = torch.repeat_interleave(batch_down, N_down) + + idx_query = fps(flattent_input_pc, batch_down, sampling_ratio = query_ratio) + query_pc = flattent_input_pc[idx_query].view(B, -1, D) + + return query_pc, input_pc, idx_pc, idx_query + + def handle_features(self, features, idx_pc, input_pc_size, batch_size: int, idx_query): + + B = batch_size + + input_surface_features = features[:, idx_pc, :] + flattent_input_features = input_surface_features.view(B * input_pc_size, -1) + query_features = flattent_input_features[idx_query].view(B, -1, + flattent_input_features.shape[-1]) + + return input_surface_features, query_features + +def normalize_mesh(mesh, scale = 0.9999): + """Normalize mesh to fit in [-scale, scale]. Translate mesh so its center is [0,0,0]""" + + bbox = mesh.bounds + center = (bbox[1] + bbox[0]) / 2 + + max_extent = (bbox[1] - bbox[0]).max() + mesh.apply_translation(-center) + mesh.apply_scale((2 * scale) / max_extent) + + return mesh + +def sample_pointcloud(mesh, num = 200000): + """ Uniformly sample points from the surface of the mesh """ + + points, face_idx = mesh.sample(num, return_index = True) + normals = mesh.face_normals[face_idx] + return torch.from_numpy(points.astype(np.float32)), torch.from_numpy(normals.astype(np.float32)) + +def detect_sharp_edges(mesh, threshold=0.985): + """Return edge indices (a, b) that lie on sharp boundaries of the mesh.""" + + V, F = mesh.vertices, mesh.faces + VN, FN = mesh.vertex_normals, mesh.face_normals + + sharp_mask = np.ones(V.shape[0]) + for i in range(3): + indices = F[:, i] + alignment = np.einsum('ij,ij->i', VN[indices], FN) + dot_stack = np.stack((sharp_mask[indices], alignment), axis=-1) + sharp_mask[indices] = np.min(dot_stack, axis=-1) + + edge_a = np.concatenate([F[:, 0], F[:, 1], F[:, 2]]) + edge_b = np.concatenate([F[:, 1], F[:, 2], F[:, 0]]) + sharp_edges = (sharp_mask[edge_a] < threshold) & (sharp_mask[edge_b] < threshold) + + return edge_a[sharp_edges], edge_b[sharp_edges] + + +def sharp_sample_pointcloud(mesh, num = 16384): + """ Sample points preferentially from sharp edges in the mesh. 
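+    Edges are taken from detect_sharp_edges() (vertex/face normal agreement below a
+    threshold); positions and normals are linearly interpolated along those edges,
+    which are drawn with probability proportional to their length.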
""" + + edge_a, edge_b = detect_sharp_edges(mesh) + V, VN = mesh.vertices, mesh.vertex_normals + + va, vb = V[edge_a], V[edge_b] + na, nb = VN[edge_a], VN[edge_b] + + edge_lengths = np.linalg.norm(vb - va, axis=-1) + weights = edge_lengths / edge_lengths.sum() + + indices = np.searchsorted(np.cumsum(weights), np.random.rand(num)) + t = np.random.rand(num, 1) + + samples = t * va[indices] + (1 - t) * vb[indices] + normals = t * na[indices] + (1 - t) * nb[indices] + + return samples.astype(np.float32), normals.astype(np.float32) + +def load_surface_sharpedge(mesh, num_points=4096, num_sharp_points=4096, sharpedge_flag = True, device = "cuda"): + """Load a surface with optional sharp-edge annotations from a trimesh mesh.""" + + import trimesh + + try: + mesh_full = trimesh.util.concatenate(mesh.dump()) + except Exception: + mesh_full = trimesh.util.concatenate(mesh) + + mesh_full = normalize_mesh(mesh_full) + + faces = mesh_full.faces + vertices = mesh_full.vertices + origin_face_count = faces.shape[0] + + mesh_surface = trimesh.Trimesh(vertices=vertices, faces=faces[:origin_face_count]) + mesh_fill = trimesh.Trimesh(vertices=vertices, faces=faces[origin_face_count:]) + + area_surface = mesh_surface.area + area_fill = mesh_fill.area + total_area = area_surface + area_fill + + sample_num = 499712 // 2 + fill_ratio = area_fill / total_area if total_area > 0 else 0 + + num_fill = int(sample_num * fill_ratio) + num_surface = sample_num - num_fill + + surf_pts, surf_normals = sample_pointcloud(mesh_surface, num_surface) + fill_pts, fill_normals = (torch.zeros(0, 3), torch.zeros(0, 3)) if num_fill == 0 else sample_pointcloud(mesh_fill, num_fill) + + sharp_pts, sharp_normals = sharp_sample_pointcloud(mesh_surface, sample_num) + + def assemble_tensor(points, normals, label=None): + + data = torch.cat([points, normals], dim=1).half().to(device) + + if label is not None: + label_tensor = torch.full((data.shape[0], 1), float(label), dtype=torch.float16).to(device) + data = torch.cat([data, label_tensor], dim=1) + + return data + + surface = assemble_tensor(torch.cat([surf_pts.to(device), fill_pts.to(device)], dim=0), + torch.cat([surf_normals.to(device), fill_normals.to(device)], dim=0), + label = 0 if sharpedge_flag else None) + + sharp_surface = assemble_tensor(torch.from_numpy(sharp_pts), torch.from_numpy(sharp_normals), + label = 1 if sharpedge_flag else None) + + rng = np.random.default_rng() + + surface = surface[rng.choice(surface.shape[0], num_points, replace = False)] + sharp_surface = sharp_surface[rng.choice(sharp_surface.shape[0], num_sharp_points, replace = False)] + + full = torch.cat([surface, sharp_surface], dim = 0).unsqueeze(0) + + return full + +class SharpEdgeSurfaceLoader: + """ Load mesh surface and sharp edge samples. 
""" + + def __init__(self, num_uniform_points = 8192, num_sharp_points = 8192): + + self.num_uniform_points = num_uniform_points + self.num_sharp_points = num_sharp_points + self.total_points = num_uniform_points + num_sharp_points + + def __call__(self, mesh_input, device = "cuda"): + mesh = self._load_mesh(mesh_input) + return load_surface_sharpedge(mesh, self.num_uniform_points, self.num_sharp_points, device = device) + + @staticmethod + def _load_mesh(mesh_input): + import trimesh + + if isinstance(mesh_input, str): + mesh = trimesh.load(mesh_input, force="mesh", merge_primitives = True) + else: + mesh = mesh_input + + if isinstance(mesh, trimesh.Scene): + combined = None + for obj in mesh.geometry.values(): + combined = obj if combined is None else combined + obj + return combined + + return mesh + +class DiagonalGaussianDistribution: + def __init__(self, params: torch.Tensor, feature_dim: int = -1): + + # divide quant channels (8) into mean and log variance + self.mean, self.logvar = torch.chunk(params, 2, dim = feature_dim) + + self.logvar = torch.clamp(self.logvar, -30.0, 20.0) + self.std = torch.exp(0.5 * self.logvar) + + def sample(self): + + eps = torch.randn_like(self.std) + z = self.mean + eps * self.std + + return z + +################################################ +# Volume Decoder +################################################ + +class VanillaVolumeDecoder(): @torch.no_grad() - def __call__( - self, - latents: torch.FloatTensor, - geo_decoder: Callable, - bounds: Union[Tuple[float], List[float], float] = 1.01, - num_chunks: int = 10000, - octree_resolution: int = None, - enable_pbar: bool = True, - **kwargs, - ): - device = latents.device - dtype = latents.dtype - batch_size = latents.shape[0] + def __call__(self, latents: torch.Tensor, geo_decoder: callable, octree_resolution: int, bounds = 1.01, + num_chunks: int = 10_000, enable_pbar: bool = True, **kwargs): - # 1. generate query points if isinstance(bounds, float): bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds] - bbox_min, bbox_max = np.array(bounds[0:3]), np.array(bounds[3:6]) - xyz_samples, grid_size, length = generate_dense_grid_points( - bbox_min=bbox_min, - bbox_max=bbox_max, - octree_resolution=octree_resolution, - indexing="ij" - ) - xyz_samples = torch.from_numpy(xyz_samples).to(device, dtype=dtype).contiguous().reshape(-1, 3) + bbox_min, bbox_max = torch.tensor(bounds[:3]), torch.tensor(bounds[3:]) + + x = torch.linspace(bbox_min[0], bbox_max[0], int(octree_resolution) + 1, dtype = torch.float32) + y = torch.linspace(bbox_min[1], bbox_max[1], int(octree_resolution) + 1, dtype = torch.float32) + z = torch.linspace(bbox_min[2], bbox_max[2], int(octree_resolution) + 1, dtype = torch.float32) + + [xs, ys, zs] = torch.meshgrid(x, y, z, indexing = "ij") + xyz = torch.stack((xs, ys, zs), axis=-1).to(latents.device, dtype = latents.dtype).contiguous().reshape(-1, 3) + grid_size = [int(octree_resolution) + 1, int(octree_resolution) + 1, int(octree_resolution) + 1] - # 2. 
latents to 3d volume batch_logits = [] - for start in tqdm(range(0, xyz_samples.shape[0], num_chunks), desc="Volume Decoding", + for start in tqdm(range(0, xyz.shape[0], num_chunks), desc="Volume Decoding", disable=not enable_pbar): - chunk_queries = xyz_samples[start: start + num_chunks, :] - chunk_queries = repeat(chunk_queries, "p c -> b p c", b=batch_size) - logits = geo_decoder(queries=chunk_queries, latents=latents) + + chunk_queries = xyz[start: start + num_chunks, :] + chunk_queries = chunk_queries.unsqueeze(0).repeat(latents.shape[0], 1, 1) + logits = geo_decoder(queries = chunk_queries, latents = latents) batch_logits.append(logits) - grid_logits = torch.cat(batch_logits, dim=1) - grid_logits = grid_logits.view((batch_size, *grid_size)).float() + grid_logits = torch.cat(batch_logits, dim = 1) + grid_logits = grid_logits.view((latents.shape[0], *grid_size)).float() return grid_logits - class FourierEmbedder(nn.Module): """The sin/cosine positional embedding. Given an input tensor `x` of shape [n_batch, ..., c_dim], it converts each feature dimension of `x[..., i]` into: @@ -175,13 +552,11 @@ class FourierEmbedder(nn.Module): else: return x - class CrossAttentionProcessor: def __call__(self, attn, q, k, v): out = comfy.ops.scaled_dot_product_attention(q, k, v) return out - class DropPath(nn.Module): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). """ @@ -232,38 +607,41 @@ class MLP(nn.Module): def forward(self, x): return self.drop_path(self.c_proj(self.gelu(self.c_fc(x)))) - class QKVMultiheadCrossAttention(nn.Module): def __init__( self, - *, heads: int, + n_data = None, width=None, qk_norm=False, norm_layer=ops.LayerNorm ): super().__init__() self.heads = heads + self.n_data = n_data self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity() self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity() - self.attn_processor = CrossAttentionProcessor() - def forward(self, q, kv): + _, n_ctx, _ = q.shape bs, n_data, width = kv.shape + attn_ch = width // self.heads // 2 q = q.view(bs, n_ctx, self.heads, -1) + kv = kv.view(bs, n_data, self.heads, -1) k, v = torch.split(kv, attn_ch, dim=-1) q = self.q_norm(q) k = self.k_norm(k) - q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v)) - out = self.attn_processor(self, q, k, v) - out = out.transpose(1, 2).reshape(bs, n_ctx, -1) - return out + q, k, v = [t.permute(0, 2, 1, 3) for t in (q, k, v)] + out = F.scaled_dot_product_attention(q, k, v) + + out = out.transpose(1, 2).reshape(bs, n_ctx, -1) + + return out class MultiheadCrossAttention(nn.Module): def __init__( @@ -306,7 +684,6 @@ class MultiheadCrossAttention(nn.Module): x = self.c_proj(x) return x - class ResidualCrossAttentionBlock(nn.Module): def __init__( self, @@ -366,7 +743,7 @@ class QKVMultiheadAttention(nn.Module): q = self.q_norm(q) k = self.k_norm(k) - q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v)) + q, k, v = [t.permute(0, 2, 1, 3) for t in (q, k, v)] out = F.scaled_dot_product_attention(q, k, v).transpose(1, 2).reshape(bs, n_ctx, -1) return out @@ -383,8 +760,7 @@ class MultiheadAttention(nn.Module): drop_path_rate: float = 0.0 ): super().__init__() - self.width = width - self.heads = heads + self.c_qkv = ops.Linear(width, width * 3, bias=qkv_bias) self.c_proj = ops.Linear(width, width) self.attention = QKVMultiheadAttention( @@ -491,7 +867,7 @@ class 
CrossAttentionDecoder(nn.Module): self.query_proj = ops.Linear(self.fourier_embedder.out_dim, width) if self.downsample_ratio != 1: self.latents_proj = ops.Linear(width * downsample_ratio, width) - if self.enable_ln_post == False: + if not self.enable_ln_post: qk_norm = False self.cross_attn_decoder = ResidualCrossAttentionBlock( width=width, @@ -522,28 +898,44 @@ class CrossAttentionDecoder(nn.Module): class ShapeVAE(nn.Module): def __init__( - self, - *, - embed_dim: int, - width: int, - heads: int, - num_decoder_layers: int, - geo_decoder_downsample_ratio: int = 1, - geo_decoder_mlp_expand_ratio: int = 4, - geo_decoder_ln_post: bool = True, - num_freqs: int = 8, - include_pi: bool = True, - qkv_bias: bool = True, - qk_norm: bool = False, - label_type: str = "binary", - drop_path_rate: float = 0.0, - scale_factor: float = 1.0, + self, + *, + num_latents: int = 4096, + embed_dim: int = 64, + width: int = 1024, + heads: int = 16, + num_decoder_layers: int = 16, + num_encoder_layers: int = 8, + pc_size: int = 81920, + pc_sharpedge_size: int = 0, + point_feats: int = 4, + downsample_ratio: int = 20, + geo_decoder_downsample_ratio: int = 1, + geo_decoder_mlp_expand_ratio: int = 4, + geo_decoder_ln_post: bool = True, + num_freqs: int = 8, + qkv_bias: bool = False, + qk_norm: bool = True, + drop_path_rate: float = 0.0, + include_pi: bool = False, + scale_factor: float = 1.0039506158752403, + label_type: str = "binary", ): super().__init__() self.geo_decoder_ln_post = geo_decoder_ln_post self.fourier_embedder = FourierEmbedder(num_freqs=num_freqs, include_pi=include_pi) + self.encoder = PointCrossAttention(layers = num_encoder_layers, + num_latents = num_latents, + downsample_ratio = downsample_ratio, + heads = heads, + pc_size = pc_size, + width = width, + point_feats = point_feats, + fourier_embedder = self.fourier_embedder, + pc_sharpedge_size = pc_sharpedge_size) + self.post_kl = ops.Linear(embed_dim, width) self.transformer = Transformer( @@ -583,5 +975,14 @@ class ShapeVAE(nn.Module): grid_logits = self.volume_decoder(latents, self.geo_decoder, bounds=bounds, num_chunks=num_chunks, octree_resolution=octree_resolution, enable_pbar=enable_pbar) return grid_logits.movedim(-2, -1) - def encode(self, x): - return None + def encode(self, surface): + + pc, feats = surface[:, :, :3], surface[:, :, 3:] + latents = self.encoder(pc, feats) + + moments = self.pre_kl(latents) + posterior = DiagonalGaussianDistribution(moments, feature_dim = -1) + + latents = posterior.sample() + + return latents diff --git a/comfy/ldm/hunyuan3dv2_1/hunyuandit.py b/comfy/ldm/hunyuan3dv2_1/hunyuandit.py new file mode 100644 index 000000000..48575bb3c --- /dev/null +++ b/comfy/ldm/hunyuan3dv2_1/hunyuandit.py @@ -0,0 +1,658 @@ +import math +import torch +import torch.nn as nn +import torch.nn.functional as F +from comfy.ldm.modules.attention import optimized_attention + +class GELU(nn.Module): + + def __init__(self, dim_in: int, dim_out: int, operations, device, dtype): + super().__init__() + self.proj = operations.Linear(dim_in, dim_out, device = device, dtype = dtype) + + def gelu(self, gate: torch.Tensor) -> torch.Tensor: + + if gate.device.type == "mps": + return F.gelu(gate.to(dtype = torch.float32)).to(dtype = gate.dtype) + + return F.gelu(gate) + + def forward(self, hidden_states): + + hidden_states = self.proj(hidden_states) + hidden_states = self.gelu(hidden_states) + + return hidden_states + +class FeedForward(nn.Module): + + def __init__(self, dim: int, dim_out = None, mult: int = 4, + dropout: float = 0.0, 
inner_dim = None, operations = None, device = None, dtype = None): + + super().__init__() + if inner_dim is None: + inner_dim = int(dim * mult) + + dim_out = dim_out if dim_out is not None else dim + + act_fn = GELU(dim, inner_dim, operations = operations, device = device, dtype = dtype) + + self.net = nn.ModuleList([]) + self.net.append(act_fn) + + self.net.append(nn.Dropout(dropout)) + self.net.append(operations.Linear(inner_dim, dim_out, device = device, dtype = dtype)) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + for module in self.net: + hidden_states = module(hidden_states) + return hidden_states + +class AddAuxLoss(torch.autograd.Function): + + @staticmethod + def forward(ctx, x, loss): + # do nothing in forward (no computation) + ctx.requires_aux_loss = loss.requires_grad + ctx.dtype = loss.dtype + + return x + + @staticmethod + def backward(ctx, grad_output): + # add the aux loss gradients + grad_loss = None + # put the aux grad the same as the main grad loss + # aux grad contributes equally + if ctx.requires_aux_loss: + grad_loss = torch.ones(1, dtype = ctx.dtype, device = grad_output.device) + + return grad_output, grad_loss + +class MoEGate(nn.Module): + + def __init__(self, embed_dim, num_experts=16, num_experts_per_tok=2, aux_loss_alpha=0.01, device = None, dtype = None): + + super().__init__() + self.top_k = num_experts_per_tok + self.n_routed_experts = num_experts + + self.alpha = aux_loss_alpha + + self.gating_dim = embed_dim + self.weight = nn.Parameter(torch.empty((self.n_routed_experts, self.gating_dim), device = device, dtype = dtype)) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + + # flatten hidden states + hidden_states = hidden_states.view(-1, hidden_states.size(-1)) + + # get logits and pass it to softmax + logits = F.linear(hidden_states, self.weight, bias = None) + scores = logits.softmax(dim = -1) + + topk_weight, topk_idx = torch.topk(scores, k = self.top_k, dim = -1, sorted = False) + + if self.training and self.alpha > 0.0: + scores_for_aux = scores + + # used bincount instead of one hot encoding + counts = torch.bincount(topk_idx.view(-1), minlength = self.n_routed_experts).float() + ce = counts / topk_idx.numel() # normalized expert usage + + # mean expert score + Pi = scores_for_aux.mean(0) + + # expert balance loss + aux_loss = (Pi * ce * self.n_routed_experts).sum() * self.alpha + else: + aux_loss = None + + return topk_idx, topk_weight, aux_loss + +class MoEBlock(nn.Module): + def __init__(self, dim, num_experts: int = 6, moe_top_k: int = 2, dropout: float = 0.0, + ff_inner_dim: int = None, operations = None, device = None, dtype = None): + super().__init__() + + self.moe_top_k = moe_top_k + self.num_experts = num_experts + + self.experts = nn.ModuleList([ + FeedForward(dim, dropout = dropout, inner_dim = ff_inner_dim, operations = operations, device = device, dtype = dtype) + for _ in range(num_experts) + ]) + + self.gate = MoEGate(dim, num_experts = num_experts, num_experts_per_tok = moe_top_k, device = device, dtype = dtype) + self.shared_experts = FeedForward(dim, dropout = dropout, inner_dim = ff_inner_dim, operations = operations, device = device, dtype = dtype) + + def forward(self, hidden_states) -> torch.Tensor: + + identity = hidden_states + orig_shape = hidden_states.shape + topk_idx, topk_weight, aux_loss = self.gate(hidden_states) + + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + flat_topk_idx = topk_idx.view(-1) + + if self.training: + + hidden_states = 
hidden_states.repeat_interleave(self.moe_top_k, dim = 0) + y = torch.empty_like(hidden_states, dtype = hidden_states.dtype) + + for i, expert in enumerate(self.experts): + tmp = expert(hidden_states[flat_topk_idx == i]) + y[flat_topk_idx == i] = tmp.to(hidden_states.dtype) + + y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim = 1) + y = y.view(*orig_shape) + + y = AddAuxLoss.apply(y, aux_loss) + else: + y = self.moe_infer(hidden_states, flat_expert_indices = flat_topk_idx,flat_expert_weights = topk_weight.view(-1, 1)).view(*orig_shape) + + y = y + self.shared_experts(identity) + + return y + + @torch.no_grad() + def moe_infer(self, x, flat_expert_indices, flat_expert_weights): + + expert_cache = torch.zeros_like(x) + idxs = flat_expert_indices.argsort() + + # no need for .numpy().cpu() here + tokens_per_expert = flat_expert_indices.bincount().cumsum(0) + token_idxs = idxs // self.moe_top_k + + for i, end_idx in enumerate(tokens_per_expert): + + start_idx = 0 if i == 0 else tokens_per_expert[i-1] + + if start_idx == end_idx: + continue + + expert = self.experts[i] + exp_token_idx = token_idxs[start_idx:end_idx] + + expert_tokens = x[exp_token_idx] + expert_out = expert(expert_tokens) + + expert_out.mul_(flat_expert_weights[idxs[start_idx:end_idx]]) + + # use index_add_ with a 1-D index tensor directly avoids building a large [N, D] index map and extra memcopy required by scatter_reduce_ + # + avoid dtype conversion + expert_cache.index_add_(0, exp_token_idx, expert_out) + + return expert_cache + +class Timesteps(nn.Module): + def __init__(self, num_channels: int, downscale_freq_shift: float = 0.0, + scale: float = 1.0, max_period: int = 10000): + super().__init__() + + self.num_channels = num_channels + half_dim = num_channels // 2 + + # precompute the “inv_freq” vector once + exponent = -math.log(max_period) * torch.arange( + half_dim, dtype=torch.float32 + ) / (half_dim - downscale_freq_shift) + + inv_freq = torch.exp(exponent) + + # pad + if num_channels % 2 == 1: + # we’ll pad a zero at the end of the cos-half + inv_freq = torch.cat([inv_freq, inv_freq.new_zeros(1)]) + + # register to buffer so it moves with the device + self.register_buffer("inv_freq", inv_freq, persistent = False) + self.scale = scale + + def forward(self, timesteps: torch.Tensor): + + x = timesteps.float().unsqueeze(1) * self.inv_freq.to(timesteps.device).unsqueeze(0) + + + # fused CUDA kernels for sin and cos + sin_emb = x.sin() + cos_emb = x.cos() + + emb = torch.cat([sin_emb, cos_emb], dim = 1) + + # scale factor + if self.scale != 1.0: + emb = emb * self.scale + + # If we padded inv_freq for odd, emb is already wide enough; otherwise: + if emb.shape[1] > self.num_channels: + emb = emb[:, :self.num_channels] + + return emb + +class TimestepEmbedder(nn.Module): + def __init__(self, hidden_size, frequency_embedding_size = 256, cond_proj_dim = None, operations = None, device = None, dtype = None): + super().__init__() + + self.mlp = nn.Sequential( + operations.Linear(hidden_size, frequency_embedding_size, bias=True, device = device, dtype = dtype), + nn.GELU(), + operations.Linear(frequency_embedding_size, hidden_size, bias=True, device = device, dtype = dtype), + ) + self.frequency_embedding_size = frequency_embedding_size + + if cond_proj_dim is not None: + self.cond_proj = operations.Linear(cond_proj_dim, frequency_embedding_size, bias=False, device = device, dtype = dtype) + + self.time_embed = Timesteps(hidden_size) + + def forward(self, timesteps, condition): + + timestep_embed = 
self.time_embed(timesteps).type(self.mlp[0].weight.dtype) + + if condition is not None: + cond_embed = self.cond_proj(condition) + timestep_embed = timestep_embed + cond_embed + + time_conditioned = self.mlp(timestep_embed.to(self.mlp[0].weight.device)) + + # for broadcasting with image tokens + return time_conditioned.unsqueeze(1) + +class MLP(nn.Module): + def __init__(self, *, width: int, operations = None, device = None, dtype = None): + super().__init__() + self.width = width + self.fc1 = operations.Linear(width, width * 4, device = device, dtype = dtype) + self.fc2 = operations.Linear(width * 4, width, device = device, dtype = dtype) + self.gelu = nn.GELU() + + def forward(self, x): + return self.fc2(self.gelu(self.fc1(x))) + +class CrossAttention(nn.Module): + def __init__( + self, + qdim, + kdim, + num_heads, + qkv_bias=True, + qk_norm=False, + norm_layer=nn.LayerNorm, + use_fp16: bool = False, + operations = None, + dtype = None, + device = None, + **kwargs, + ): + super().__init__() + self.qdim = qdim + self.kdim = kdim + + self.num_heads = num_heads + self.head_dim = self.qdim // num_heads + + self.scale = self.head_dim ** -0.5 + + self.to_q = operations.Linear(qdim, qdim, bias=qkv_bias, device = device, dtype = dtype) + self.to_k = operations.Linear(kdim, qdim, bias=qkv_bias, device = device, dtype = dtype) + self.to_v = operations.Linear(kdim, qdim, bias=qkv_bias, device = device, dtype = dtype) + + if use_fp16: + eps = 1.0 / 65504 + else: + eps = 1e-6 + + if norm_layer == nn.LayerNorm: + norm_layer = operations.LayerNorm + else: + norm_layer = operations.RMSNorm + + self.q_norm = norm_layer(self.head_dim, elementwise_affine=True, eps = eps, device = device, dtype = dtype) if qk_norm else nn.Identity() + self.k_norm = norm_layer(self.head_dim, elementwise_affine=True, eps = eps, device = device, dtype = dtype) if qk_norm else nn.Identity() + self.out_proj = operations.Linear(qdim, qdim, bias=True, device = device, dtype = dtype) + + def forward(self, x, y): + + b, s1, _ = x.shape + _, s2, _ = y.shape + + y = y.to(next(self.to_k.parameters()).dtype) + + q = self.to_q(x) + k = self.to_k(y) + v = self.to_v(y) + + kv = torch.cat((k, v), dim=-1) + split_size = kv.shape[-1] // self.num_heads // 2 + + kv = kv.view(1, -1, self.num_heads, split_size * 2) + k, v = torch.split(kv, split_size, dim=-1) + + q = q.view(b, s1, self.num_heads, self.head_dim) + k = k.view(b, s2, self.num_heads, self.head_dim) + v = v.reshape(b, s2, self.num_heads * self.head_dim) + + q = self.q_norm(q) + k = self.k_norm(k) + + x = optimized_attention( + q.reshape(b, s1, self.num_heads * self.head_dim), + k.reshape(b, s2, self.num_heads * self.head_dim), + v, + heads=self.num_heads, + ) + + out = self.out_proj(x) + + return out + +class Attention(nn.Module): + + def __init__( + self, + dim, + num_heads, + qkv_bias = True, + qk_norm = False, + norm_layer = nn.LayerNorm, + use_fp16: bool = False, + operations = None, + device = None, + dtype = None + ): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.head_dim = self.dim // num_heads + self.scale = self.head_dim ** -0.5 + + self.to_q = operations.Linear(dim, dim, bias = qkv_bias, device = device, dtype = dtype) + self.to_k = operations.Linear(dim, dim, bias = qkv_bias, device = device, dtype = dtype) + self.to_v = operations.Linear(dim, dim, bias = qkv_bias, device = device, dtype = dtype) + + if use_fp16: + eps = 1.0 / 65504 + else: + eps = 1e-6 + + if norm_layer == nn.LayerNorm: + norm_layer = operations.LayerNorm + else: + norm_layer = 
operations.RMSNorm + + self.q_norm = norm_layer(self.head_dim, elementwise_affine=True, eps = eps, device = device, dtype = dtype) if qk_norm else nn.Identity() + self.k_norm = norm_layer(self.head_dim, elementwise_affine=True, eps = eps, device = device, dtype = dtype) if qk_norm else nn.Identity() + self.out_proj = operations.Linear(dim, dim, device = device, dtype = dtype) + + def forward(self, x): + B, N, _ = x.shape + + query = self.to_q(x) + key = self.to_k(x) + value = self.to_v(x) + + qkv_combined = torch.cat((query, key, value), dim=-1) + split_size = qkv_combined.shape[-1] // self.num_heads // 3 + + qkv = qkv_combined.view(1, -1, self.num_heads, split_size * 3) + query, key, value = torch.split(qkv, split_size, dim=-1) + + query = query.reshape(B, N, self.num_heads, self.head_dim) + key = key.reshape(B, N, self.num_heads, self.head_dim) + value = value.reshape(B, N, self.num_heads * self.head_dim) + + query = self.q_norm(query) + key = self.k_norm(key) + + x = optimized_attention( + query.reshape(B, N, self.num_heads * self.head_dim), + key.reshape(B, N, self.num_heads * self.head_dim), + value, + heads=self.num_heads, + ) + + x = self.out_proj(x) + return x + +class HunYuanDiTBlock(nn.Module): + def __init__( + self, + hidden_size, + c_emb_size, + num_heads, + text_states_dim=1024, + qk_norm=False, + norm_layer=nn.LayerNorm, + qk_norm_layer=nn.RMSNorm, + qkv_bias=True, + skip_connection=True, + timested_modulate=False, + use_moe: bool = False, + num_experts: int = 8, + moe_top_k: int = 2, + use_fp16: bool = False, + operations = None, + device = None, dtype = None + ): + super().__init__() + + # eps can't be 1e-6 in fp16 mode because of numerical stability issues + if use_fp16: + eps = 1.0 / 65504 + else: + eps = 1e-6 + + self.norm1 = norm_layer(hidden_size, elementwise_affine = True, eps = eps, device = device, dtype = dtype) + + self.attn1 = Attention(hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, qk_norm=qk_norm, + norm_layer=qk_norm_layer, use_fp16 = use_fp16, device = device, dtype = dtype, operations = operations) + + self.norm2 = norm_layer(hidden_size, elementwise_affine = True, eps = eps, device = device, dtype = dtype) + + self.timested_modulate = timested_modulate + if self.timested_modulate: + self.default_modulation = nn.Sequential( + nn.SiLU(), + operations.Linear(c_emb_size, hidden_size, bias=True, device = device, dtype = dtype) + ) + + self.attn2 = CrossAttention(hidden_size, text_states_dim, num_heads=num_heads, qkv_bias=qkv_bias, + qk_norm=qk_norm, norm_layer=qk_norm_layer, use_fp16 = use_fp16, + device = device, dtype = dtype, operations = operations) + + self.norm3 = norm_layer(hidden_size, elementwise_affine = True, eps = eps, device = device, dtype = dtype) + + if skip_connection: + self.skip_norm = norm_layer(hidden_size, elementwise_affine = True, eps = eps, device = device, dtype = dtype) + self.skip_linear = operations.Linear(2 * hidden_size, hidden_size, device = device, dtype = dtype) + else: + self.skip_linear = None + + self.use_moe = use_moe + + if self.use_moe: + self.moe = MoEBlock( + hidden_size, + num_experts = num_experts, + moe_top_k = moe_top_k, + dropout = 0.0, + ff_inner_dim = int(hidden_size * 4.0), + device = device, dtype = dtype, + operations = operations + ) + else: + self.mlp = MLP(width=hidden_size, operations=operations, device = device, dtype = dtype) + + def forward(self, hidden_states, conditioning=None, text_states=None, skip_tensor=None): + + if self.skip_linear is not None: + combined = torch.cat([skip_tensor, 
hidden_states], dim=-1) + hidden_states = self.skip_linear(combined) + hidden_states = self.skip_norm(hidden_states) + + # self attention + if self.timested_modulate: + modulation_shift = self.default_modulation(conditioning).unsqueeze(dim=1) + hidden_states = hidden_states + modulation_shift + + self_attn_out = self.attn1(self.norm1(hidden_states)) + hidden_states = hidden_states + self_attn_out + + # cross attention + hidden_states = hidden_states + self.attn2(self.norm2(hidden_states), text_states) + + # MLP Layer + mlp_input = self.norm3(hidden_states) + + if self.use_moe: + hidden_states = hidden_states + self.moe(mlp_input) + else: + hidden_states = hidden_states + self.mlp(mlp_input) + + return hidden_states + +class FinalLayer(nn.Module): + + def __init__(self, final_hidden_size, out_channels, operations, use_fp16: bool = False, device = None, dtype = None): + super().__init__() + + if use_fp16: + eps = 1.0 / 65504 + else: + eps = 1e-6 + + self.norm_final = operations.LayerNorm(final_hidden_size, elementwise_affine = True, eps = eps, device = device, dtype = dtype) + self.linear = operations.Linear(final_hidden_size, out_channels, bias = True, device = device, dtype = dtype) + + def forward(self, x): + x = self.norm_final(x) + x = x[:, 1:] + x = self.linear(x) + return x + +class HunYuanDiTPlain(nn.Module): + + # init with the defaults values from https://huggingface.co/tencent/Hunyuan3D-2.1/blob/main/hunyuan3d-dit-v2-1/config.yaml + def __init__( + self, + in_channels: int = 64, + hidden_size: int = 2048, + context_dim: int = 1024, + depth: int = 21, + num_heads: int = 16, + qk_norm: bool = True, + qkv_bias: bool = False, + num_moe_layers: int = 6, + guidance_cond_proj_dim = 2048, + norm_type = 'layer', + num_experts: int = 8, + moe_top_k: int = 2, + use_fp16: bool = False, + dtype = None, + device = None, + operations = None, + **kwargs + ): + + self.dtype = dtype + + super().__init__() + + self.depth = depth + + self.in_channels = in_channels + self.out_channels = in_channels + + self.num_heads = num_heads + self.hidden_size = hidden_size + + norm = operations.LayerNorm if norm_type == 'layer' else operations.RMSNorm + qk_norm = operations.RMSNorm + + self.context_dim = context_dim + self.guidance_cond_proj_dim = guidance_cond_proj_dim + + self.x_embedder = operations.Linear(in_channels, hidden_size, bias = True, device = device, dtype = dtype) + self.t_embedder = TimestepEmbedder(hidden_size, hidden_size * 4, cond_proj_dim = guidance_cond_proj_dim, device = device, dtype = dtype, operations = operations) + + + # HUnYuanDiT Blocks + self.blocks = nn.ModuleList([ + HunYuanDiTBlock(hidden_size=hidden_size, + c_emb_size=hidden_size, + num_heads=num_heads, + text_states_dim=context_dim, + qk_norm=qk_norm, + norm_layer = norm, + qk_norm_layer = qk_norm, + skip_connection=layer > depth // 2, + qkv_bias=qkv_bias, + use_moe=True if depth - layer <= num_moe_layers else False, + num_experts=num_experts, + moe_top_k=moe_top_k, + use_fp16 = use_fp16, + device = device, dtype = dtype, operations = operations) + for layer in range(depth) + ]) + + self.depth = depth + + self.final_layer = FinalLayer(hidden_size, self.out_channels, use_fp16 = use_fp16, operations = operations, device = device, dtype = dtype) + + def forward(self, x, t, context, transformer_options = {}, **kwargs): + + x = x.movedim(-1, -2) + uncond_emb, cond_emb = context.chunk(2, dim = 0) + + context = torch.cat([cond_emb, uncond_emb], dim = 0) + main_condition = context + + t = 1.0 - t + + time_embedded = self.t_embedder(t, 
condition = kwargs.get('guidance_cond')) + + x = x.to(dtype = next(self.x_embedder.parameters()).dtype) + x_embedded = self.x_embedder(x) + + combined = torch.cat([time_embedded, x_embedded], dim=1) + + def block_wrap(args): + return block( + args["x"], + args["t"], + args["cond"], + skip_tensor=args.get("skip"),) + + skip_stack = [] + patches_replace = transformer_options.get("patches_replace", {}) + blocks_replace = patches_replace.get("dit", {}) + for idx, block in enumerate(self.blocks): + if idx <= self.depth // 2: + skip_input = None + else: + skip_input = skip_stack.pop() + + if ("block", idx) in blocks_replace: + + combined = blocks_replace[("block", idx)]( + { + "x": combined, + "t": time_embedded, + "cond": main_condition, + "skip": skip_input, + }, + {"original_block": block_wrap}, + ) + else: + combined = block(combined, time_embedded, main_condition, skip_tensor=skip_input) + + if idx < self.depth // 2: + skip_stack.append(combined) + + output = self.final_layer(combined) + output = output.movedim(-2, -1) * (-1.0) + + cond_emb, uncond_emb = output.chunk(2, dim = 0) + return torch.cat([uncond_emb, cond_emb]) diff --git a/comfy/model_base.py b/comfy/model_base.py index 56a6798be..39a3344bc 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -16,6 +16,8 @@ along with this program. If not, see . """ +import comfy.ldm.hunyuan3dv2_1 +import comfy.ldm.hunyuan3dv2_1.hunyuandit import torch import logging from comfy.ldm.modules.diffusionmodules.openaimodel import UNetModel, Timestep @@ -1282,6 +1284,21 @@ class Hunyuan3Dv2(BaseModel): out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance])) return out +class Hunyuan3Dv2_1(BaseModel): + def __init__(self, model_config, model_type=ModelType.FLOW, device=None): + super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hunyuan3dv2_1.hunyuandit.HunYuanDiTPlain) + + def extra_conds(self, **kwargs): + out = super().extra_conds(**kwargs) + cross_attn = kwargs.get("cross_attn", None) + if cross_attn is not None: + out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) + + guidance = kwargs.get("guidance", 5.0) + if guidance is not None: + out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance])) + return out + class HiDream(BaseModel): def __init__(self, model_config, model_type=ModelType.FLOW, device=None): super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hidream.model.HiDreamImageTransformer2DModel) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 9f3ab64df..75552ede9 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -400,6 +400,20 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys return dit_config + if f"{key_prefix}t_embedder.mlp.2.weight" in state_dict_keys: # Hunyuan 3D 2.1 + + dit_config = {} + dit_config["image_model"] = "hunyuan3d2_1" + dit_config["in_channels"] = state_dict[f"{key_prefix}x_embedder.weight"].shape[1] + dit_config["context_dim"] = 1024 + dit_config["hidden_size"] = state_dict[f"{key_prefix}x_embedder.weight"].shape[0] + dit_config["mlp_ratio"] = 4.0 + dit_config["num_heads"] = 16 + dit_config["depth"] = count_blocks(state_dict_keys, f"{key_prefix}blocks.{{}}") + dit_config["qkv_bias"] = False + dit_config["guidance_cond_proj_dim"] = None#f"{key_prefix}t_embedder.cond_proj.weight" in state_dict_keys + return dit_config + if 
'{}caption_projection.0.linear.weight'.format(key_prefix) in state_dict_keys: # HiDream dit_config = {} dit_config["image_model"] = "hidream" diff --git a/comfy/sd.py b/comfy/sd.py index bb5d61fb3..014f797ca 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -446,17 +446,29 @@ class VAE: self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32] self.memory_used_encode = lambda shape, dtype: 6000 * shape[3] * shape[4] * model_management.dtype_size(dtype) self.memory_used_decode = lambda shape, dtype: 7000 * shape[3] * shape[4] * (8 * 8) * model_management.dtype_size(dtype) + # Hunyuan 3d v2 2.0 & 2.1 elif "geo_decoder.cross_attn_decoder.ln_1.bias" in sd: + self.latent_dim = 1 - ln_post = "geo_decoder.ln_post.weight" in sd - inner_size = sd["geo_decoder.output_proj.weight"].shape[1] - downsample_ratio = sd["post_kl.weight"].shape[0] // inner_size - mlp_expand = sd["geo_decoder.cross_attn_decoder.mlp.c_fc.weight"].shape[0] // inner_size - self.memory_used_encode = lambda shape, dtype: (1000 * shape[2]) * model_management.dtype_size(dtype) # TODO - self.memory_used_decode = lambda shape, dtype: (1024 * 1024 * 1024 * 2.0) * model_management.dtype_size(dtype) # TODO - ddconfig = {"embed_dim": 64, "num_freqs": 8, "include_pi": False, "heads": 16, "width": 1024, "num_decoder_layers": 16, "qkv_bias": False, "qk_norm": True, "geo_decoder_mlp_expand_ratio": mlp_expand, "geo_decoder_downsample_ratio": downsample_ratio, "geo_decoder_ln_post": ln_post} - self.first_stage_model = comfy.ldm.hunyuan3d.vae.ShapeVAE(**ddconfig) + + def estimate_memory(shape, dtype, num_layers = 16, kv_cache_multiplier = 2): + batch, num_tokens, hidden_dim = shape + dtype_size = model_management.dtype_size(dtype) + + total_mem = batch * num_tokens * hidden_dim * dtype_size * (1 + kv_cache_multiplier * num_layers) + return total_mem + + # better memory estimations + self.memory_used_encode = lambda shape, dtype, num_layers = 8, kv_cache_multiplier = 0:\ + estimate_memory(shape, dtype, num_layers, kv_cache_multiplier) + + self.memory_used_decode = lambda shape, dtype, num_layers = 16, kv_cache_multiplier = 2: \ + estimate_memory(shape, dtype, num_layers, kv_cache_multiplier) + + self.first_stage_model = comfy.ldm.hunyuan3d.vae.ShapeVAE() self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32] + + elif "vocoder.backbone.channel_layers.0.0.bias" in sd: #Ace Step Audio self.first_stage_model = comfy.ldm.ace.vae.music_dcae_pipeline.MusicDCAE(source_sample_rate=44100) self.memory_used_encode = lambda shape, dtype: (shape[2] * 330) * model_management.dtype_size(dtype) @@ -1046,6 +1058,27 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c model = None model_patcher = None + if isinstance(sd, dict) and all(k in sd for k in ["model", "vae", "conditioner"]): + from collections import OrderedDict + import gc + + merged_sd = OrderedDict() + + for k, v in sd["model"].items(): + merged_sd[f"model.{k}"] = v + + for k, v in sd["vae"].items(): + merged_sd[f"vae.{k}"] = v + + for key, value in sd["conditioner"].items(): + merged_sd[f"conditioner.{key}"] = value + + sd = merged_sd + + del merged_sd + gc.collect() + torch.cuda.empty_cache() + diffusion_model_prefix = model_detection.unet_prefix_from_state_dict(sd) parameters = comfy.utils.calculate_parameters(sd, diffusion_model_prefix) weight_dtype = comfy.utils.weight_dtype(sd, diffusion_model_prefix) diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 76260de00..75dad277d 100644 --- a/comfy/supported_models.py +++ 
b/comfy/supported_models.py @@ -1128,6 +1128,17 @@ class Hunyuan3Dv2(supported_models_base.BASE): def clip_target(self, state_dict={}): return None +class Hunyuan3Dv2_1(Hunyuan3Dv2): + unet_config = { + "image_model": "hunyuan3d2_1", + } + + latent_format = latent_formats.Hunyuan3Dv2_1 + + def get_model(self, state_dict, prefix="", device=None): + out = model_base.Hunyuan3Dv2_1(self, device = device) + return out + class Hunyuan3Dv2mini(Hunyuan3Dv2): unet_config = { "image_model": "hunyuan3d2", @@ -1285,6 +1296,6 @@ class QwenImage(supported_models_base.BASE): return supported_models_base.ClipTarget(comfy.text_encoders.qwen_image.QwenImageTokenizer, comfy.text_encoders.qwen_image.te(**hunyuan_detect)) -models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma, ACEStep, Omnigen2, QwenImage] +models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ACEStep, Omnigen2, QwenImage] models += [SVD_img2vid] diff --git a/comfy_extras/nodes_hunyuan3d.py b/comfy_extras/nodes_hunyuan3d.py index 51e45336a..f6e71e0a8 100644 --- a/comfy_extras/nodes_hunyuan3d.py +++ b/comfy_extras/nodes_hunyuan3d.py @@ -8,13 +8,16 @@ import folder_paths import comfy.model_management from comfy.cli_args import args - class EmptyLatentHunyuan3Dv2: @classmethod def INPUT_TYPES(s): - return {"required": {"resolution": ("INT", {"default": 3072, "min": 1, "max": 8192}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096, "tooltip": "The number of latent images in the batch."}), - }} + return { + "required": { + "resolution": ("INT", {"default": 3072, "min": 1, "max": 8192}), + "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096, "tooltip": "The number of latent images in the batch."}), + } + } + RETURN_TYPES = ("LATENT",) FUNCTION = "generate" @@ -24,7 +27,6 @@ class EmptyLatentHunyuan3Dv2: latent = torch.zeros([batch_size, 64, resolution], device=comfy.model_management.intermediate_device()) return ({"samples": latent, "type": "hunyuan3dv2"}, ) - class Hunyuan3Dv2Conditioning: @classmethod def INPUT_TYPES(s): @@ -81,7 +83,6 @@ class VOXEL: def __init__(self, data): self.data = data - class VAEDecodeHunyuan3D: @classmethod def INPUT_TYPES(s): @@ -99,7 +100,6 @@ class VAEDecodeHunyuan3D: voxels = VOXEL(vae.decode(samples["samples"], vae_options={"num_chunks": num_chunks, "octree_resolution": octree_resolution})) return (voxels, ) - def 
voxel_to_mesh(voxels, threshold=0.5, device=None): if device is None: device = torch.device("cpu") @@ -230,13 +230,9 @@ def voxel_to_mesh_surfnet(voxels, threshold=0.5, device=None): [0, 0, 1], [1, 0, 1], [0, 1, 1], [1, 1, 1] ], device=device) - corner_values = torch.zeros((cell_positions.shape[0], 8), device=device) - for c, (dz, dy, dx) in enumerate(corner_offsets): - corner_values[:, c] = padded[ - cell_positions[:, 0] + dz, - cell_positions[:, 1] + dy, - cell_positions[:, 2] + dx - ] + pos = cell_positions.unsqueeze(1) + corner_offsets.unsqueeze(0) + z_idx, y_idx, x_idx = pos.unbind(-1) + corner_values = padded[z_idx, y_idx, x_idx] corner_signs = corner_values > threshold has_inside = torch.any(corner_signs, dim=1) diff --git a/nodes.py b/nodes.py index 6c2f9dd14..1afe5601a 100644 --- a/nodes.py +++ b/nodes.py @@ -998,20 +998,31 @@ class CLIPVisionLoader: class CLIPVisionEncode: @classmethod def INPUT_TYPES(s): - return {"required": { "clip_vision": ("CLIP_VISION",), - "image": ("IMAGE",), - "crop": (["center", "none"],) - }} + return { + "required": { + "clip_vision": ("CLIP_VISION",), + "image": ("IMAGE",), + "crop": (["center", "none", "recenter"],), + }, + "optional": { + "border_ratio": ("FLOAT", {"default": 0.15, "min": 0.0, "max": 0.5, "step": 0.01, "visible_if": {"crop": "recenter"},}), + } + } + RETURN_TYPES = ("CLIP_VISION_OUTPUT",) FUNCTION = "encode" CATEGORY = "conditioning" - def encode(self, clip_vision, image, crop): - crop_image = True - if crop != "center": - crop_image = False - output = clip_vision.encode_image(image, crop=crop_image) + def encode(self, clip_vision, image, crop, border_ratio): + crop_image = crop == "center" + + if crop == "recenter": + crop_image = True + else: + border_ratio = None + + output = clip_vision.encode_image(image, crop=crop_image, border_ratio = border_ratio) return (output,) class StyleModelLoader: diff --git a/requirements.txt b/requirements.txt index 3008a5dc3..564fa6e23 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,4 +27,4 @@ kornia>=0.7.1 spandrel soundfile pydantic~=2.0 -pydantic-settings~=2.0 +pydantic-settings~=2.0 \ No newline at end of file From c9ebe70072213a875ffbe40cc1b36820b2005211 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 4 Sep 2025 17:39:02 -0700 Subject: [PATCH 241/243] Some changes to the previous hunyuan PR. 
(#9725) --- comfy/clip_vision.py | 225 +------------------------------------------ comfy/sd.py | 21 ---- nodes.py | 29 ++---- requirements.txt | 2 +- 4 files changed, 14 insertions(+), 263 deletions(-) diff --git a/comfy/clip_vision.py b/comfy/clip_vision.py index 4bc640e8b..447b1ce4a 100644 --- a/comfy/clip_vision.py +++ b/comfy/clip_vision.py @@ -17,227 +17,10 @@ class Output: def __setitem__(self, key, item): setattr(self, key, item) - -def cubic_kernel(x, a: float = -0.75): - absx = x.abs() - absx2 = absx ** 2 - absx3 = absx ** 3 - - w = (a + 2) * absx3 - (a + 3) * absx2 + 1 - w2 = a * absx3 - 5*a * absx2 + 8*a * absx - 4*a - - return torch.where(absx <= 1, w, torch.where(absx < 2, w2, torch.zeros_like(x))) - -def get_indices_weights(in_size, out_size, scale): - # OpenCV-style half-pixel mapping - x = torch.arange(out_size, dtype=torch.float32) - x = (x + 0.5) / scale - 0.5 - - x0 = x.floor().long() - dx = x.unsqueeze(1) - (x0.unsqueeze(1) + torch.arange(-1, 3)) - - weights = cubic_kernel(dx) - weights = weights / weights.sum(dim=1, keepdim=True) - - indices = x0.unsqueeze(1) + torch.arange(-1, 3) - indices = indices.clamp(0, in_size - 1) - - return indices, weights - -def resize_cubic_1d(x, out_size, dim): - b, c, h, w = x.shape - in_size = h if dim == 2 else w - scale = out_size / in_size - - indices, weights = get_indices_weights(in_size, out_size, scale) - - if dim == 2: - x = x.permute(0, 1, 3, 2) - x = x.reshape(-1, h) - else: - x = x.reshape(-1, w) - - gathered = x[:, indices] - out = (gathered * weights.unsqueeze(0)).sum(dim=2) - - if dim == 2: - out = out.reshape(b, c, w, out_size).permute(0, 1, 3, 2) - else: - out = out.reshape(b, c, h, out_size) - - return out - -def resize_cubic(img: torch.Tensor, size: tuple) -> torch.Tensor: - """ - Resize image using OpenCV-equivalent INTER_CUBIC interpolation. 
- Implemented in pure PyTorch - """ - - if img.ndim == 3: - img = img.unsqueeze(0) - - img = img.permute(0, 3, 1, 2) - - out_h, out_w = size - img = resize_cubic_1d(img, out_h, dim=2) - img = resize_cubic_1d(img, out_w, dim=3) - return img - -def resize_area(img: torch.Tensor, size: tuple) -> torch.Tensor: - # vectorized implementation for OpenCV's INTER_AREA using pure PyTorch - original_shape = img.shape - is_hwc = False - - if img.ndim == 3: - if img.shape[0] <= 4: - img = img.unsqueeze(0) - else: - is_hwc = True - img = img.permute(2, 0, 1).unsqueeze(0) - elif img.ndim == 4: - pass - else: - raise ValueError("Expected image with 3 or 4 dims.") - - B, C, H, W = img.shape - out_h, out_w = size - scale_y = H / out_h - scale_x = W / out_w - - device = img.device - - # compute the grid boundries - y_start = torch.arange(out_h, device=device).float() * scale_y - y_end = y_start + scale_y - x_start = torch.arange(out_w, device=device).float() * scale_x - x_end = x_start + scale_x - - # for each output pixel, we will compute the range for it - y_start_int = torch.floor(y_start).long() - y_end_int = torch.ceil(y_end).long() - x_start_int = torch.floor(x_start).long() - x_end_int = torch.ceil(x_end).long() - - # We will build the weighted sums by iterating over contributing input pixels once - output = torch.zeros((B, C, out_h, out_w), dtype=torch.float32, device=device) - area = torch.zeros((out_h, out_w), dtype=torch.float32, device=device) - - max_kernel_h = int(torch.max(y_end_int - y_start_int).item()) - max_kernel_w = int(torch.max(x_end_int - x_start_int).item()) - - for dy in range(max_kernel_h): - for dx in range(max_kernel_w): - # compute the weights for this offset for all output pixels - - y_idx = y_start_int.unsqueeze(1) + dy - x_idx = x_start_int.unsqueeze(0) + dx - - # clamp indices to image boundaries - y_idx_clamped = torch.clamp(y_idx, 0, H - 1) - x_idx_clamped = torch.clamp(x_idx, 0, W - 1) - - # compute weights by broadcasting - y_weight = (torch.min(y_end.unsqueeze(1), y_idx_clamped.float() + 1.0) - torch.max(y_start.unsqueeze(1), y_idx_clamped.float())).clamp(min=0) - x_weight = (torch.min(x_end.unsqueeze(0), x_idx_clamped.float() + 1.0) - torch.max(x_start.unsqueeze(0), x_idx_clamped.float())).clamp(min=0) - - weight = (y_weight * x_weight) - - y_expand = y_idx_clamped.expand(out_h, out_w) - x_expand = x_idx_clamped.expand(out_h, out_w) - - - pixels = img[:, :, y_expand, x_expand] - - # unsqueeze to broadcast - w = weight.unsqueeze(0).unsqueeze(0) - - output += pixels * w - area += weight - - # Normalize by area - output /= area.unsqueeze(0).unsqueeze(0) - - if is_hwc: - return output[0].permute(1, 2, 0) - elif img.shape[0] == 1 and original_shape[0] <= 4: - return output[0] - else: - return output - -def recenter(image, border_ratio: float = 0.2): - - if image.shape[-1] == 4: - mask = image[..., 3] - else: - mask = torch.ones_like(image[..., 0:1]) * 255 - image = torch.concatenate([image, mask], axis=-1) - mask = mask[..., 0] - - H, W, C = image.shape - - size = max(H, W) - result = torch.zeros((size, size, C), dtype = torch.uint8) - - # as_tuple to match numpy behaviour - x_coords, y_coords = torch.nonzero(mask, as_tuple=True) - - y_min, y_max = y_coords.min(), y_coords.max() - x_min, x_max = x_coords.min(), x_coords.max() - - h = x_max - x_min - w = y_max - y_min - - if h == 0 or w == 0: - raise ValueError('input image is empty') - - desired_size = int(size * (1 - border_ratio)) - scale = desired_size / max(h, w) - - h2 = int(h * scale) - w2 = int(w * scale) - - x2_min 
= (size - h2) // 2 - x2_max = x2_min + h2 - - y2_min = (size - w2) // 2 - y2_max = y2_min + w2 - - # note: opencv takes columns first (opposite to pytorch and numpy that take the row first) - result[x2_min:x2_max, y2_min:y2_max] = resize_area(image[x_min:x_max, y_min:y_max], (h2, w2)) - - bg = torch.ones((result.shape[0], result.shape[1], 3), dtype = torch.uint8) * 255 - - mask = result[..., 3:].to(torch.float32) / 255 - result = result[..., :3] * mask + bg * (1 - mask) - - mask = mask * 255 - result = result.clip(0, 255).to(torch.uint8) - mask = mask.clip(0, 255).to(torch.uint8) - - return result - -def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711], - crop=True, value_range = (-1, 1), border_ratio: float = None, recenter_size: int = 512): - - if border_ratio is not None: - - image = (image * 255).clamp(0, 255).to(torch.uint8) - image = [recenter(img, border_ratio = border_ratio) for img in image] - - image = torch.stack(image, dim = 0) - image = resize_cubic(image, size = (recenter_size, recenter_size)) - - image = image / 255 * 2 - 1 - low, high = value_range - - image = (image - low) / (high - low) - image = image.permute(0, 2, 3, 1) - +def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711], crop=True): image = image[:, :, :, :3] if image.shape[3] > 3 else image - mean = torch.tensor(mean, device=image.device, dtype=image.dtype) std = torch.tensor(std, device=image.device, dtype=image.dtype) - image = image.movedim(-1, 1) if not (image.shape[2] == size and image.shape[3] == size): if crop: @@ -246,7 +29,7 @@ def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], s else: scale_size = (size, size) - image = torch.nn.functional.interpolate(image, size=scale_size, mode="bilinear" if border_ratio is not None else "bicubic", antialias=True) + image = torch.nn.functional.interpolate(image, size=scale_size, mode="bicubic", antialias=True) h = (image.shape[2] - size)//2 w = (image.shape[3] - size)//2 image = image[:,:,h:h+size,w:w+size] @@ -288,9 +71,9 @@ class ClipVisionModel(): def get_sd(self): return self.model.state_dict() - def encode_image(self, image, crop=True, border_ratio: float = None): + def encode_image(self, image, crop=True): comfy.model_management.load_model_gpu(self.patcher) - pixel_values = clip_preprocess(image.to(self.load_device), size=self.image_size, mean=self.image_mean, std=self.image_std, crop=crop, border_ratio=border_ratio).float() + pixel_values = clip_preprocess(image.to(self.load_device), size=self.image_size, mean=self.image_mean, std=self.image_std, crop=crop).float() out = self.model(pixel_values=pixel_values, intermediate_output='all' if self.return_all_hidden_states else -2) outputs = Output() diff --git a/comfy/sd.py b/comfy/sd.py index 014f797ca..be5aa8dc8 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -1058,27 +1058,6 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c model = None model_patcher = None - if isinstance(sd, dict) and all(k in sd for k in ["model", "vae", "conditioner"]): - from collections import OrderedDict - import gc - - merged_sd = OrderedDict() - - for k, v in sd["model"].items(): - merged_sd[f"model.{k}"] = v - - for k, v in sd["vae"].items(): - merged_sd[f"vae.{k}"] = v - - for key, value in sd["conditioner"].items(): - merged_sd[f"conditioner.{key}"] = value - - sd = merged_sd - - del merged_sd - gc.collect() - torch.cuda.empty_cache() - 
diffusion_model_prefix = model_detection.unet_prefix_from_state_dict(sd) parameters = comfy.utils.calculate_parameters(sd, diffusion_model_prefix) weight_dtype = comfy.utils.weight_dtype(sd, diffusion_model_prefix) diff --git a/nodes.py b/nodes.py index 1afe5601a..6c2f9dd14 100644 --- a/nodes.py +++ b/nodes.py @@ -998,31 +998,20 @@ class CLIPVisionLoader: class CLIPVisionEncode: @classmethod def INPUT_TYPES(s): - return { - "required": { - "clip_vision": ("CLIP_VISION",), - "image": ("IMAGE",), - "crop": (["center", "none", "recenter"],), - }, - "optional": { - "border_ratio": ("FLOAT", {"default": 0.15, "min": 0.0, "max": 0.5, "step": 0.01, "visible_if": {"crop": "recenter"},}), - } - } - + return {"required": { "clip_vision": ("CLIP_VISION",), + "image": ("IMAGE",), + "crop": (["center", "none"],) + }} RETURN_TYPES = ("CLIP_VISION_OUTPUT",) FUNCTION = "encode" CATEGORY = "conditioning" - def encode(self, clip_vision, image, crop, border_ratio): - crop_image = crop == "center" - - if crop == "recenter": - crop_image = True - else: - border_ratio = None - - output = clip_vision.encode_image(image, crop=crop_image, border_ratio = border_ratio) + def encode(self, clip_vision, image, crop): + crop_image = True + if crop != "center": + crop_image = False + output = clip_vision.encode_image(image, crop=crop_image) return (output,) class StyleModelLoader: diff --git a/requirements.txt b/requirements.txt index 564fa6e23..3008a5dc3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,4 +27,4 @@ kornia>=0.7.1 spandrel soundfile pydantic~=2.0 -pydantic-settings~=2.0 \ No newline at end of file +pydantic-settings~=2.0 From 3493b9cb1f9a9a66b1b86ed908cf87bc382b647a Mon Sep 17 00:00:00 2001 From: Arjan Singh <1598641+arjansingh@users.noreply.github.com> Date: Fri, 5 Sep 2025 11:32:25 -0700 Subject: [PATCH 242/243] fix: add cache headers for images (#9560) --- middleware/__init__.py | 1 + middleware/cache_middleware.py | 52 ++++ server.py | 11 +- tests-unit/server_test/test_cache_control.py | 255 +++++++++++++++++++ 4 files changed, 311 insertions(+), 8 deletions(-) create mode 100644 middleware/__init__.py create mode 100644 middleware/cache_middleware.py create mode 100644 tests-unit/server_test/test_cache_control.py diff --git a/middleware/__init__.py b/middleware/__init__.py new file mode 100644 index 000000000..2d7c7c3a9 --- /dev/null +++ b/middleware/__init__.py @@ -0,0 +1 @@ +"""Server middleware modules""" diff --git a/middleware/cache_middleware.py b/middleware/cache_middleware.py new file mode 100644 index 000000000..374ef7934 --- /dev/null +++ b/middleware/cache_middleware.py @@ -0,0 +1,52 @@ +"""Cache control middleware for ComfyUI server""" + +from aiohttp import web +from typing import Callable, Awaitable + +# Time in seconds +ONE_HOUR: int = 3600 +ONE_DAY: int = 86400 +IMG_EXTENSIONS = ( + ".jpg", + ".jpeg", + ".png", + ".ppm", + ".bmp", + ".pgm", + ".tif", + ".tiff", + ".webp", +) + + +@web.middleware +async def cache_control( + request: web.Request, handler: Callable[[web.Request], Awaitable[web.Response]] +) -> web.Response: + """Cache control middleware that sets appropriate cache headers based on file type and response status""" + response: web.Response = await handler(request) + + if ( + request.path.endswith(".js") + or request.path.endswith(".css") + or request.path.endswith("index.json") + ): + response.headers.setdefault("Cache-Control", "no-cache") + return response + + # Early return for non-image files - no cache headers needed + if not 
request.path.lower().endswith(IMG_EXTENSIONS): + return response + + # Handle image files + if response.status == 404: + response.headers.setdefault("Cache-Control", f"public, max-age={ONE_HOUR}") + elif response.status in (200, 201, 202, 203, 204, 205, 206, 301, 308): + # Success responses and permanent redirects - cache for 1 day + response.headers.setdefault("Cache-Control", f"public, max-age={ONE_DAY}") + elif response.status in (302, 303, 307): + # Temporary redirects - no cache + response.headers.setdefault("Cache-Control", "no-cache") + # Note: 304 Not Modified falls through - no cache headers set + + return response diff --git a/server.py b/server.py index 3d323eaf8..43816a8cd 100644 --- a/server.py +++ b/server.py @@ -39,20 +39,15 @@ from typing import Optional, Union from api_server.routes.internal.internal_routes import InternalRoutes from protocol import BinaryEventTypes +# Import cache control middleware +from middleware.cache_middleware import cache_control + async def send_socket_catch_exception(function, message): try: await function(message) except (aiohttp.ClientError, aiohttp.ClientPayloadError, ConnectionResetError, BrokenPipeError, ConnectionError) as err: logging.warning("send error: {}".format(err)) -@web.middleware -async def cache_control(request: web.Request, handler): - response: web.Response = await handler(request) - if request.path.endswith('.js') or request.path.endswith('.css') or request.path.endswith('index.json'): - response.headers.setdefault('Cache-Control', 'no-cache') - return response - - @web.middleware async def compress_body(request: web.Request, handler): accept_encoding = request.headers.get("Accept-Encoding", "") diff --git a/tests-unit/server_test/test_cache_control.py b/tests-unit/server_test/test_cache_control.py new file mode 100644 index 000000000..8de59125a --- /dev/null +++ b/tests-unit/server_test/test_cache_control.py @@ -0,0 +1,255 @@ +"""Tests for server cache control middleware""" + +import pytest +from aiohttp import web +from aiohttp.test_utils import make_mocked_request +from typing import Dict, Any + +from middleware.cache_middleware import cache_control, ONE_HOUR, ONE_DAY, IMG_EXTENSIONS + +pytestmark = pytest.mark.asyncio # Apply asyncio mark to all tests + +# Test configuration data +CACHE_SCENARIOS = [ + # Image file scenarios + { + "name": "image_200_status", + "path": "/test.jpg", + "status": 200, + "expected_cache": f"public, max-age={ONE_DAY}", + "should_have_header": True, + }, + { + "name": "image_404_status", + "path": "/missing.jpg", + "status": 404, + "expected_cache": f"public, max-age={ONE_HOUR}", + "should_have_header": True, + }, + # JavaScript/CSS scenarios + { + "name": "js_no_cache", + "path": "/script.js", + "status": 200, + "expected_cache": "no-cache", + "should_have_header": True, + }, + { + "name": "css_no_cache", + "path": "/styles.css", + "status": 200, + "expected_cache": "no-cache", + "should_have_header": True, + }, + { + "name": "index_json_no_cache", + "path": "/api/index.json", + "status": 200, + "expected_cache": "no-cache", + "should_have_header": True, + }, + # Non-matching files + { + "name": "html_no_header", + "path": "/index.html", + "status": 200, + "expected_cache": None, + "should_have_header": False, + }, + { + "name": "txt_no_header", + "path": "/data.txt", + "status": 200, + "expected_cache": None, + "should_have_header": False, + }, + { + "name": "api_endpoint_no_header", + "path": "/api/endpoint", + "status": 200, + "expected_cache": None, + "should_have_header": False, + }, + { + 
"name": "pdf_no_header", + "path": "/file.pdf", + "status": 200, + "expected_cache": None, + "should_have_header": False, + }, +] + +# Status code scenarios for images +IMAGE_STATUS_SCENARIOS = [ + # Success statuses get long cache + {"status": 200, "expected": f"public, max-age={ONE_DAY}"}, + {"status": 201, "expected": f"public, max-age={ONE_DAY}"}, + {"status": 202, "expected": f"public, max-age={ONE_DAY}"}, + {"status": 204, "expected": f"public, max-age={ONE_DAY}"}, + {"status": 206, "expected": f"public, max-age={ONE_DAY}"}, + # Permanent redirects get long cache + {"status": 301, "expected": f"public, max-age={ONE_DAY}"}, + {"status": 308, "expected": f"public, max-age={ONE_DAY}"}, + # Temporary redirects get no cache + {"status": 302, "expected": "no-cache"}, + {"status": 303, "expected": "no-cache"}, + {"status": 307, "expected": "no-cache"}, + # 404 gets short cache + {"status": 404, "expected": f"public, max-age={ONE_HOUR}"}, +] + +# Case sensitivity test paths +CASE_SENSITIVITY_PATHS = ["/image.JPG", "/photo.PNG", "/pic.JpEg"] + +# Edge case test paths +EDGE_CASE_PATHS = [ + { + "name": "query_strings_ignored", + "path": "/image.jpg?v=123&size=large", + "expected": f"public, max-age={ONE_DAY}", + }, + { + "name": "multiple_dots_in_path", + "path": "/image.min.jpg", + "expected": f"public, max-age={ONE_DAY}", + }, + { + "name": "nested_paths_with_images", + "path": "/static/images/photo.jpg", + "expected": f"public, max-age={ONE_DAY}", + }, +] + + +class TestCacheControl: + """Test cache control middleware functionality""" + + @pytest.fixture + def status_handler_factory(self): + """Create a factory for handlers that return specific status codes""" + + def factory(status: int, headers: Dict[str, str] = None): + async def handler(request): + return web.Response(status=status, headers=headers or {}) + + return handler + + return factory + + @pytest.fixture + def mock_handler(self, status_handler_factory): + """Create a mock handler that returns a response with 200 status""" + return status_handler_factory(200) + + @pytest.fixture + def handler_with_existing_cache(self, status_handler_factory): + """Create a handler that returns response with existing Cache-Control header""" + return status_handler_factory(200, {"Cache-Control": "max-age=3600"}) + + async def assert_cache_header( + self, + response: web.Response, + expected_cache: str = None, + should_have_header: bool = True, + ): + """Helper to assert cache control headers""" + if should_have_header: + assert "Cache-Control" in response.headers + if expected_cache: + assert response.headers["Cache-Control"] == expected_cache + else: + assert "Cache-Control" not in response.headers + + # Parameterized tests + @pytest.mark.parametrize("scenario", CACHE_SCENARIOS, ids=lambda x: x["name"]) + async def test_cache_control_scenarios( + self, scenario: Dict[str, Any], status_handler_factory + ): + """Test various cache control scenarios""" + handler = status_handler_factory(scenario["status"]) + request = make_mocked_request("GET", scenario["path"]) + response = await cache_control(request, handler) + + assert response.status == scenario["status"] + await self.assert_cache_header( + response, scenario["expected_cache"], scenario["should_have_header"] + ) + + @pytest.mark.parametrize("ext", IMG_EXTENSIONS) + async def test_all_image_extensions(self, ext: str, mock_handler): + """Test all defined image extensions are handled correctly""" + request = make_mocked_request("GET", f"/image{ext}") + response = await cache_control(request, 
mock_handler) + + assert response.status == 200 + assert "Cache-Control" in response.headers + assert response.headers["Cache-Control"] == f"public, max-age={ONE_DAY}" + + @pytest.mark.parametrize( + "status_scenario", IMAGE_STATUS_SCENARIOS, ids=lambda x: f"status_{x['status']}" + ) + async def test_image_status_codes( + self, status_scenario: Dict[str, Any], status_handler_factory + ): + """Test different status codes for image requests""" + handler = status_handler_factory(status_scenario["status"]) + request = make_mocked_request("GET", "/image.jpg") + response = await cache_control(request, handler) + + assert response.status == status_scenario["status"] + assert "Cache-Control" in response.headers + assert response.headers["Cache-Control"] == status_scenario["expected"] + + @pytest.mark.parametrize("path", CASE_SENSITIVITY_PATHS) + async def test_case_insensitive_image_extension(self, path: str, mock_handler): + """Test that image extensions are matched case-insensitively""" + request = make_mocked_request("GET", path) + response = await cache_control(request, mock_handler) + + assert "Cache-Control" in response.headers + assert response.headers["Cache-Control"] == f"public, max-age={ONE_DAY}" + + @pytest.mark.parametrize("edge_case", EDGE_CASE_PATHS, ids=lambda x: x["name"]) + async def test_edge_cases(self, edge_case: Dict[str, str], mock_handler): + """Test edge cases like query strings, nested paths, etc.""" + request = make_mocked_request("GET", edge_case["path"]) + response = await cache_control(request, mock_handler) + + assert "Cache-Control" in response.headers + assert response.headers["Cache-Control"] == edge_case["expected"] + + # Header preservation tests (special cases not covered by parameterization) + async def test_js_preserves_existing_headers(self, handler_with_existing_cache): + """Test that .js files preserve existing Cache-Control headers""" + request = make_mocked_request("GET", "/script.js") + response = await cache_control(request, handler_with_existing_cache) + + # setdefault should preserve existing header + assert response.headers["Cache-Control"] == "max-age=3600" + + async def test_css_preserves_existing_headers(self, handler_with_existing_cache): + """Test that .css files preserve existing Cache-Control headers""" + request = make_mocked_request("GET", "/styles.css") + response = await cache_control(request, handler_with_existing_cache) + + # setdefault should preserve existing header + assert response.headers["Cache-Control"] == "max-age=3600" + + async def test_image_preserves_existing_headers(self, status_handler_factory): + """Test that image cache headers preserve existing Cache-Control""" + handler = status_handler_factory(200, {"Cache-Control": "private, no-cache"}) + request = make_mocked_request("GET", "/image.jpg") + response = await cache_control(request, handler) + + # setdefault should preserve existing header + assert response.headers["Cache-Control"] == "private, no-cache" + + async def test_304_not_modified_inherits_cache(self, status_handler_factory): + """Test that 304 Not Modified doesn't set cache headers for images""" + handler = status_handler_factory(304, {"Cache-Control": "max-age=7200"}) + request = make_mocked_request("GET", "/not-modified.jpg") + response = await cache_control(request, handler) + + assert response.status == 304 + # Should preserve existing cache header, not override + assert response.headers["Cache-Control"] == "max-age=7200" From 2ee7879a0bdbf507bfd26f8b36eca2fef147c29d Mon Sep 17 00:00:00 2001 From: 
comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Fri, 5 Sep 2025 11:57:35 -0700
Subject: [PATCH 243/243] Fix lowvram issues with hunyuan3d 2.1 (#9735)

---
 comfy/ldm/hunyuan3dv2_1/hunyuandit.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/comfy/ldm/hunyuan3dv2_1/hunyuandit.py b/comfy/ldm/hunyuan3dv2_1/hunyuandit.py
index 48575bb3c..ca1a83001 100644
--- a/comfy/ldm/hunyuan3dv2_1/hunyuandit.py
+++ b/comfy/ldm/hunyuan3dv2_1/hunyuandit.py
@@ -3,6 +3,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from comfy.ldm.modules.attention import optimized_attention
+import comfy.model_management
 
 
 class GELU(nn.Module):
@@ -88,7 +89,7 @@ class MoEGate(nn.Module):
         hidden_states = hidden_states.view(-1, hidden_states.size(-1))
 
         # get logits and pass it to softmax
-        logits = F.linear(hidden_states, self.weight, bias = None)
+        logits = F.linear(hidden_states, comfy.model_management.cast_to(self.weight, dtype=hidden_states.dtype, device=hidden_states.device), bias = None)
         scores = logits.softmax(dim = -1)
 
         topk_weight, topk_idx = torch.topk(scores, k = self.top_k, dim = -1, sorted = False)
@@ -255,7 +256,7 @@ class TimestepEmbedder(nn.Module):
             cond_embed = self.cond_proj(condition)
             timestep_embed = timestep_embed + cond_embed
 
-        time_conditioned = self.mlp(timestep_embed.to(self.mlp[0].weight.device))
+        time_conditioned = self.mlp(timestep_embed)
 
         # for broadcasting with image tokens
        return time_conditioned.unsqueeze(1)
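
The MoEGate change in PATCH 243 casts the gate weight to the dtype and device of the incoming activations before the matmul, presumably because under lowvram / manual-cast operation the stored parameter can be left in a different precision or on a different device than the hidden states. A minimal standalone sketch of that failure mode and of the fix pattern, in plain PyTorch with hypothetical tensor shapes (comfy.model_management.cast_to is assumed to perform an equivalent cast for weights handled by model management):

import torch
import torch.nn.functional as F

# Gate weight kept in fp16 (e.g. a weight left in low precision by model management).
weight = torch.randn(8, 16, dtype=torch.float16)
# Activations arrive in fp32 from the preceding layer.
hidden_states = torch.randn(4, 16, dtype=torch.float32)

# F.linear(hidden_states, weight) would fail here with a dtype mismatch;
# casting the weight to match the activations mirrors what the patched line does.
logits = F.linear(hidden_states, weight.to(dtype=hidden_states.dtype, device=hidden_states.device))
print(logits.shape)  # torch.Size([4, 8])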
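
PATCH 242 factors the cache-control logic out of server.py into a reusable aiohttp middleware. For orientation, a minimal hypothetical app wiring that middleware up could look like the sketch below; ComfyUI's own server registers it through its existing middleware list, which is outside the hunks shown above, so the handler, route, and port here are illustrative only:

from aiohttp import web

from middleware.cache_middleware import cache_control

async def preview(request: web.Request) -> web.Response:
    # Serve some image bytes; the middleware adds the cache header afterwards.
    return web.Response(body=b"\x89PNG...", content_type="image/png")

app = web.Application(middlewares=[cache_control])
app.router.add_get("/preview.png", preview)

# A 200 response for /preview.png now carries "Cache-Control: public, max-age=86400",
# unless the handler already set its own Cache-Control header (setdefault keeps it).
if __name__ == "__main__":
    web.run_app(app, port=8188)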